In [220]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
import warnings
warnings.filterwarnings('ignore')

In [221]:
df=pd.read_csv('../input/train.csv')
df_test=pd.read_csv('../input/test.csv')
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5693 entries, 0 to 5692
Data columns (total 6 columns):
id                  5693 non-null object
App Version Code    4554 non-null float64
App Version Name    4554 non-null float64
Review Text         5692 non-null object
Review Title        602 non-null object
Star Rating         5693 non-null int64
dtypes: float64(2), int64(1), object(3)
memory usage: 266.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1424 entries, 0 to 1423
Data columns (total 5 columns):
id                  1424 non-null object
App Version Code    1151 non-null float64
App Version Name    1151 non-null float64
Review Text         1423 non-null object
Review Title        180 non-null object
dtypes: float64(2), object(3)
memory usage: 55.7+ KB


In [222]:
#dft=df[['Review Text','Star Rating']]
#dft.head()
df.dropna(subset=['Review Text'], how='any', inplace = True)
df=df.reset_index(drop=True)
df.isnull().sum()

id                     0
App Version Code    1138
App Version Name    1138
Review Text            0
Review Title        5091
Star Rating            0
dtype: int64

In [223]:
df_test.dropna(subset=['Review Text'],how='any',inplace =True)
df_test=df_test.reset_index(drop=True)
df_test.isnull().sum()


id                     0
App Version Code     272
App Version Name     272
Review Text            0
Review Title        1244
dtype: int64

In [224]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [225]:
corpus=[]
for i in range(0,5692):
    review=re.sub('[^a-zA-Z]',' ',df['Review Text'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)
    

In [226]:
corpus_test=[]
for i in range(0,1423):
    review=re.sub('[^a-zA-Z]',' ',df_test['Review Text'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus_test.append(review)

In [227]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X=cv.fit_transform(corpus).toarray()
x_test=cv.transform(corpus_test).toarray()
print(X.shape)
print(x_test.shape)
y=df.iloc[:,5].values

(5692, 3499)
(1423, 3499)


In [228]:
print(y.shape)
print(X.shape)

(5692,)
(5692, 3499)


In [229]:
df['Star Rating'].value_counts()

5    2923
1    1788
4     611
3     216
2     154
Name: Star Rating, dtype: int64

In [230]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer()
response=cv.fit_transform(corpus)
#print(response)

In [231]:
response.shape

(5692, 3499)

In [232]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)

(4553, 3499)
(4553,)


In [233]:
from sklearn.svm import LinearSVC
clf=LinearSVC()
clf.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [234]:
y_pred=clf.predict(X_test)
y_pred
y_pred.shape

(1139,)

In [235]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.7453906935908692


In [236]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [237]:
clf_list = [GaussianNB(), 
            AdaBoostClassifier(random_state = 0), 
            RandomForestClassifier(random_state = 0), 
            LogisticRegression(random_state = 0),
            DecisionTreeClassifier(random_state = 0)]
for a in clf_list:
    a.fit(X_train,y_train)
    print(accuracy_score(y_test,a.predict(X_test)))
    

0.2712906057945566
0.7111501316944688
0.742756804214223
0.7664618086040387
0.7050043898156277


In [238]:
clf_list[3].fit(X_train,y_train)
y_pred1=clf_list[3].predict(X_test)

In [239]:
from sklearn.model_selection import cross_val_score
accuracy=cross_val_score(clf_list[3],X,y,cv=10)
print(accuracy)

[0.76748252 0.75831874 0.7530648  0.75263158 0.77855888 0.75571178
 0.77112676 0.76408451 0.75837743 0.73897707]


In [240]:
accuracy_2=cross_val_score(clf,X,y,cv=10)
print(accuracy_2)

[0.75699301 0.74430823 0.73029772 0.73157895 0.75219684 0.71177504
 0.73591549 0.75352113 0.7372134  0.70194004]


In [241]:
parameters={'penalty': ['l1','l2'], 'C': [0.01,0.1,1,10,100]}
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(estimator=clf_list[3],
                        param_grid=parameters,
                        scoring='accuracy',
                        cv=5)
grid_search=grid_search.fit(X,y)

In [242]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.758257203092059
{'C': 1, 'penalty': 'l2'}


In [243]:
cl=LogisticRegression(C=1,penalty='l2',random_state=0)
cl.fit(X,y)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [244]:
y_pred2=cl.predict(X)
y_final=cl.predict(x_test)
print(accuracy_score(y,y_pred2))

0.848559381588194


In [245]:
accuracy_3=cross_val_score(cl,X,y,cv=10)
print(accuracy_3)

[0.76748252 0.75831874 0.7530648  0.75263158 0.77855888 0.75571178
 0.77112676 0.76408451 0.75837743 0.73897707]


In [246]:
bow=pd.DataFrame(X)
bow_t=pd.DataFrame(x_test)

In [247]:
from sklearn.decomposition import PCA
pca=PCA(n_components=100)
bow_1=pca.fit_transform(bow)
bow_test1=pca.transform(bow_t)
#print(pca.explained_variance_ratio_)
#print ("{:f}".format(float(pca.explained_variance_ratio_)

In [248]:
explained_variance=pca.explained_variance_ratio_
explained_variance.shape
sum(explained_variance)

0.6322802884494818

In [249]:
"""cl.fit(bow_1,y)
y_pred3=cl.predict(bow_1)
print(accuracy_score(y,y_pred3))"""

'cl.fit(bow_1,y)\ny_pred3=cl.predict(bow_1)\nprint(accuracy_score(y,y_pred3))'

In [250]:
#accuracy_4=cross_val_score(cl,bow_1,y,cv=10)
#print(accuracy_4)

In [251]:
#df.head()
#dft=df.loc[(df['Review Title'].notnull()),:]
#dft

In [252]:
#df['App Version Code'].describe()
#df['App Version Code'].value_counts()

In [253]:
#df.loc[(df['App Version Code']<10)]
#df.groupby(['App Version Code']).mean()

In [254]:
bow_train=pd.DataFrame(bow1)
bow_test=pd.DataFrame(bow_test1)

**Left out other columns such as Review Title was very few in number could have separated it differently and predicted test cases on that basis too but not worth all the work yet.**
**Dimensionality reduction to 100 components is also not explaining enough variance so using entire tokenized dataset.**

In [256]:
my_submission=pd.DataFrame({'Id': df_test.id, 'Star Rating': y_final})
my_submission.to_csv('Niki_Assignment.csv',index=False)

In [None]:
df_test.info()