In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training split. The three date columns are parsed into datetimes
# while reading, and the file is decoded as latin1.
df1 = pd.read_csv(
    'train_data.csv',
    sep=',',
    header=0,
    encoding='latin1',
    parse_dates=['date', 'release_date', 'update_date'],
    infer_datetime_format=True,
)

In [3]:
# Load the test split with the same parsing options used for the training
# split (same date columns, same latin1 decoding).
df2 = pd.read_csv(
    'test_data.csv',
    sep=',',
    header=0,
    encoding='latin1',
    parse_dates=['date', 'release_date', 'update_date'],
    infer_datetime_format=True,
)

In [4]:
# Stack the two splits into one frame with a fresh 0..n-1 index and keep only
# the columns used by the rest of the notebook.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# result is identical on older pandas versions.
data = pd.concat([df1, df2], ignore_index=True)[
    ['asin', 'categories', 'description', 'review_id', 'review_text', 'reviewer', 'star_rating']
]

In [5]:
# Restrict the table to rows whose category string mentions "game"
# (case-insensitive). Rows with no category are removed first so the string
# match below produces a clean boolean mask.
data = data[data['categories'].notnull()]
data = data.loc[data['categories'].str.contains('game', case=False)]

In [6]:
# Discard rows that have no review text.
data = data[data['review_text'].notnull()]

In [7]:
# Split on whether star_rating is a finite number: rated rows become the
# modelling set, the remainder is kept aside in miss_star_rating.
has_rating = np.isfinite(data['star_rating'])
train = data[has_rating]
miss_star_rating = data[~has_rating]

In [8]:
# Only the review text (feature source) and the rating (target) are needed.
train = train.loc[:, ['review_text', 'star_rating']]

In [9]:
# Series of raw review strings, index-aligned with train.
text = train.loc[:, 'review_text']

In [10]:
import string

# Strip every ASCII punctuation character from each review before vectorizing.
# A str.translate deletion table removes exactly the characters in
# string.punctuation, the same set the original per-character filter excluded,
# but it does the work in one C-level pass per review instead of a Python-level
# generator over every character.
exclude = set(string.punctuation)  # kept: same name/value as before for any later use
punctuation_table = str.maketrans('', '', string.punctuation)
ls = [review.translate(punctuation_table) for review in text.values.tolist()]

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix over the punctuation-stripped reviews.
# Terms appearing in fewer than 0.2% or in more than 80% of the documents
# are dropped from the vocabulary.
vectorizer = TfidfVectorizer(min_df = 0.002,max_df=0.8)
dtm = vectorizer.fit_transform(ls)

# Column labels of dtm, in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement when the installed version has
# it; list() keeps `terms` a plain list as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [None]:
# NOTE(review): inspection cell with no execution count. X_train is first
# assigned by the train_test_split cell further down in this notebook, so on a
# fresh top-to-bottom run this line raises NameError.
X_train

In [14]:
# Target vector: the star rating of every rated review.
Y = train.loc[:, 'star_rating']

In [42]:
# Turn the sparse TF-IDF training split into a dense DataFrame whose columns
# are labelled with the vocabulary terms.
# NOTE: X_train is created by the train_test_split cell, which appears later
# in this notebook; that cell must have run first.
# The sparse check makes this cell safe to re-run: the unconditional rebinding
# failed on a second execution because a DataFrame has no sparse-to-dense
# method. `terms` is the feature-name list built right after fitting the
# vectorizer, so the names are not recomputed here.
if hasattr(X_train, 'toarray'):
    X_train = pd.DataFrame(X_train.toarray(), columns=terms)

In [43]:
# Turn the sparse TF-IDF test split into a dense DataFrame whose columns are
# labelled with the vocabulary terms.
# NOTE: X_test is created by the train_test_split cell, which appears later
# in this notebook; that cell must have run first.
# The sparse check makes this cell safe to re-run: the unconditional rebinding
# failed on a second execution because a DataFrame has no sparse-to-dense
# method. `terms` is the feature-name list built right after fitting the
# vectorizer, so the names are not recomputed here.
if hasattr(X_test, 'toarray'):
    X_test = pd.DataFrame(X_test.toarray(), columns=terms)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# Latent semantic analysis: project the TF-IDF matrix onto 30 SVD components,
# then rescale every document vector to unit length (in place, copy=False).
# The randomized solver is stochastic; random_state pins it so that a
# Restart & Run All reproduces the same components, matching the fixed seeds
# used by the other estimators in this notebook.
lsa = TruncatedSVD(n_components=30, algorithm='randomized', random_state=0)
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [50]:
# Term loadings: one row per LSA component, one column per vocabulary term.
# `terms` (built right after fitting the vectorizer) supplies the column
# labels, so the feature names are not recomputed here and the cell does not
# depend on get_feature_names(), which scikit-learn 1.2 removed.
Vt = pd.DataFrame(lsa.components_, columns=terms)

# Document coordinates in LSA space. NOTE: dtm_lsa was row-normalised after
# the SVD projection, so despite the name this is not the raw U * Sigma.
U_sigma = pd.DataFrame(dtm_lsa)

# Target vector: the star rating of every rated review.
Y = train['star_rating']

In [41]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to
# the old module only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 50/50 hold-out split of the TF-IDF matrix and the star ratings with a fixed
# seed. dtm rows and Y are both derived from `train`, so they line up
# positionally.
X_train, X_test, y_train, y_test = train_test_split(dtm, Y, test_size=0.5, random_state=0)

In [44]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the training half, score on the held-out half.
gnb = GaussianNB().fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)

# Report the number of wrong predictions and the overall accuracy.
print('Naive Bayes Misclassified samples: %d' % (y_test != y_pred1).sum())
print('Naive Bayes Accuracy: %.2f' % accuracy_score(y_test, y_pred1))

Naive Bayes Misclassified samples: 109169
Naive Bayes Accuracy: 0.50


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 50 entropy-criterion trees, seeded for reproducibility and
# trained with two parallel jobs.
forest = RandomForestClassifier(n_estimators=50,
                                criterion='entropy',
                                n_jobs=2,
                                random_state=1)
forest.fit(X_train, y_train)
y_pred_forest = forest.predict(X_test)

# Report the number of wrong predictions and the overall accuracy.
print('RF Misclassified samples: %d' % (y_test != y_pred_forest).sum())
print('RF Accuracy: %.2f' % accuracy_score(y_test, y_pred_forest))

RF Misclassified samples: 81543
RF Accuracy: 0.63


In [51]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least squares with the star rating treated as a continuous target.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# One fitted weight per feature column.
print('Coefficients: \n', regr.coef_)

# Predict on both halves so training and held-out error can be compared.
y_train_pred = regr.predict(X_train)
y_test_pred = regr.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
))
print('R^2 train: %.3f, test: %.3f' % (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
))

Coefficients: 
 [ 0.1253525   0.17802713  0.23720847 ...,  0.25737311  0.405103   -0.05002917]
MSE train: 0.946, test: 0.955
R^2 train: 0.502, test: 0.497


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with a large C (i.e. weak regularisation) and a fixed seed.
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the number of wrong predictions and the overall accuracy.
print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

# Fitted weights: one row per class, one column per feature.
print('Coefficients: \n', lr.coef_)

Misclassified samples: 75688
Accuracy: 0.65
Coefficients: 
 [[-0.44139622  0.13846687 -0.37570244 ..., -0.28690131  0.35510785
  -0.35502655]
 [ 0.47916432  0.58447602  1.1143142  ..., -0.13589656 -0.80188723
   0.17620971]
 [ 0.74079549  0.36614293 -0.46753793 ...,  0.1622233  -1.14569812
  -0.19325826]
 [-0.39792094 -0.78416393  0.3383127  ...,  0.34606905 -0.73060717
   0.47180589]
 [ 0.26259148  0.65216364  0.29575746 ...,  0.18232428  1.3106417
  -0.33293791]]


In [23]:
# Hand-written sentiment lexicon, split into six buckets ordered from most
# positive to most negative. Contractions are spelled without apostrophes
# ('dont', 'wasnt', ...), presumably to match the punctuation-stripped review
# text; misspellings such as 'awsome' are presumably intentional for the same
# reason -- TODO confirm.

# Strongest positive wording.
extremely_positive = ['amazed', 'amazing', 'amazingly','awesome','awsome','best','brilliant','excellent','fantastic','great','greater', 'greatest','incredible',
                     'love', 'loved', 'loves', 'loving','perfect', 'perfectly','terrific','wonderful']
# Clearly positive wording.
positive = ['accurate','addicted', 'addicting', 'addictive','adorable','clean','compatible','clever','convenient', 'cool',
           'creative','cute','decent', 'entertained', 'enjoy', 'enjoyable', 'enjoyed', 'enjoying', 'enjoyment', 'entertaining','excited', 'exciting','fun','funny','interested', 'interesting','glad','good','happy','helpful',
           'like', 'liked', 'likes','nice', 'nicely','pleasant', 'pleasantly', 'pleased','popular','positive','pretty',
           'recommend', 'recommended','relax', 'relaxing','smart','smooth', 'smoothly','special','super','sweet','thank', 'thanks',
           'useful']
# Mildly positive wording.
somewhat_positive = ['active','able','agree','average','clear','crafting','deserves','improved','ok', 'okay','properly',
                    'quick', 'quickly','simple', 'simply'] 
# Mildly negative wording.
somewhat_negative = ['bye','challenge','challenges', 'challenging','complex', 'complicated','didnt','dont','frequently',
                     'isnt','lock', 'locked','need', 'needed', 'needs','offline','quit','reboot','stop', 'stopped', 'stops',
                    'twist','wasnt']
# Clearly negative wording.
# NOTE(review): 'thrilled' is listed here although it normally reads as
# positive -- confirm this placement is intended.
negative = ['angry','annoying','anymore','away','barely','bored', 'boring','complain', 'complaining', 'complaint', 
            'crazy','complaints','confused', 'confusing','creepy','disappointed', 'disappointing', 'disappointment',
           'error', 'errors','exit','expensive','frustrated', 'frustrating', 'frustration','hard','issue', 'issues',
           'lack', 'lacks','lag', 'laggy','late','latest','limit', 'limited','little','lose', 'losing', 'lost',
           'miss', 'missed', 'missing', 'mistake','no','not','never','problem', 'problems','reinstall','sad','scared', 'scary',
           'shame','shut', 'sick','slow','small','sorry','stuck','stumped','thrilled','tired','tough',
           'tricky', 'tried', 'tries','trouble','try', 'trying','unable','upset','weird']
# Strongest negative wording.
# NOTE(review): 'nether' looks out of place in a sentiment bucket -- confirm.
extremely_negative = ['avoid','awful','bad','bug','buggy', 'bugs','crap','crash', 'crashed', 'crashes', 'crashing', 'delete', 
                      'deleted', 'deleting','destroy','dumb','failed','freeze', 'freezes', 'freezing','hate','horrible',
                     'impossible','negative', 'nether','regret','ridiculous','silly','stupid','suck', 'sucks','terrible',
                     'unfortunately', 'uninstall','uninstalled', 'uninstalling','useless','waste', 'wasted', 'waster', 'wasting',
                     'wont','worthless','worse','worst','wouldnt']
# Full lexicon: the six buckets concatenated in the order defined above.
important_words = extremely_positive+positive+somewhat_positive+somewhat_negative+negative+extremely_negative
# Lexicon words that actually survived into the fitted TF-IDF vocabulary,
# in lexicon order. vocabulary_ is the fitted term -> column-index mapping, so
# each membership test is a constant-time dict lookup; the previous form
# rebuilt the full feature-name list once per lexicon word and relied on
# get_feature_names(), which scikit-learn 1.2 removed.
intersection = [val for val in important_words if val in vectorizer.vocabulary_]

In [33]:
# Show the component loadings of the lexicon words. Select via `intersection`
# (the lexicon words present in the vocabulary) rather than the raw lexicon:
# indexing with a word that min_df/max_df pruned out of the vocabulary raises
# KeyError. When every lexicon word is present the output is unchanged.
Vt[intersection]

Unnamed: 0,amazed,amazing,amazingly,awesome,awsome,best,brilliant,excellent,fantastic,great,...,useless,waste,wasted,waster,wasting,wont,worthless,worse,worst,wouldnt
0,0.000494,0.016885,0.000679,0.043031,0.001812,0.043340,0.000679,0.006718,0.003828,0.124486,...,0.000862,0.008833,0.002003,0.002379,0.000982,0.012633,0.000296,0.001232,0.002563,0.005940
1,0.000280,0.000378,0.000471,0.013796,0.000197,-0.001419,-0.000058,0.000984,0.000299,0.089588,...,0.000692,0.009893,0.001902,0.005140,0.001207,0.011830,0.000351,0.000763,-0.000270,0.005222
2,0.000272,0.007145,0.000385,0.963306,0.001074,0.017611,0.000032,-0.000830,-0.000004,-0.013812,...,0.000126,0.000173,0.000162,-0.001204,-0.000104,0.002457,0.000036,0.000136,0.000147,0.000280
3,-0.000274,-0.007868,0.000301,0.241191,-0.001147,-0.030070,-0.000267,-0.001834,-0.001152,0.002649,...,-0.001093,-0.009897,-0.002584,0.001573,-0.000370,-0.012604,-0.000490,-0.001505,-0.002688,-0.005692
4,-0.000039,-0.000532,-0.000016,0.004666,-0.000010,-0.002253,-0.000024,-0.000362,-0.000129,-0.009373,...,-0.000051,0.000489,0.000017,-0.000086,-0.000013,-0.000066,0.000006,0.000071,0.000473,-0.000161
5,-0.000330,-0.009925,-0.000280,0.063413,-0.000834,-0.025267,-0.000258,-0.002135,-0.002090,-0.160829,...,-0.000527,-0.003020,-0.000614,-0.000110,-0.000035,-0.009492,-0.000190,0.000265,-0.000252,-0.001223
6,-0.000110,0.002458,0.000035,-0.039966,0.001279,0.005243,-0.000055,-0.003656,-0.001260,-0.129238,...,-0.001900,-0.019974,-0.003291,-0.004673,-0.001254,-0.014512,-0.000752,-0.001605,-0.003667,-0.009253
7,-0.000121,-0.012329,-0.000289,0.046848,-0.001368,-0.022204,-0.000325,-0.001069,-0.001368,-0.171386,...,-0.000300,-0.002938,-0.000423,-0.000727,-0.000283,0.001818,-0.000255,-0.000297,-0.001326,-0.001890
8,-0.000025,-0.008557,-0.000002,0.038614,-0.001787,-0.013396,0.000436,0.007768,0.003334,0.630065,...,-0.000771,-0.005800,-0.001975,0.010861,0.001079,-0.011608,-0.000522,-0.000766,-0.002711,-0.004936
9,0.000098,-0.000402,0.000169,0.000875,0.000325,0.005942,0.000184,-0.000837,-0.002015,-0.620100,...,0.000232,0.003776,0.000681,0.002311,0.000510,0.001684,0.000120,0.000452,0.000980,0.002300


In [23]:
# Write the filtered review table to data.csv without the index column.
# NOTE(review): `data` is only filtered by category and review_text in the
# cells above; rows with a missing star_rating were split into
# miss_star_rating/train but are still present in `data` -- confirm that
# exporting them is intended.
data.to_csv('data.csv', index = False)