In [103]:
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn import pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [22]:
df = pd.read_pickle('../data/clean-data.pkl')

In [23]:
df.head(3)

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,num_of_sen,num_of_word
150523,0,0,1,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,4,74
150500,2,2,1,940809600,This whole series is great way to spend time w...,i can remember seeing the show when it aired o...,5,78
451855,0,0,1,944092800,Entertainingl Funny,beetlejuice is a well written movie everything...,7,29


In [24]:
Y = df.Score
X = df.drop(labels=['Score','Text','Summary','Time'],axis=1)
pred = {}

In [25]:
num, den = X.HelpfulnessNumerator.tolist(), X.HelpfulnessDenominator.tolist()
usefulness = []
for i in range(len(num)):
    if(den[i]==0):
        usefulness.append(-1)
    else:
        usefulness.append(num[i]/den[i])
    #usefulness.append(num[i]/(den[i]+1e-5))

In [26]:
X['usefullness'] = usefulness

In [27]:
X.head(3)

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,num_of_sen,num_of_word,usefullness
150523,0,0,4,74,-1.0
150500,2,2,5,78,1.0
451855,0,0,7,29,-1.0


In [28]:
l = int(0.8*df.shape[0])

### Logistic Regression:
***

In [44]:
lr = LogisticRegression(n_jobs=-1,penalty='l2',C=1e-5,class_weight='balanced',verbose=1) # I tried a bunch of parameters without actually using scikit-learn's GridSearchCV

In [45]:
lr.fit(X[:l],Y[:l])

  " = {}.".format(self.n_jobs))


[LibLinear]

LogisticRegression(C=1e-05, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [46]:
roc_auc_score(Y[l:],lr.predict(X[l:]))

0.60220112920430913

### Xgboost:
***

In [77]:
y = Y.tolist()
wt = y.count(0)/y.count(1)

In [78]:
xgb = XGBClassifier(silent=True,nthread=4,scale_pos_weight=wt)

In [79]:
# Fitting the model
xgb.fit(X[:l],Y[:l])

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=0.18695394423549763, seed=0, silent=True,
       subsample=1)

In [80]:
roc_auc_score(Y[l:],xgb.predict(X[l:]))

0.59909625567557723

### RF: 
***

In [111]:
n_jobs = 4
n_folds=10
my_rand_state = 0
skfold = StratifiedKFold(n_splits=n_folds,random_state=my_rand_state, shuffle=False)
vt = VarianceThreshold()
std_scale = StandardScaler()
threshold=[p*(1-p) for p in [0, 0.05, 0.1, 0.15]]
class_weight=['balanced']
class_weight.extend([{1: w} for w in [1, 2, 10]])

In [106]:
rf_clf=RandomForestClassifier()
n_estimators=[100,200]
max_features=[.1,.3,.5]

In [113]:
rf_clf_b = pipeline.Pipeline(steps=[('vt',vt),('scale',std_scale),('clf',rf_clf)])

rf_clf_est_b = GridSearchCV(estimator=rf_clf_b,cv=skfold,
              scoring='roc_auc',n_jobs=n_jobs,
              param_grid=dict(vt__threshold=threshold,
              clf__n_estimators=n_estimators,
              clf__max_features=max_features,
              clf__class_weight=class_weight))

In [114]:
# Fitting the model
rf_clf_est_b.fit(X[:l],Y[:l])

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vt', VarianceThreshold(threshold=0.0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_dec...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'vt__threshold': [0, 0.0475, 0.09000000000000001, 0.1275], 'clf__n_estimators': [100, 200], 'clf__max_features': [0.1, 0.3, 0.5], 'clf__class_weight': ['balanced', {1: 1}, {1: 2}, {1: 10}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [115]:
roc_auc_score(Y[l:],rf.predict(X[l:]))

0.5695267324165153