# Idea:

Take best current model, analyze misclassifications

In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('enh_data.csv')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from vecstack import stacking




In [6]:
np.random.seed(42)
pdf = df.reindex(np.random.permutation(df.index))

In [7]:
T_train, T_test, X_train, X_test, y_train, y_test = train_test_split(pdf['clean_text'], pdf.loc[:, ['sentiment', 'subjectivity']], pdf['class'], test_size=.33, random_state=42)

In [6]:
vec = TfidfVectorizer(max_features=50000, ngram_range=(1, 4))
X_train_enc = vec.fit_transform(T_train)
X_test_enc = vec.transform(T_test)

In [12]:
lr = LogisticRegression(random_state=42, solver='lbfgs')
lr.fit(X_train_enc, y_train)
print(accuracy_score(y_test, lr.predict(X_test_enc)))

0.8272727272727273


In [13]:
svm = LinearSVC(random_state=42)
svm.fit(X_train_enc, y_train)
print(accuracy_score(y_test, svm.predict(X_test_enc)))

0.8621212121212121


In [9]:
df.head(1)

Unnamed: 0,text,clean_text,clean_text_lemma,clean_text_neg,text_lem_cor,sentiment,subjectivity,class
0,"plot : two teen couples go to a church party ,...",plot two teen couples go church party drink dr...,plot two teen couple go church party drink dri...,plot two teen couples go church party drink dr...,plot teen couple go church party drink drive g...,-0.011356,0.500699,0


In [21]:
vec = TfidfVectorizer(max_features=50000, ngram_range=(1, 4))
X_train_enc = vec.fit_transform(X_train)
X_test_enc = vec.transform(X_test)

In [12]:
lr = LogisticRegression(random_state=42, solver='lbfgs')
lr.fit(X_train_enc, y_train)
print(accuracy_score(y_test, lr.predict(X_test_enc)))

0.8227272727272728


In [13]:
svm = LinearSVC(random_state=42)
svm.fit(X_train_enc, y_train)
print(accuracy_score(y_test, svm.predict(X_test_enc)))

0.8469696969696969


In [14]:
misclassified = np.where(y_test != svm.predict(X_test_enc))

In [15]:
for id in misclassified[0][:5]:
    x, y = X_test.iloc[id], y_test.iloc[id]
    print(f"True label {y} | Pred label {svm.predict(X_test_enc[id])}")
    print(x[:100])
    xblob = TextBlob(x)
    print("Sentiment=", xblob.sentiment)

True label 0 | Pred label [1]
various film see seattle film festival true men raft u director orson welles recently uncover fourth
Sentiment= Sentiment(polarity=0.07661624551159435, subjectivity=0.46390566797543564)
True label 1 | Pred label [0]
lake placid definately typical creature attack people movie maybe enjoyable clever actually come com
Sentiment= Sentiment(polarity=0.11121455075536711, subjectivity=0.6121841593780368)
True label 1 | Pred label [0]
boom introduction music finish camera sweep red mountain see figure look barren red landscape kiss w
Sentiment= Sentiment(polarity=0.09758771929824561, subjectivity=0.5733474310776943)
True label 0 | Pred label [1]
giant begin monologue funny distinctive princess medieval fairy tale feel pleasantly surprise sharp 
Sentiment= Sentiment(polarity=0.09841327561327565, subjectivity=0.6619111592111593)
True label 0 | Pred label [1]
walt disney studio finally meet match lush animation twentieth century fox anastasia judge late effo
Sentimen

In [19]:
lr.predict_proba(X_train_enc)

array([[0.6763202 , 0.3236798 ],
       [0.66848483, 0.33151517],
       [0.61928957, 0.38071043],
       ...,
       [0.67088125, 0.32911875],
       [0.38278369, 0.61721631],
       [0.66781901, 0.33218099]])

In [20]:
lr_pred = np.hstack([lr.predict_proba(X_train_enc)[:, 0], lr.predict_proba(X_test_enc)[:, 0]])

In [15]:
from vecstack import stacking
import lightgbm


In [16]:
models = [LogisticRegression(random_state=42), LinearSVC(random_state=42)]

In [18]:
S_train, S_test = stacking(models,
                          X_train_enc, y_train, X_test_enc,
                           verbose=2, random_state=42, shuffle=True, regression=False,
                          n_folds=4, metric=accuracy_score, mode='oof_pred_bag')

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [LogisticRegression]
    fold  0:  [0.77313433]
    fold  1:  [0.80298507]




    fold  2:  [0.78208955]
    fold  3:  [0.84477612]
    ----
    MEAN:     [0.80074627] + [0.02763210]
    FULL:     [0.80074627]

model  1:     [LinearSVC]
    fold  0:  [0.79701493]
    fold  1:  [0.81194030]
    fold  2:  [0.80597015]
    fold  3:  [0.85074627]
    ----
    MEAN:     [0.81641791] + [0.02051900]
    FULL:     [0.81641791]

model  2:     [RandomForestClassifier]
    fold  0:  [0.79701493]
    fold  1:  [0.77014925]
    fold  2:  [0.81492537]
    fold  3:  [0.77611940]
    ----
    MEAN:     [0.78955224] + [0.01772290]
    FULL:     [0.78955224]

model  3:     [GradientBoostingClassifier]
    fold  0:  [0.74626866]
    fold  1:  [0.82089552]
    fold  2:  [0.78805970]
    fold  3:  [0.80298507]
    ----
    MEAN:     [0.78955224] + [0.02756147]
    FULL:     [0.78955224]

model  4:     [LGBMClassifier]
    fold  0:  [0.79701493]
    fold  1:  [0.82089552]
    fold  2:  [0.80895522]
    fold  3:  [0.79701493]
    ----
    MEAN:     [0.80597015] + [0.00990037]
    FULL

In [19]:
gbm = lightgbm.LGBMClassifier(random_state=42)
gbm.fit(S_train, y_train)
accuracy_score(y_test, gbm.predict(S_test))

0.8242424242424242

In [22]:
S_train, S_test = stacking(models,
                          X_train_enc, y_train, X_test_enc,
                           verbose=2, random_state=42, shuffle=True, regression=False,
                          n_folds=4, metric=accuracy_score, mode='oof_pred_bag')

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [LogisticRegression]
    fold  0:  [0.78208955]
    fold  1:  [0.79402985]




    fold  2:  [0.79701493]
    fold  3:  [0.82388060]
    ----
    MEAN:     [0.79925373] + [0.01527574]
    FULL:     [0.79925373]

model  1:     [LinearSVC]
    fold  0:  [0.80000000]
    fold  1:  [0.81194030]
    fold  2:  [0.82388060]
    fold  3:  [0.85671642]
    ----
    MEAN:     [0.82313433] + [0.02114720]
    FULL:     [0.82313433]

model  2:     [RandomForestClassifier]
    fold  0:  [0.80298507]
    fold  1:  [0.80895522]
    fold  2:  [0.80895522]
    fold  3:  [0.78507463]
    ----
    MEAN:     [0.80149254] + [0.00978722]
    FULL:     [0.80149254]

model  3:     [GradientBoostingClassifier]
    fold  0:  [0.77611940]
    fold  1:  [0.80000000]
    fold  2:  [0.76417910]
    fold  3:  [0.79104478]
    ----
    MEAN:     [0.78283582] + [0.01374026]
    FULL:     [0.78283582]

model  4:     [LGBMClassifier]
    fold  0:  [0.81194030]
    fold  1:  [0.83582090]
    fold  2:  [0.78208955]
    fold  3:  [0.80895522]
    ----
    MEAN:     [0.80970149] + [0.01904082]
    FULL

In [29]:
S_train.shape

(1340, 5)

In [30]:
X_train.shape

(1340,)

In [52]:
S_train_cmb = np.vstack((X_train.T, S_train.T)).T

In [53]:
S_train_cmb.shape

(1340, 7)

In [54]:
S_test_cmb = np.vstack((X_test.T, S_test.T)).T

In [60]:
gbm = lightgbm.LGBMClassifier(random_state=42)
gbm.fit(S_train, y_train)
accuracy_score(y_test, gbm.predict(S_test))

0.8272727272727273

In [61]:
gbm.fit(S_train_cmb, y_train)
accuracy_score(y_test, gbm.predict(S_test_cmb))

0.806060606060606

In [11]:
df_imdb = pd.read_csv('clean_imdb.csv')
np.random.seed(42)
pdf_imdb = df_imdb.reindex(np.random.permutation(df_imdb.index))

In [20]:
pdf.head()

Unnamed: 0,text,clean_text,clean_text_lemma,clean_text_neg,text_lem_cor,sentiment,subjectivity,class
1860,the verdict : spine - chilling drama from horr...,verdict spine chilling drama horror maestro st...,verdict spine chilling drama horror maestro st...,verdict spine chilling drama horror maestro st...,verdict spine chill drama horror maestro steph...,0.134226,0.567851,1
353,""" the 44 caliber killer has struck again . "" s...",caliber killer struck starring john leguizamo ...,caliber killer struck starring john leguizamo ...,caliber killer struck starring john leguizamo ...,caliber killer struck star john leguizamo mira...,0.069654,0.449724,0
1333,in the company of men made a splash at the sun...,company men made splash sundance film festival...,company men made splash sundance film festival...,company men made splash sundance film festival...,company men make splash sundance film festival...,0.028181,0.465218,1
905,"in the year 2029 , captain leo davidson ( mark...",year captain leo davidson mark wahlberg boogie...,year captain leo davidson mark wahlberg boogie...,year captain leo davidson mark wahlberg boogie...,year captain davidson mark wahlberg boogie nig...,0.173886,0.533527,0
1289,[ note that followups are directed to rec . ar...,note followups directed rec arts movies curren...,note followup directed rec art movie current f...,note followups directed rec arts movies curren...,note followup direct rec art movie current fil...,0.142112,0.409155,1


In [21]:
X_train, X_test, y_train, y_test = pdf_imdb['clean_text'], pdf['clean_text'], pdf_imdb['class'], pdf['class']

In [28]:
vec = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
X_train_enc = vec.fit_transform(X_train)
X_test_enc = vec.transform(X_test)

In [29]:
lr = LogisticRegression(random_state=42)
svc = LinearSVC(random_state=42)
models = [lr, svc, SGDClassifier(random_state=42, penalty='elasticnet', l1_ratio=.1, learning_rate='optimal')]

In [30]:
S_train, S_test = stacking(models,
                          X_train_enc, y_train, X_test_enc,
                           verbose=2, random_state=42, shuffle=True, regression=False,
                          n_folds=5, metric=accuracy_score, mode='oof_pred_bag')

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [LogisticRegression]




    fold  0:  [0.90510000]
    fold  1:  [0.90910000]
    fold  2:  [0.89940000]
    fold  3:  [0.90000000]
    fold  4:  [0.90210000]
    ----
    MEAN:     [0.90314000] + [0.00358586]
    FULL:     [0.90314000]

model  1:     [LinearSVC]
    fold  0:  [0.91150000]
    fold  1:  [0.90850000]
    fold  2:  [0.90150000]
    fold  3:  [0.90680000]
    fold  4:  [0.90730000]
    ----
    MEAN:     [0.90712000] + [0.00324986]
    FULL:     [0.90712000]

model  2:     [SGDClassifier]
    fold  0:  [0.90010000]
    fold  1:  [0.90620000]
    fold  2:  [0.89420000]
    fold  3:  [0.89610000]
    fold  4:  [0.89960000]
    ----
    MEAN:     [0.89924000] + [0.00411271]
    FULL:     [0.89924000]



In [31]:
import lightgbm

In [32]:
gbm = lightgbm.LGBMClassifier(random_state=42)
gbm.fit(S_train, y_train)
accuracy_score(y_test, gbm.predict(S_test))

0.876

In [18]:
lr.fit(X_train_enc, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
accuracy_score(y_test, lr.predict(X_test_enc))

0.8695