# Idea:

Take the best current model and analyze its misclassifications.

In [1]:
import numpy as np
import pandas as pd

In [15]:
# Load the dataset from a CSV file; the path is relative to the working directory.
DATA_PATH = 'enh_data.csv'
df = pd.read_csv(DATA_PATH)

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

In [18]:
# Shuffle the rows reproducibly: seed NumPy's global RNG, draw a random
# permutation of the index labels, then reorder the frame by those labels.
np.random.seed(42)
shuffled_index = np.random.permutation(df.index)
pdf = df.reindex(shuffled_index)

In [19]:
# Hold out 33% of the shuffled rows for evaluation, using the `clean_text`
# column as the model input and `class` as the target.
features = pdf['clean_text']
labels = pdf['class']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=.33,
    random_state=42,
)

In [20]:
# TF-IDF over 1- to 4-grams, capped at 50,000 features.
vec = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=50000,
)
# Fit on the training texts only; the test texts are encoded with the
# already-fitted vectorizer.
X_train_enc = vec.fit_transform(X_train)
X_test_enc = vec.transform(X_test)

In [21]:
# Logistic regression on the encoded training set; print accuracy on the
# held-out split.
lr = LogisticRegression(solver='lbfgs', random_state=42).fit(X_train_enc, y_train)
lr_test_pred = lr.predict(X_test_enc)
print(accuracy_score(y_test, lr_test_pred))

0.8272727272727273


In [22]:
# Linear SVM on the same encoded features; print accuracy on the held-out split.
svm = LinearSVC(random_state=42).fit(X_train_enc, y_train)
svm_test_acc = accuracy_score(y_test, svm.predict(X_test_enc))
print(svm_test_acc)

0.8621212121212121


In [23]:
# Peek at the first row to see which columns are available.
df.head(n=1)

Unnamed: 0,text,clean_text,clean_text_lemma,clean_text_neg,text_lem_cor,sentiment,subjectivity,class
0,"plot : two teen couples go to a church party ,...",plot two teen couples go church party drink dr...,plot two teen couple go church party drink dri...,plot two teen couples go church party drink dr...,plot teen couple go church party drink drive g...,-0.011356,0.500699,0


In [24]:
# Same 33% hold-out split as before, but the model input is now the
# `text_lem_cor` column instead of `clean_text`.
features = pdf['text_lem_cor']
labels = pdf['class']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=.33,
    random_state=42,
)

In [25]:
# Re-create the TF-IDF vectorizer (1- to 4-grams, at most 50,000 features)
# and fit it on the current training texts.
vec = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=50000,
)
X_train_enc = vec.fit_transform(X_train)
# The test texts are only transformed, never used for fitting.
X_test_enc = vec.transform(X_test)

In [26]:
# Refit logistic regression on the current encoded training set (this rebinds
# `lr`) and print accuracy on the held-out split.
lr = LogisticRegression(solver='lbfgs', random_state=42).fit(X_train_enc, y_train)
lr_test_pred = lr.predict(X_test_enc)
print(accuracy_score(y_test, lr_test_pred))

0.8227272727272728


In [27]:
# Refit the linear SVM on the current encoded training set (this rebinds `svm`,
# which the cells below use) and print accuracy on the held-out split.
svm = LinearSVC(random_state=42).fit(X_train_enc, y_train)
svm_test_acc = accuracy_score(y_test, svm.predict(X_test_enc))
print(svm_test_acc)

0.8469696969696969


In [28]:
# Locate the test rows the SVM gets wrong. `np.where` with a single argument
# returns a tuple of index arrays, so the positions are in `misclassified[0]`.
# NOTE(review): in the recorded run this SVM (0.847) scores below the one
# trained on `clean_text` (0.862) — confirm this is the model meant to be analyzed.
svm_test_pred = svm.predict(X_test_enc)
misclassified = np.where(y_test != svm_test_pred)

In [59]:
# Inspect the first five misclassified test documents: true vs. predicted label,
# the first 100 characters of the text, and TextBlob's sentiment for the full text.
# The loop variable is `pos` rather than `id` so the builtin `id()` is not left
# rebound to an integer in the kernel namespace after this cell runs.
for pos in misclassified[0][:5]:
    # `pos` is a 0-based position within the test split, hence `.iloc`.
    x, y = X_test.iloc[pos], y_test.iloc[pos]
    # Predicting on a single-row slice yields a length-1 array, which is why
    # the predicted label is printed in brackets.
    print(f"True label {y} | Pred label {svm.predict(X_test_enc[pos])}")
    print(x[:100])
    xblob = TextBlob(x)
    print("Sentiment=", xblob.sentiment)

True label 0 | Pred label [1]
various film see seattle film festival true men raft u director orson welles recently uncover fourth
Sentiment= Sentiment(polarity=0.07661624551159435, subjectivity=0.46390566797543564)
True label 1 | Pred label [0]
lake placid definately typical creature attack people movie maybe enjoyable clever actually come com
Sentiment= Sentiment(polarity=0.11121455075536711, subjectivity=0.6121841593780368)
True label 1 | Pred label [0]
boom introduction music finish camera sweep red mountain see figure look barren red landscape kiss w
Sentiment= Sentiment(polarity=0.09758771929824561, subjectivity=0.5733474310776943)
True label 0 | Pred label [1]
giant begin monologue funny distinctive princess medieval fairy tale feel pleasantly surprise sharp 
Sentiment= Sentiment(polarity=0.09841327561327565, subjectivity=0.6619111592111593)
True label 0 | Pred label [1]
walt disney studio finally meet match lush animation twentieth century fox anastasia judge late effo
Sentimen