In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the main dataset and the IMDB dataset from their CSV files.
df_main, df_imdb = (pd.read_csv(csv_path) for csv_path in ('clean_data.csv', 'clean_imdb.csv'))

In [3]:
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB

In [4]:
# Seed NumPy's global RNG so both shuffles below are reproducible.
np.random.seed(42)
# Draw the row orders first (main, then IMDB - same RNG call order as before),
# then reorder each frame by its shuffled index.
main_order = np.random.permutation(df_main.index)
imdb_order = np.random.permutation(df_imdb.index)
pdf_main = df_main.reindex(main_order)
pdf_imdb = df_imdb.reindex(imdb_order)

In [5]:
# Hold out 33% of the shuffled main data for testing; the split is made on
# the main dataset only, so the test set contains no IMDB rows.
X_train, X_test, y_train, y_test = train_test_split(
    pdf_main['clean_text'],
    pdf_main['class'],
    test_size=.33,
    random_state=42,
)

In [6]:
# "Enhanced" training set: the main-data training split followed by every IMDB row.
# Texts and labels are concatenated in the same order so they stay aligned.
X_enh = np.concatenate([X_train, pdf_imdb['clean_text']])
y_enh = np.concatenate([y_train, pdf_imdb['class']])

In [26]:
# TF-IDF with default settings. The vocabulary and IDF weights are learned from
# the enhanced training texts only; the test texts are merely transformed.
vectorizer = TfidfVectorizer()
X_train_enc = vectorizer.fit_transform(raw_documents=X_enh)
X_test_enc = vectorizer.transform(raw_documents=X_test)

In [29]:
# Baseline: logistic regression with default hyper-parameters (only the seed is set).
lr = LogisticRegression(random_state=42)
lr.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr.predict(X=X_test_enc))



0.8621212121212121

In [45]:
# Linear SVM with default hyper-parameters (only the seed is set).
svc = LinearSVC(random_state=42)
svc.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=svc.predict(X=X_test_enc))

0.8575757575757575

In [58]:
# Linear SVM with C raised to 10 (i.e. weaker regularisation than the default fit).
svc2 = LinearSVC(
    C=10,
    random_state=42,
)
svc2.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=svc2.predict(X=X_test_enc))

0.8484848484848485

In [43]:
# Logistic regression variant: L2 penalty, C=5, lbfgs solver, up to 500 iterations.
lr2 = LogisticRegression(
    penalty='l2',
    C=5,
    max_iter=500,
    solver='lbfgs',
    random_state=42,
)
lr2.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr2.predict(X=X_test_enc))

0.8666666666666667

In [44]:
# Ridge classifier with default hyper-parameters (only the seed is set).
ridge = RidgeClassifier(random_state=42)
ridge.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=ridge.predict(X=X_test_enc))

0.8545454545454545

In [46]:
from sklearn.ensemble import VotingClassifier

In [52]:
# Hard-voting (majority label) ensemble of three linear models, each built
# fresh here rather than reusing the estimators fitted in earlier cells.
vote = VotingClassifier(
    estimators=[
        ('svc', LinearSVC(random_state=42)),
        ('lr', LogisticRegression(C=5, max_iter=500, solver='lbfgs', random_state=42)),
        ('ridge', RidgeClassifier(random_state=42)),
    ],
    voting='hard',
    n_jobs=3,
)

In [53]:
# Fit every member of the ensemble on the enhanced training features.
vote.fit(X=X_train_enc, y=y_enh)

VotingClassifier(estimators=[('svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)), ('lr', LogisticRegression(C=5, class_weight=None, dual=False, fit...t=True,
        max_iter=None, normalize=False, random_state=42, solver='auto',
        tol=0.001))],
         flatten_transform=None, n_jobs=3, voting='hard', weights=None)

In [54]:
# Accuracy of the voting ensemble on the held-out test split.
accuracy_score(y_true=y_test, y_pred=vote.predict(X=X_test_enc))

0.8636363636363636

In [79]:
# Second feature set: uni- to tri-grams, vocabulary capped at 50,000 features.
# This overwrites vectorizer / X_train_enc / X_test_enc from the earlier cells.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
)
X_train_enc = vectorizer.fit_transform(raw_documents=X_enh)
X_test_enc = vectorizer.transform(raw_documents=X_test)

In [80]:
# Default logistic regression, refitted on the current TF-IDF features.
lr = LogisticRegression(random_state=42)
lr.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr.predict(X=X_test_enc))



0.8924242424242425

In [81]:
# Default linear SVM, refitted on the current TF-IDF features.
svc = LinearSVC(random_state=42)
svc.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=svc.predict(X=X_test_enc))

0.8666666666666667

In [82]:
# Linear SVM with C=10, refitted on the current TF-IDF features.
svc2 = LinearSVC(
    C=10,
    random_state=42,
)
svc2.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=svc2.predict(X=X_test_enc))

0.8393939393939394

In [83]:
# Logistic regression (L2, C=5, lbfgs, max 500 iterations) on the current TF-IDF features.
lr2 = LogisticRegression(
    penalty='l2',
    C=5,
    max_iter=500,
    solver='lbfgs',
    random_state=42,
)
lr2.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr2.predict(X=X_test_enc))

0.8787878787878788

In [84]:
# Default ridge classifier, refitted on the current TF-IDF features.
ridge = RidgeClassifier(random_state=42)
ridge.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=ridge.predict(X=X_test_enc))

0.8712121212121212

In [85]:
from sklearn.ensemble import VotingClassifier

In [86]:
# Rebuild the hard-voting ensemble (same three unfitted linear models as before)
# so it can be trained on the current TF-IDF features.
vote = VotingClassifier(
    estimators=[
        ('svc', LinearSVC(random_state=42)),
        ('lr', LogisticRegression(C=5, max_iter=500, solver='lbfgs', random_state=42)),
        ('ridge', RidgeClassifier(random_state=42)),
    ],
    voting='hard',
    n_jobs=3,
)

In [87]:
# Fit every member of the ensemble on the current training features.
vote.fit(X=X_train_enc, y=y_enh)

VotingClassifier(estimators=[('svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)), ('lr', LogisticRegression(C=5, class_weight=None, dual=False, fit...t=True,
        max_iter=None, normalize=False, random_state=42, solver='auto',
        tol=0.001))],
         flatten_transform=None, n_jobs=3, voting='hard', weights=None)

In [88]:
# Accuracy of the voting ensemble on the held-out test split.
accuracy_score(y_true=y_test, y_pred=vote.predict(X=X_test_enc))

0.8757575757575757

In [92]:
# Naive Bayes, first variant: multinomial NB with default settings
# on the current TF-IDF features.
multi_nb = MultinomialNB()
multi_nb.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=multi_nb.predict(X=X_test_enc))

0.8106060606060606

In [93]:
# Naive Bayes, second variant: complement NB with default settings.
cnb = ComplementNB()
cnb.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=cnb.predict(X=X_test_enc))

0.8106060606060606

In [130]:
# Third feature set: n-grams up to length 4, vocabulary still capped at 50,000.
# This overwrites vectorizer / X_train_enc / X_test_enc again.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=50000,
)
X_train_enc = vectorizer.fit_transform(raw_documents=X_enh)
X_test_enc = vectorizer.transform(raw_documents=X_test)

In [131]:
# Default logistic regression on the (1, 4)-gram TF-IDF features.
lr = LogisticRegression(random_state=42)
lr.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr.predict(X=X_test_enc))



0.8939393939393939

In [134]:
# Fourth feature set: unigrams only, vocabulary cap raised to 100,000.
# This overwrites vectorizer / X_train_enc / X_test_enc again.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=100000,
)
X_train_enc = vectorizer.fit_transform(raw_documents=X_enh)
X_test_enc = vectorizer.transform(raw_documents=X_test)

In [136]:
# Logistic regression with the lbfgs solver on the unigram TF-IDF features.
lr = LogisticRegression(
    solver='lbfgs',
    random_state=42,
)
lr.fit(X=X_train_enc, y=y_enh)
# The cell's value is the accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=lr.predict(X=X_test_enc))

0.8621212121212121

In [20]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Flatten, Conv1D, MaxPooling1D, Dropout
from keras.optimizers import Adam

In [32]:
import os
# Parse the GloVe vector file into a {word: float32 coefficient array} lookup.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are its vector coefficients.
embeddings_index = {}
# `with` guarantees the handle is closed even if parsing raises, and the explicit
# encoding keeps decoding independent of the platform's default locale
# (the GloVe text files are UTF-8).
with open(os.path.join('../', 'glove.6B.50d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [8]:
# Build the word index from the enhanced training texts only (the test texts
# are never used for fitting); num_words=50000 is passed as the tokenizer's
# vocabulary limit.
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_enh)

# Turn both corpora into lists of integer word-index sequences.
seq_train, seq_test = (tokenizer.texts_to_sequences(texts) for texts in (X_enh, X_test))

In [40]:
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 300

# One row per tokenizer index, plus one extra row so that index
# len(word_index) is addressable; rows start as zeros.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [41]:
# Embedding layer initialised from the pre-built matrix; trainable=False keeps
# those weights fixed while the rest of the network is trained.
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [27]:
# Pad/truncate every sequence to one fixed length. The length comes from
# MAX_SEQUENCE_LENGTH - the same constant the Embedding layer uses for
# input_length - instead of a repeated literal, so the two cannot drift apart.
pad_train = pad_sequences(seq_train, maxlen=MAX_SEQUENCE_LENGTH)
pad_test = pad_sequences(seq_test, maxlen=MAX_SEQUENCE_LENGTH)

In [44]:
# Binary classifier over the fixed pre-trained embeddings:
# embedding -> LSTM(100) with input and recurrent dropout of 0.2
# -> two 100-unit ReLU layers -> one sigmoid output unit.
model = Sequential([
    embedding_layer,
    LSTM(100, dropout=.2, recurrent_dropout=.2),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
# First training run: 2 epochs, mini-batches of 64.
# NOTE(review): validation_data is the held-out test split, so the reported
# validation metrics are test metrics - do not use them for model selection.
model.fit(
    x=pad_train,
    y=y_enh,
    batch_size=64,
    epochs=2,
    validation_data=(pad_test, y_test),
)

Train on 51340 samples, validate on 660 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f8fe97946d8>

In [46]:
# Second training run on the same `model` object: 5 more epochs, mini-batches of 64.
# The model is not rebuilt before this call, so it continues from the weights
# left by the previous fit.
# NOTE(review): validation_data is the held-out test split, so the reported
# validation metrics are test metrics - do not use them for model selection.
model.fit(
    x=pad_train,
    y=y_enh,
    batch_size=64,
    epochs=5,
    validation_data=(pad_test, y_test),
)

Train on 51340 samples, validate on 660 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8fe973b160>