In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import itertools
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers, Model

In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# from it. google.colab is only importable inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **LSTM**

In [None]:
# Read the training CSV from the mounted Drive. header=0 combined with `names`
# discards the file's own header row and applies the column names below.
names = ['index','review', 'sentiment', 'label']
df = pd.read_csv(
    '/content/drive/MyDrive/imdb/train_data.csv',
    sep=',',
    names=names,
    header=0,
)
# Discard rows with any missing field, then renumber the remaining rows from 0.
df = df.dropna(how='any').reset_index(drop=True)
# Coerce every review to a unicode string before it reaches the vectoriser.
df["review"] = df['review'].values.astype('U')
X, Y = df['review'].to_numpy(), df['label'].to_numpy()
print(X.shape)

(10000,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Upper bound on the vocabulary size kept by the vectoriser.
MAX_FEATURES = 10000
# Fit on the training split only, so the vocabulary and IDF weights are not
# influenced by the validation reviews.
tfidf = TfidfVectorizer(max_features = MAX_FEATURES)
tfidf.fit(X_tr)
# .toarray() yields a plain 2-D ndarray. The previous .todense() returned
# np.matrix, which NumPy deprecates, which recent scikit-learn releases reject
# as estimator input, and which cannot be reshaped to the 3-D LSTM layout.
X_train = tfidf.transform(X_tr).toarray()
X_tr=X_train

# Validation reviews go through the already-fitted vectoriser (transform only).
X_val=tfidf.transform(X_val).toarray()

In [None]:
# Make sure both training features and labels are plain ndarrays (copies).
X_tr, Y_tr = np.array(X_tr), np.array(Y_tr)

In [None]:
# Fit an Isolation Forest on the training features and record the row
# positions it labels as outliers (prediction == -1).
clf_Iso = IsolationForest(random_state=np.random.RandomState(0), n_jobs=-1)
Y_Iso_Forest = clf_Iso.fit(X_tr).predict(X_tr)
result = list(np.flatnonzero(Y_Iso_Forest == -1))

In [None]:
# Drop the rows flagged by the Isolation Forest (positions in `result`) from
# both the features and the labels so the two stay aligned.
# The former `if Y_tr is None` branch was removed: Y_tr has already been
# converted with np.array, so it is never None, and taking that branch would
# have left Y_removed undefined for the assignment below.
X_removed = np.delete(X_tr, result, axis=0)
Y_removed = np.delete(Y_tr, result, axis=0)
assert X_removed.shape[0] == Y_removed.shape[0], "features and labels out of sync"
X_tr = X_removed
Y_tr = Y_removed

In [None]:
# The LSTM consumes (samples, timesteps, features); every review is presented
# as a single timestep, so look_back is 1.
look_back=1
num_samples, num_features = X_tr.shape[:2]
X_tr = np.array(X_tr).reshape((num_samples, look_back, num_features))

In [None]:
# Mini-batch size passed to model.fit in the cross-validation loop below.
batch_size=128

In [None]:
def create_model(look_back=None, input_nodes=None, activation='relu',
                optimizer='adam', hidden_layers=2, neurons=400, hidden_units=600):
    """Build and compile a binary classifier: one LSTM layer plus a dense head.

    Args:
        look_back: number of timesteps per sample (first entry of input_shape).
        input_nodes: number of features per timestep (second entry of input_shape).
        activation: activation used by the hidden Dense layers.
        optimizer: optimizer handed to ``model.compile``.
        hidden_layers: how many hidden Dense layers follow the LSTM.
        neurons: width of each hidden Dense layer.
        hidden_units: number of units in the LSTM layer.

    Returns:
        A compiled ``keras.Sequential`` model ending in a single sigmoid unit,
        trained with binary cross-entropy and reporting accuracy.
    """
    layers = [
        keras.layers.LSTM(hidden_units, dropout=0.2,
                          input_shape=(look_back, input_nodes)),
    ]
    # Fully connected hidden stack on top of the LSTM output.
    layers.extend(
        keras.layers.Dense(neurons, activation=activation)
        for _ in range(hidden_layers)
    )
    # Single sigmoid unit -> probability of the positive class.
    layers.append(keras.layers.Dense(1, activation='sigmoid'))

    model = keras.Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [None]:
epochs = 5 # can change this
kf = KFold(n_splits=3, random_state=None)
acc_list = []
# Cross-validate on the prepared training tensor. The folds must be derived
# from X_tr (vectorised, outlier-filtered, reshaped): splitting the raw review
# array X produced indices beyond the end of X_tr / Y_tr.
for train_index, test_index in kf.split(X_tr):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    Y_train, Y_test = Y_tr[train_index], Y_tr[test_index]
    # A fresh model per fold so no weights carry over between folds.
    model = create_model(look_back=look_back, input_nodes=num_features)
    history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=epochs, batch_size=batch_size)
    print("----Start Evaluating----")
    _, acc = model.evaluate(X_test, Y_test, verbose=1)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
# NOTE: after the loop `model` is the network trained on the last fold only;
# the following cells evaluate that model on the validation split.
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

In [None]:
# The LSTM was trained on (samples, look_back, features) tensors while X_val is
# still the 2-D vectorised matrix, so add the look_back axis before evaluating.
# np.asarray also normalises a possible np.matrix to a plain ndarray.
X_val_seq = np.asarray(X_val).reshape(X_val.shape[0], look_back, X_val.shape[1])
_,acc_val = model.evaluate(X_val_seq, Y_val, verbose=1)
print('Validation accuracy:', acc_val)

In [None]:
# Predict on the validation split. The 2-D feature matrix is first given the
# (samples, look_back, features) layout the LSTM was trained with; the
# (samples, 1) sigmoid output is then flattened to one probability per review.
X_val_seq = np.asarray(X_val).reshape(X_val.shape[0], look_back, X_val.shape[1])
lstm_probs = model.predict(X_val_seq).reshape(X_val.shape[0])

np.savetxt('lstm_probs.csv', lstm_probs, delimiter=',', header='probs')

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
lstm_preds = np.asarray(lstm_probs >= 0.5, dtype="int32")
np.savetxt(
    'lstm_preds.csv',
    lstm_preds,
    delimiter=',',
    header='preds',
)

In [None]:
# Confusion matrix of the LSTM on the validation split (rows: true labels,
# columns: predicted labels). The labels live in `Y_val`; the previous
# lower-case `y_val` was never defined.
lstm_cm = confusion_matrix(np.array(Y_val), lstm_preds)
print(lstm_cm)

In [None]:
# Keep the best estimator of a hyper-parameter search, if one was run.
# No cell in this notebook defines `lstm_gs`, so the lookup is guarded instead
# of raising NameError on a fresh kernel.
if "lstm_gs" in globals():
    # save best model
    lstm_best = lstm_gs.best_estimator_
    # show the winning hyper-parameters
    print(lstm_gs.best_params_)
else:
    lstm_best = None
    print("lstm_gs is not defined: no grid search was run for the LSTM.")

## **SVM**

In [None]:
# Reload the training CSV for the SVM section. header=0 combined with `names`
# discards the file's own header row and applies the column names below.
names = ['index','review', 'sentiment', 'label']
df = pd.read_csv(
    '/content/drive/MyDrive/imdb/train_data.csv',
    sep=',',
    names=names,
    header=0,
)
# Discard rows with any missing field, then renumber the remaining rows from 0.
df = df.dropna(how='any').reset_index(drop=True)
# Coerce every review to a unicode string before it reaches the vectoriser.
df["review"] = df['review'].values.astype('U')
X, Y = df['review'].to_numpy(), df['label'].to_numpy()
print(X.shape)

In [None]:
# Same 75/25 split and seed as the LSTM section.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Upper bound on the vocabulary size kept by the vectoriser.
MAX_FEATURES = 10000
# Fit on the training split only, so the vocabulary and IDF weights are not
# influenced by the validation reviews.
tfidf = TfidfVectorizer(max_features = MAX_FEATURES)
tfidf.fit(X_tr)
# .toarray() yields a plain 2-D ndarray. The previous .todense() returned
# np.matrix, which NumPy deprecates and recent scikit-learn releases reject
# when X_val is later passed to svm.score / predict.
X_train = tfidf.transform(X_tr).toarray()
X_tr=X_train

# Validation reviews go through the already-fitted vectoriser (transform only).
X_val=tfidf.transform(X_val).toarray()

In [None]:
# Make sure both training features and labels are plain ndarrays (copies).
X_tr, Y_tr = np.array(X_tr), np.array(Y_tr)

In [None]:
# Fit an Isolation Forest on the training features and record the row
# positions it labels as outliers (prediction == -1).
clf_Iso = IsolationForest(random_state=np.random.RandomState(0), n_jobs=-1)
Y_Iso_Forest = clf_Iso.fit(X_tr).predict(X_tr)
result = list(np.flatnonzero(Y_Iso_Forest == -1))

In [None]:
# Drop the rows flagged by the Isolation Forest (positions in `result`) from
# both the features and the labels so the two stay aligned.
# The former `if Y_tr is None` branch was removed: Y_tr has already been
# converted with np.array, so it is never None, and taking that branch would
# have left Y_removed undefined for the assignment below.
X_removed = np.delete(X_tr, result, axis=0)
Y_removed = np.delete(Y_tr, result, axis=0)
assert X_removed.shape[0] == Y_removed.shape[0], "features and labels out of sync"
X_tr = X_removed
Y_tr = Y_removed

In [None]:
kf = KFold(n_splits=10)
# probability=True is required because a later cell calls svm.predict_proba;
# without it SVC does not expose that method. It makes fitting slower, since
# scikit-learn calibrates the probabilities with an internal cross-validation.
svm = SVC(C=1, kernel='rbf', probability=True)
acc_list = []
# 10-fold cross-validation on the outlier-filtered training features.
for train_index, test_index in kf.split(X_tr):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    Y_train, Y_test = Y_tr[train_index], Y_tr[test_index]
    # fit() re-trains the same estimator object from scratch on every fold.
    svm.fit(X_train, Y_train)
    print("----Start Evaluating----")
    acc = svm.score(X_test, Y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
# NOTE: after the loop `svm` holds the fit from the last fold only; the
# following cells evaluate that fit on the validation split.
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

In [None]:
# Accuracy of the SVM on the held-out validation split.
acc_val = accuracy_score(Y_val, svm.predict(X_val))
print('Validation accuracy:', acc_val)

In [None]:
# Persist the SVM outputs for the validation split: column 1 of predict_proba
# (second class in svm.classes_) and the hard class predictions.
svm_proba = svm.predict_proba(X_val)
svm_probs = svm_proba[:, 1]
np.savetxt('svm_probs.csv', svm_probs, delimiter=',', header='probs')

svm_preds = svm.predict(X_val)
np.savetxt('svm_preds.csv', svm_preds, delimiter=',', header='preds')

In [None]:
# Confusion matrix of the SVM on the validation split
# (rows: true labels, columns: predicted labels).
svm_cm = confusion_matrix(y_true=np.array(Y_val), y_pred=svm_preds)
print(svm_cm)

## **LOGREG**

In [None]:
# Reload the training CSV for the logistic-regression section. header=0
# combined with `names` discards the file's own header row and applies the
# column names below.
names = ['index','review', 'sentiment', 'label']
df = pd.read_csv(
    '/content/drive/MyDrive/imdb/train_data.csv',
    sep=',',
    names=names,
    header=0,
)
# Discard rows with any missing field, then renumber the remaining rows from 0.
df = df.dropna(how='any').reset_index(drop=True)
# Coerce every review to a unicode string before it reaches the vectoriser.
df["review"] = df['review'].values.astype('U')
X, Y = df['review'].to_numpy(), df['label'].to_numpy()
print(X.shape)

In [None]:
# Same 75/25 split and seed as the previous sections.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Upper bound on the vocabulary size kept by the vectoriser.
MAX_FEATURES = 10000
# Bag-of-words counts; the vocabulary is learned from the training split only.
cv = CountVectorizer(max_features = MAX_FEATURES)
cv.fit(X_tr)
# .toarray() yields a plain 2-D ndarray. The previous .todense() returned
# np.matrix, which NumPy deprecates and recent scikit-learn releases reject
# when X_val is later passed to logistic.score / predict.
X_train = cv.transform(X_tr).toarray()
X_tr=X_train

# Validation reviews go through the already-fitted vectoriser (transform only).
X_valid=cv.transform(X_val).toarray()
X_val=X_valid


In [None]:
# Make sure both training features and labels are plain ndarrays (copies).
X_tr, Y_tr = np.array(X_tr), np.array(Y_tr)

In [None]:
# Fit an Isolation Forest on the training features and record the row
# positions it labels as outliers (prediction == -1).
clf_Iso = IsolationForest(random_state=np.random.RandomState(0), n_jobs=-1)
Y_Iso_Forest = clf_Iso.fit(X_tr).predict(X_tr)
result = list(np.flatnonzero(Y_Iso_Forest == -1))

In [None]:
# Drop the rows flagged by the Isolation Forest (positions in `result`) from
# both the features and the labels so the two stay aligned.
# The former `if Y_tr is None` branch was removed: Y_tr has already been
# converted with np.array, so it is never None, and taking that branch would
# have left Y_removed undefined for the assignment below.
X_removed = np.delete(X_tr, result, axis=0)
Y_removed = np.delete(Y_tr, result, axis=0)
assert X_removed.shape[0] == Y_removed.shape[0], "features and labels out of sync"
X_tr = X_removed
Y_tr = Y_removed

In [None]:
# Sanity check: X_tr and Y_tr should have the same number of rows after the
# outlier removal above.
print(X_tr.shape, Y_tr.shape)

In [None]:
# Record the dimensions and re-assert the 2-D (samples, features) layout that
# the scikit-learn estimator expects.
num_samples, num_features = X_tr.shape[:2]
X_tr = np.array(X_tr).reshape((num_samples, num_features))

In [None]:
# Hyper-parameters of the logistic-regression baseline.
C = 1
solver = 'sag'
kf = KFold(n_splits=5)
logistic = LogisticRegression(max_iter=500, C=C, solver=solver)
acc_list = []
# 5-fold cross-validation; fit() re-trains the same estimator on every fold.
for train_idx, test_idx in kf.split(X_tr):
    X_train, Y_train = X_tr[train_idx], Y_tr[train_idx]
    X_test, Y_test = X_tr[test_idx], Y_tr[test_idx]
    logistic.fit(X_train, Y_train)
    print("----Start Evaluating----")
    acc = logistic.score(X_test, Y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

In [None]:
# Accuracy of the logistic regression on the held-out validation split.
acc_val = accuracy_score(Y_val, logistic.predict(X_val))
print('Validation accuracy:', acc_val)

In [None]:
# Persist the logistic-regression outputs for the validation split: column 1
# of predict_proba (second class in logistic.classes_) and the hard predictions.
logreg_proba = logistic.predict_proba(X_val)
logreg_probs = logreg_proba[:, 1]
np.savetxt('logreg_probs.csv', logreg_probs, delimiter=',', header='probs')

logreg_preds = logistic.predict(X_val)
np.savetxt('logreg_preds.csv', logreg_preds, delimiter=',', header='preds')

In [None]:
# Confusion matrix of the logistic regression on the validation split
# (rows: true labels, columns: predicted labels).
lr_cm = confusion_matrix(y_true=np.array(Y_val), y_pred=logreg_preds)
print(lr_cm)

## **VOTING CLASSIFIER**

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import VotingClassifier


class KerasLSTMClassifier(ClassifierMixin, BaseEstimator):
    """Minimal scikit-learn adapter around the Keras LSTM from ``create_model``.

    VotingClassifier clones and refits every estimator it is given, which
    requires the scikit-learn estimator API (get_params/set_params, fit,
    predict). A bare ``keras.Sequential`` does not provide it, so the network
    is wrapped here. The adapter also adds the look_back axis the LSTM expects
    to the 2-D feature matrix.

    Assumes a binary problem whose labels are encoded as 0/1 (VotingClassifier
    label-encodes the targets before fitting its estimators).
    """

    def __init__(self, look_back=1, epochs=5, batch_size=128):
        self.look_back = look_back
        self.epochs = epochs
        self.batch_size = batch_size

    def _to_sequences(self, X):
        """Reshape (samples, features) to (samples, look_back, features)."""
        X = np.asarray(X)
        return X.reshape(X.shape[0], self.look_back, X.shape[1])

    def fit(self, X, y):
        """Build a fresh network and train it on X, y; returns self."""
        X_seq = self._to_sequences(X)
        self.classes_ = np.unique(y)
        self.model_ = create_model(look_back=self.look_back,
                                   input_nodes=X_seq.shape[2])
        self.model_.fit(X_seq, np.asarray(y), epochs=self.epochs,
                        batch_size=self.batch_size, verbose=0)
        return self

    def predict_proba(self, X):
        """Return an (n_samples, 2) array of [P(class 0), P(class 1)]."""
        positive = self.model_.predict(self._to_sequences(X), verbose=0).ravel()
        return np.column_stack((1.0 - positive, positive))

    def predict(self, X):
        """Return hard class labels using a 0.5 probability threshold."""
        hard = (self.predict_proba(X)[:, 1] >= 0.5).astype(int)
        return self.classes_[hard]


# (name, estimator) pairs for the ensemble. Plain ASCII quotes are required:
# the typographic quotes used previously are a SyntaxError in Python.
estimators=[
    ('lstm', KerasLSTMClassifier(look_back=look_back, epochs=epochs,
                                 batch_size=batch_size)),
    ('svm', svm),
    ('logistic', logistic),
]
# Hard voting: every estimator casts one vote and the majority label wins.
# VotingClassifier.fit refits clones of all three on the data it is given.
ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
#fit model to training data
ensemble.fit(X_tr, Y_tr)
#test our model on the test data
ensemble.score(X, Y)