# **IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import itertools
from sklearn.svm import SVC

# **LOADING DATASET**

In [None]:
# Mount Google Drive inside the Colab runtime; the training CSV is read from
# a path under this mount point ('/content/drive/MyDrive/...').
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Column labels imposed on the CSV; header=0 discards the file's own header row.
names = ['index', 'review', 'sentiment', 'label']

# Read the training file, drop every row that has any missing value, and
# renumber the remaining rows from zero.
# (A separate 'val.csv' was concatenated here at some point; that step is disabled.)
df = (
    pd.read_csv('/content/drive/MyDrive/imdb/train_data.csv', sep=',', names=names, header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)

# Coerce every review to a unicode string before vectorisation.
df["review"] = df['review'].values.astype('U')

# Feature texts and their labels as NumPy arrays.
X = df['review'].to_numpy()
Y = df['label'].to_numpy()
print(X.shape)

(10000,)


In [None]:
# Hold out 25% of the samples for validation; the fixed seed makes the split repeatable.
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# **VECTORIZING DATASET**

In [None]:
# Upper bound on the TF-IDF vocabulary size (number of feature columns).
MAX_FEATURES = 10000
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)

# Fit on the training texts only, so the validation texts do not influence
# the vocabulary or the IDF weights.
tfidf.fit(X_tr)

# Use .toarray() rather than .todense(): .todense() returns an np.matrix,
# and X_val was never converted to an ndarray afterwards, so it reached the
# classifier as np.matrix (deprecated as estimator input in scikit-learn and
# rejected by recent releases). .toarray() yields a plain 2-D ndarray.
X_train = tfidf.transform(X_tr).toarray()
X_tr = X_train

# Transform the validation texts with the vocabulary learned on the training split.
X_val = tfidf.transform(X_val).toarray()


X shape is (10000,)


# **REMOVING OUTLIERS**

In [None]:
# Materialise the training features and labels as plain ndarrays
# (np.array makes a copy of each input).
X_tr, Y_tr = np.array(X_tr), np.array(Y_tr)

In [None]:
# Isolation forest seeded with a fixed RandomState; n_jobs=-1 requests parallel execution.
clf_Iso = IsolationForest(n_jobs=-1, random_state=np.random.RandomState(0))

# Fit on the training features, then label those same rows.
Y_Iso_Forest = clf_Iso.fit(X_tr).predict(X_tr)

# Positions of the rows labelled -1 (the ones removed in the next step),
# collected into a flat Python list.
result = list(np.flatnonzero(Y_Iso_Forest == -1))

In [None]:
# Drop the rows flagged in `result` from the training features and, when
# labels are present, from the labels as well, keeping the two aligned.
#
# The previous version only bound Y_removed in the `else` branch, so the
# unconditional `Y_tr = Y_removed` raised NameError whenever Y_tr was None
# (and that branch wrote to the unrelated X_train). Y_removed is now always
# bound: None when there are no labels, the filtered labels otherwise.
#
# NOTE: this cell is not idempotent; re-running it without re-running the
# detection cell deletes the same positions from the already-filtered arrays.
X_removed = np.delete(X_tr, result, axis=0)
Y_removed = None if Y_tr is None else np.delete(Y_tr, result, axis=0)

X_tr = X_removed
Y_tr = Y_removed

In [None]:
# Sanity check: shapes of the training features and labels after the outlier-removal step.
print(X_tr.shape, Y_tr.shape)

(7500, 10000) (7500,)


# **TRAINING SVM MODEL**

In [None]:
# 10-fold cross-validation of an RBF-kernel SVM on the training split.
# KFold is used without shuffling, so folds are contiguous slices of X_tr.
kf = KFold(n_splits=10)
svm = SVC(C=1, kernel='rbf')
acc_list = []
for train_index, test_index in kf.split(X_tr):
    X_train, X_test = X_tr[train_index], X_tr[test_index]
    Y_train, Y_test = Y_tr[train_index], Y_tr[test_index]
    # fit() re-trains the estimator from scratch on this fold's training part.
    svm.fit(X_train, Y_train)
    print("----Start Evaluating----")
    acc = svm.score(X_test, Y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

# Refit on the complete training split. Without this, `svm` would keep the
# parameters learned on the last fold only (9/10 of the training rows), and
# the validation cell would score that arbitrary fold model instead of one
# trained on all available training data.
svm.fit(X_tr, Y_tr)

----Start Evaluating----
Testing Accuracy: 0.8853333333333333
----Start Evaluating----
Testing Accuracy: 0.884
----Start Evaluating----
Testing Accuracy: 0.884
----Start Evaluating----
Testing Accuracy: 0.844
----Start Evaluating----
Testing Accuracy: 0.8706666666666667
----Start Evaluating----
Testing Accuracy: 0.884
----Start Evaluating----
Testing Accuracy: 0.8906666666666667
----Start Evaluating----
Testing Accuracy: 0.848
----Start Evaluating----
Testing Accuracy: 0.8653333333333333
----Start Evaluating----
Testing Accuracy: 0.8533333333333334
Mean testing accuracy: 0.8709333333333333


# **VALIDATING MODEL**

In [None]:
# Score the trained SVM on the held-out validation split (X_val was
# vectorised with the TF-IDF vocabulary fitted on the training texts).
acc_val = svm.score(X_val, Y_val)
print('Validation accuracy:', acc_val)

(2500, 10000)




Validation accuracy: 0.8568
