In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
import itertools
from sklearn.svm import SVC

### LOADING DATASET 

In [6]:
names = ['Tweet', 'Label']
df = pd.read_csv('train.csv', sep=',', names=names, header=0)
#df_val = pd.read_csv('val.csv', sep=',', names=names, header=0)
#df=pd.concat((df_train, df_val))
df.dropna(how='any', inplace=True)
df.reset_index(drop=True, inplace=True)
df["Tweet"] = df['Tweet'].values.astype('U')
X = df['Tweet'].to_numpy()
y = df['Label'].to_numpy()
print(X.shape)

(6420,)


### VECTORIZING DATASET

In [7]:
MAX_FEATURES = 10000
tfidf = TfidfVectorizer(max_features = MAX_FEATURES)
tfidf.fit(X)
X_train = tfidf.transform(X)
X_train = X_train.todense()
X=X_train
print('X shape is', X.shape)

X shape is (6420, 10000)


### REMOVING OUTLIERS

In [None]:
X=np.array(X)
y=np.array(y)

In [None]:
clf_Iso = IsolationForest(random_state=np.random.RandomState(42),n_jobs = -1)
clf_Iso.fit(X)
y_Iso_Forest = clf_Iso.predict(X)
result = np.where(y_Iso_Forest == -1)
result = list(itertools.chain.from_iterable(result))

In [None]:
X_removed = np.delete(X,result,axis = 0)
if y is None:
    X=X_removed
else:
    y_removed = np.delete(y,result,axis = 0)
X=X_removed
y=y_removed

In [None]:
print(X.shape, y.shape)

### TRAINING SVM MODEL

In [8]:
kf = KFold(n_splits=3, random_state=1)
svm = SVC(C=0.25, kernel='linear')
acc_list = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svm.fit(X_train, y_train)
    print("----Start Evaluating----")
    acc = svm.score(X_test, y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))



----Start Evaluating----
Testing Accuracy: 0.9074766355140187
----Start Evaluating----
Testing Accuracy: 0.9186915887850468
----Start Evaluating----
Testing Accuracy: 0.9121495327102803
Mean testing accuracy: 0.9127725856697819


### VALIDATING MODEL

In [10]:
df_val=pd.read_csv('val.csv', names=names, sep=',', header=0)
df_val.dropna(how='any', inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_val["Tweet"] = df_val['Tweet'].values.astype('U')
X_val = df_val['Tweet'].to_numpy()
y_val = df_val['Label'].to_numpy()
X_val=tfidf.transform(X_val)
X_val=X_val.todense()
print(X_val.shape)
acc_val = svm.score(X_val, y_val)
print('Validation accuracy:', acc_val)

(2140, 10000)
Validation accuracy: 0.9065420560747663
