### This notebook tests the performance of classical ML models using TF-IDF features

In [1]:
#make the necessary imports
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
#Load the train, test and validation csv files of the tweaked LIAR dataset
#(one dataframe per file, read in that order)
df, testdf, validdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../datasets/liar_tweaked/trainvectordata.csv',
        '../../../datasets/liar_tweaked/testvectordata.csv',
        '../../../datasets/liar_tweaked/validvectordata.csv',
    )
)

In [3]:
#Separate every dataframe into its inputs ('statement' column) and its
#targets ('label' column).
#Two hold-out sets are used for testing accuracy, the test set and the valid
#set, both provided in the LIAR dataset.

#training set
x_train=df['statement']
y_train=df['label']

#test set
x_test=testdf['statement']
y_test=testdf['label']

#valid set
x_valid=validdf['statement']
y_valid=validdf['label']

In [4]:
#Create the TF-IDF vectors.
#English stop words are removed and terms appearing in more than 70% of the
#training statements are ignored (max_df=0.7).
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

#The vocabulary and IDF weights are learned from the training statements only...
tfidf_train=tfidf_vectorizer.fit_transform(x_train)

#...and the test / valid statements are projected onto that fitted vocabulary.
tfidf_test, tfidf_valid = (
    tfidf_vectorizer.transform(statements)
    for statements in (x_test, x_valid)
)

In [5]:
#check the shape
#NOTE(review): only the last expression of a cell is echoed, so the bare
#`tfidf_train` line displays nothing; the output shows tfidf_test alone.
tfidf_train
tfidf_test

<1267x11915 sparse matrix of type '<class 'numpy.float64'>'
	with 12177 stored elements in Compressed Sparse Row format>

In [6]:
#fit the classical ML models
LogicalRegressionclassifier = LogisticRegression(random_state = 0)
LogicalRegressionclassifier.fit(tfidf_train,y_train)
SVMclassifier = SVC()
SVMclassifier.fit(tfidf_train,y_train)
KNNclassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNNclassifier.fit(tfidf_train,y_train)
Randomforestclassifier = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
Randomforestclassifier.fit(tfidf_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [7]:
#get the predictions on both test and valid dataset using the models

#Every fitted model predicts the labels of the test vectors first and of the
#valid vectors second.
KNN_y_test_predict, KNN_y_valid_predict = (
    KNNclassifier.predict(vectors) for vectors in (tfidf_test, tfidf_valid)
)

SVM_y_test_predict, SVM_y_valid_predict = (
    SVMclassifier.predict(vectors) for vectors in (tfidf_test, tfidf_valid)
)

Randomforest_y_test_predict, Randomforest_y_valid_predict = (
    Randomforestclassifier.predict(vectors) for vectors in (tfidf_test, tfidf_valid)
)

LogicalRegression_y_test_predict, LogicalRegression_y_valid_predict = (
    LogicalRegressionclassifier.predict(vectors) for vectors in (tfidf_test, tfidf_valid)
)

In [9]:
#check the accuracy of the models

#One row per model: (row label, predictions on test set, predictions on valid set)
accuracy_rows = (
    ('KNN - ', KNN_y_test_predict, KNN_y_valid_predict),
    ('SVM - ', SVM_y_test_predict, SVM_y_valid_predict),
    ('Random Forest - ', Randomforest_y_test_predict, Randomforest_y_valid_predict),
    ('Logical Regression - ', LogicalRegression_y_test_predict, LogicalRegression_y_valid_predict),
)

print('algorithm - test dataset accuracy - valid dataset accuracy')
for row_label, test_predict, valid_predict in accuracy_rows:
    #accuracies are rounded to 4 decimals before being printed
    print(row_label,
          round(accuracy_score(y_test, test_predict),4),
          '- ',
          round(accuracy_score(y_valid, valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.5706 -  0.5561
SVM -  0.6172 -  0.6129
Random Forest -  0.6156 -  0.6129
Logical Regression -  0.6164 -  0.595
