### This module runs the classical models on the linguistic features of the dataset

In [1]:
#make necessary imports
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
#Read the data
df=pd.read_csv('../../datasets/liar_tweaked/trainfeaturedata.csv')
testdf=pd.read_csv('../../datasets/liar_tweaked/testfeaturedata.csv')
validdf=pd.read_csv('../../datasets/liar_tweaked/validfeaturedata.csv')

In [3]:
#see the data
df

Unnamed: 0.1,Unnamed: 0,statement,label,SentimentScore,CountPunc,Readability,CountWord
0,0,Says the Annies List political group supports ...,0,0.2500,2,14.9,11
1,1,When did the decline of coal start? It started...,1,0.3612,5,7.7,24
2,2,"Hillary Clinton agrees with John McCain ""by vo...",1,0.3182,3,9.6,19
3,3,Health care reform legislation is likely to ma...,0,0.7579,1,10.9,12
4,4,The economic turnaround started at the end of ...,1,0.0000,1,4.8,10
...,...,...,...,...,...,...,...
10224,10235,There are a larger number of shark attacks in ...,1,-0.7506,1,7.6,17
10225,10236,Democrats have now become the party of the [At...,1,0.4019,3,7.4,14
10226,10237,Says an alternative to Social Security that op...,1,0.5859,3,17.6,28
10227,10238,On lifting the U.S. Cuban embargo and allowing...,0,0.0000,3,3.6,11


In [4]:
#Feature Scaling
sc=StandardScaler()
df.loc[:,'SentimentScore':'CountWord']=sc.fit_transform(df.loc[:,'SentimentScore':'CountWord'])
testdf.loc[:,'SentimentScore':'CountWord']=sc.fit_transform(testdf.loc[:,'SentimentScore':'CountWord'])
validdf.loc[:,'SentimentScore':'CountWord']=sc.fit_transform(validdf.loc[:,'SentimentScore':'CountWord'])

In [5]:
df

Unnamed: 0.1,Unnamed: 0,statement,label,SentimentScore,CountPunc,Readability,CountWord
0,0,Says the Annies List political group supports ...,0,0.679383,-0.312743,0.947578,-0.726853
1,1,When did the decline of coal start? It started...,1,0.965594,0.894749,-0.658919,0.619501
2,2,"Hillary Clinton agrees with John McCain ""by vo...",1,0.854919,0.089754,-0.234982,0.101672
3,3,Health care reform legislation is likely to ma...,0,1.986638,-0.715241,0.055080,-0.623287
4,4,The economic turnaround started at the end of ...,1,0.035922,-0.715241,-1.305980,-0.830419
...,...,...,...,...,...,...,...
10224,10235,There are a larger number of shark attacks in ...,1,-1.896005,-0.715241,-0.681231,-0.105459
10225,10236,Democrats have now become the party of the [At...,1,1.070349,0.089754,-0.725856,-0.416156
10226,10237,Says an alternative to Social Security that op...,1,1.543937,0.089754,1.550014,1.033763
10227,10238,On lifting the U.S. Cuban embargo and allowing...,0,0.035922,0.089754,-1.573729,-0.726853


In [6]:
#initialize ML models
LogicalRegressionclassifier = LogisticRegression(random_state = 0)
SVMclassifier = SVC()
KNNclassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
Randomforestclassifier = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)

In [7]:
#All Features
LogicalRegressionclassifier.fit(df.loc[:,'SentimentScore':'CountWord'], df['label'])
SVMclassifier.fit(df.loc[:,'SentimentScore':'CountWord'], df['label'])
KNNclassifier.fit(df.loc[:,'SentimentScore':'CountWord'], df['label'])
Randomforestclassifier.fit(df.loc[:,'SentimentScore':'CountWord'], df['label'])

LogicalRegression_y_test_predict = LogicalRegressionclassifier.predict(testdf.loc[:,'SentimentScore':'CountWord'])

LogicalRegression_y_valid_predict = LogicalRegressionclassifier.predict(validdf.loc[:,'SentimentScore':'CountWord'])

SVM_y_test_predict = SVMclassifier.predict(testdf.loc[:,'SentimentScore':'CountWord'])

SVM_y_valid_predict = SVMclassifier.predict(validdf.loc[:,'SentimentScore':'CountWord'])

KNN_y_test_predict = KNNclassifier.predict(testdf.loc[:,'SentimentScore':'CountWord'])

KNN_y_valid_predict = KNNclassifier.predict(validdf.loc[:,'SentimentScore':'CountWord'])

Randomforest_y_test_predict = Randomforestclassifier.predict(testdf.loc[:,'SentimentScore':'CountWord'])

Randomforest_y_valid_predict = Randomforestclassifier.predict(validdf.loc[:,'SentimentScore':'CountWord'])

print('algorithm - test dataset accuracy - valid dataset accuracy')
print('KNN - ' ,round(accuracy_score(testdf['label'], KNN_y_test_predict),4),'- '
      ,round(accuracy_score(validdf['label'], KNN_y_valid_predict),4))
print('SVM - ' ,round(accuracy_score(testdf['label'], SVM_y_test_predict),4), '- '
     ,round(accuracy_score(validdf['label'], SVM_y_valid_predict),4))
print('Random Forest - ', round(accuracy_score(testdf['label'], Randomforest_y_test_predict),4), '- '
     ,round(accuracy_score(validdf['label'], Randomforest_y_valid_predict),4))
print('Logical Regression - ', round(accuracy_score(testdf['label'], LogicalRegression_y_test_predict),4), '- '
     ,round(accuracy_score(validdf['label'], LogicalRegression_y_valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.5706 -  0.521
SVM -  0.5643 -  0.5319
Random Forest -  0.5304 -  0.5382
Logical Regression -  0.5635 -  0.5327


In [8]:
#Only Sentiment
LogicalRegressionclassifier.fit(df['SentimentScore'].values.reshape(-1,1), df['label'])
SVMclassifier.fit(df['SentimentScore'].values.reshape(-1,1), df['label'])
KNNclassifier.fit(df['SentimentScore'].values.reshape(-1,1), df['label'])
Randomforestclassifier.fit(df['SentimentScore'].values.reshape(-1,1), df['label'])

LogicalRegression_y_test_predict = LogicalRegressionclassifier.predict(testdf['SentimentScore'].values.reshape(-1,1))

LogicalRegression_y_valid_predict = LogicalRegressionclassifier.predict(validdf['SentimentScore'].values.reshape(-1,1))

SVM_y_test_predict = SVMclassifier.predict(testdf['SentimentScore'].values.reshape(-1,1))

SVM_y_valid_predict = SVMclassifier.predict(validdf['SentimentScore'].values.reshape(-1,1))

KNN_y_test_predict = KNNclassifier.predict(testdf['SentimentScore'].values.reshape(-1,1))

KNN_y_valid_predict = KNNclassifier.predict(validdf['SentimentScore'].values.reshape(-1,1))

Randomforest_y_test_predict = Randomforestclassifier.predict(testdf['SentimentScore'].values.reshape(-1,1))

Randomforest_y_valid_predict = Randomforestclassifier.predict(validdf['SentimentScore'].values.reshape(-1,1))

print('algorithm - test dataset accuracy - valid dataset accuracy')
print('KNN - ' ,round(accuracy_score(testdf['label'], KNN_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], KNN_y_valid_predict),4))
print('SVM - ' ,round(accuracy_score(testdf['label'], SVM_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], SVM_y_valid_predict),4))
print('Random Forest - ', round(accuracy_score(testdf['label'], Randomforest_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], Randomforest_y_valid_predict),4))
print('Logical Regression - ', round(accuracy_score(testdf['label'], LogicalRegression_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], LogicalRegression_y_valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.532 -  0.5202
SVM -  0.5635 -  0.5202
Random Forest -  0.5485 -  0.5062
Logical Regression -  0.5635 -  0.5202


In [9]:
#Only CountPunc
LogicalRegressionclassifier.fit(df['CountPunc'].values.reshape(-1,1), df['label'])
SVMclassifier.fit(df['CountPunc'].values.reshape(-1,1), df['label'])
KNNclassifier.fit(df['CountPunc'].values.reshape(-1,1), df['label'])
Randomforestclassifier.fit(df['CountPunc'].values.reshape(-1,1), df['label'])

LogicalRegression_y_test_predict = LogicalRegressionclassifier.predict(testdf['CountPunc'].values.reshape(-1,1))

LogicalRegression_y_valid_predict = LogicalRegressionclassifier.predict(validdf['CountPunc'].values.reshape(-1,1))

SVM_y_test_predict = SVMclassifier.predict(testdf['CountPunc'].values.reshape(-1,1))

SVM_y_valid_predict = SVMclassifier.predict(validdf['CountPunc'].values.reshape(-1,1))

KNN_y_test_predict = KNNclassifier.predict(testdf['CountPunc'].values.reshape(-1,1))

KNN_y_valid_predict = KNNclassifier.predict(validdf['CountPunc'].values.reshape(-1,1))

Randomforest_y_test_predict = Randomforestclassifier.predict(testdf['CountPunc'].values.reshape(-1,1))

Randomforest_y_valid_predict = Randomforestclassifier.predict(validdf['CountPunc'].values.reshape(-1,1))

print('algorithm - test dataset accuracy - valid dataset accuracy')
print('KNN - ' ,round(accuracy_score(testdf['label'], KNN_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], KNN_y_valid_predict),4))
print('SVM - ' ,round(accuracy_score(testdf['label'], SVM_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], SVM_y_valid_predict),4))
print('Random Forest - ', round(accuracy_score(testdf['label'], Randomforest_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], Randomforest_y_valid_predict),4))
print('Logical Regression - ', round(accuracy_score(testdf['label'], LogicalRegression_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], LogicalRegression_y_valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.4459 -  0.4751
SVM -  0.5627 -  0.5148
Random Forest -  0.562 -  0.5101
Logical Regression -  0.5635 -  0.5202


In [10]:
#Only Readability
LogicalRegressionclassifier.fit(df['Readability'].values.reshape(-1,1), df['label'])
SVMclassifier.fit(df['Readability'].values.reshape(-1,1), df['label'])
KNNclassifier.fit(df['Readability'].values.reshape(-1,1), df['label'])
Randomforestclassifier.fit(df['Readability'].values.reshape(-1,1), df['label'])

LogicalRegression_y_test_predict = LogicalRegressionclassifier.predict(testdf['Readability'].values.reshape(-1,1))

LogicalRegression_y_valid_predict = LogicalRegressionclassifier.predict(validdf['Readability'].values.reshape(-1,1))

SVM_y_test_predict = SVMclassifier.predict(testdf['Readability'].values.reshape(-1,1))

SVM_y_valid_predict = SVMclassifier.predict(validdf['Readability'].values.reshape(-1,1))

KNN_y_test_predict = KNNclassifier.predict(testdf['Readability'].values.reshape(-1,1))

KNN_y_valid_predict = KNNclassifier.predict(validdf['Readability'].values.reshape(-1,1))

Randomforest_y_test_predict = Randomforestclassifier.predict(testdf['Readability'].values.reshape(-1,1))

Randomforest_y_valid_predict = Randomforestclassifier.predict(validdf['Readability'].values.reshape(-1,1))

print('algorithm - test dataset accuracy - valid dataset accuracy')
print('KNN - ' ,round(accuracy_score(testdf['label'], KNN_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], KNN_y_valid_predict),4))
print('SVM - ' ,round(accuracy_score(testdf['label'], SVM_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], SVM_y_valid_predict),4))
print('Random Forest - ', round(accuracy_score(testdf['label'], Randomforest_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], Randomforest_y_valid_predict),4))
print('Logical Regression - ', round(accuracy_score(testdf['label'], LogicalRegression_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], LogicalRegression_y_valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.5178 -  0.5039
SVM -  0.5635 -  0.5202
Random Forest -  0.5541 -  0.514
Logical Regression -  0.5635 -  0.5202


In [11]:
#Only CountWord
LogicalRegressionclassifier.fit(df['CountWord'].values.reshape(-1,1), df['label'])
SVMclassifier.fit(df['CountWord'].values.reshape(-1,1), df['label'])
KNNclassifier.fit(df['CountWord'].values.reshape(-1,1), df['label'])
Randomforestclassifier.fit(df['CountWord'].values.reshape(-1,1), df['label'])

LogicalRegression_y_test_predict = LogicalRegressionclassifier.predict(testdf['CountWord'].values.reshape(-1,1))

LogicalRegression_y_valid_predict = LogicalRegressionclassifier.predict(validdf['CountWord'].values.reshape(-1,1))

SVM_y_test_predict = SVMclassifier.predict(testdf['CountWord'].values.reshape(-1,1))

SVM_y_valid_predict = SVMclassifier.predict(validdf['CountWord'].values.reshape(-1,1))

KNN_y_test_predict = KNNclassifier.predict(testdf['CountWord'].values.reshape(-1,1))

KNN_y_valid_predict = KNNclassifier.predict(validdf['CountWord'].values.reshape(-1,1))

Randomforest_y_test_predict = Randomforestclassifier.predict(testdf['CountWord'].values.reshape(-1,1))

Randomforest_y_valid_predict = Randomforestclassifier.predict(validdf['CountWord'].values.reshape(-1,1))

print('algorithm - test dataset accuracy - valid dataset accuracy')
print('KNN - ' ,round(accuracy_score(testdf['label'], KNN_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], KNN_y_valid_predict),4))
print('SVM - ' ,round(accuracy_score(testdf['label'], SVM_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], SVM_y_valid_predict),4))
print('Random Forest - ', round(accuracy_score(testdf['label'], Randomforest_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], Randomforest_y_valid_predict),4))
print('Logical Regression - ', round(accuracy_score(testdf['label'], LogicalRegression_y_test_predict),4), '- '
      ,round(accuracy_score(validdf['label'], LogicalRegression_y_valid_predict),4))

algorithm - test dataset accuracy - valid dataset accuracy
KNN -  0.4988 -  0.5288
SVM -  0.5635 -  0.5389
Random Forest -  0.5635 -  0.5397
Logical Regression -  0.5635 -  0.5218
