In [1]:
# Load dependencies for this Jupyter Notebook
import pandas as pd
import time
import numpy as np
from functools import reduce
from lib.util import fetch_tweets, to_unix_tmsp, fetch_X

#Train and Test preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#Classifiers:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

### Read the CSV file of thread-level features and separate the `is_rumor` tag from the data:

In [2]:
# Load the thread-level feature table for the Germanwings crash event.
fn = "data/threads/germanwings-crash.csv"
gw_thrds = fetch_X(fn)

# Keep the label column on its own and remove it from the feature table.
gw_thrds_rumortags = gw_thrds["is_rumor"]
gw_thrds_without_rumor_tag = gw_thrds.drop("is_rumor", axis=1)

# Show which feature columns remain once the label has been removed.
print(gw_thrds_without_rumor_tag.columns.values)


['thread' 'has_coords_mean' 'has_coords_sum' 'has_coords_var'
 'Pronoun_mean' 'Pronoun_sum' 'Pronoun_var' 'thread_length'
 'hashtags_count_mean' 'hashtags_count_sum' 'hashtags_count_var'
 'hashtags_count_shared' 'positivewordcount_mean' 'positivewordcount_sum'
 'positivewordcount_var' 'user.default_pic_mean' 'user.default_pic_sum'
 'user.default_pic_var' 'favorite_count_mean' 'favorite_count_sum'
 'favorite_count_var' 'has_quest_mean' 'has_quest_sum' 'has_quest_var'
 'hasperiod_mean' 'hasperiod_sum' 'hasperiod_var'
 'has_quest_or_exclaim_mean' 'has_quest_or_exclaim_sum'
 'has_quest_or_exclaim_var' 'user.profile_sbcolor_mean'
 'user.profile_sbcolor_sum' 'user.profile_sbcolor_var'
 'has_smile_emoji_mean' 'has_smile_emoji_sum' 'has_smile_emoji_var'
 'negativewordcount_mean' 'negativewordcount_sum' 'negativewordcount_var'
 'user_mentions_mean' 'user_mentions_sum' 'user_mentions_var'
 'sentimentscore_mean' 'sentimentscore_sum' 'sentimentscore_var'
 'sensitive_mean' 'sensitive_sum' 'sensitiv

### Helper functions:

In [3]:
def convertTrueFalseTo01(X):
    """Return a copy of X in which True becomes 1.0 and False becomes 0.0.

    The input is left untouched: the replacement is done on ``X.copy()`` so
    that arrays obtained from a DataFrame/Series (``.values``) are not
    silently modified behind the caller's back.

    Parameters
    ----------
    X : array-like supporting ``.copy()``, elementwise ``==`` and boolean-mask
        assignment (e.g. a numpy array).

    Returns
    -------
    Same type as ``X``: the converted copy. All other entries are unchanged.
    """
    converted = X.copy()
    # Elementwise comparison on purpose (`is True` would not broadcast).
    # Note that the integers 1 and 0 compare equal to True/False in Python,
    # so they are rewritten to 1.0 / 0.0 as well.
    converted[converted == True] = 1.0
    converted[converted == False] = 0.0
    return converted

def standardize_cols(X, mu=None, sigma=None):
    """Shift and scale every column of a 2-D array.

    Parameters
    ----------
    X : 2-D array whose columns are standardized.
    mu : per-column offsets to subtract; computed as the column means of X
        when omitted.
    sigma : per-column scales to divide by; computed as the column standard
        deviations of X when omitted. Only in that computed case, entries
        below 1e-8 are replaced by 1 so near-constant columns do not cause a
        division by (almost) zero.

    Returns
    -------
    (standardized, mu, sigma) : the transformed data together with the offsets
        and scales that were applied, so they can be reused on other data.
    """
    # Unpacking the shape doubles as a check that X is two-dimensional.
    _n_rows, _n_cols = X.shape

    mu = np.mean(X, axis=0) if mu is None else mu

    if sigma is None:
        sigma = np.std(X, axis=0)
        sigma[sigma < 1e-8] = 1.

    standardized = (X - mu) / sigma
    return standardized, mu, sigma


### Data Preprocessing:

In [4]:
# Column 0 is the 'thread' column (see the column listing printed earlier);
# only the remaining columns are used as features.
gw_thrds_values = convertTrueFalseTo01(gw_thrds_without_rumor_tag.values[:, 1:])

# Labels get the same True/False -> 1.0/0.0 treatment.
gw_thrds_rumortags_values = convertTrueFalseTo01(gw_thrds_rumortags.values)

# NOTE(review): the columns are standardized over ALL rows here, before the
# train/test split further down, so the scaling uses statistics from rows that
# later end up in the test set -- confirm this is intended.
gw_thrds_values, _, _ = standardize_cols(gw_thrds_values.astype(float))

# n = number of rows, d = number of feature columns after preprocessing.
n, d = gw_thrds_values.shape
print(gw_thrds_values.shape)

(405, 114)


## Running some classifiers
### Train and Test data separation:

In [5]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    gw_thrds_values,
    gw_thrds_rumortags_values,
    test_size=0.25,
    random_state=45
)

# Encode the labels as integers, learning the mapping from the training labels only.
le = preprocessing.LabelEncoder()
y_train = le.fit(y_train).transform(y_train)
y_test = le.transform(y_test)
print(X_train.shape, X_test.shape, y_train.shape)

# Relative class frequencies in each split.
train_counts = np.bincount(y_train)
test_counts = np.bincount(y_test)
print('y_train bincount:', train_counts / np.sum(train_counts))
print('y_test bincount:', test_counts / np.sum(test_counts))

(303, 114) (102, 114) (303,)
y_train bincount: [0.49834983 0.50165017]
y_test bincount: [0.50980392 0.49019608]


In [6]:
def test_models(models):
    for model_name in models:
        model=models[model_name]
        model.fit(X_train,y_train)
        y_test_hat=model.predict(X_test)
        print('%s train accuracy:' % model_name, np.mean(model.predict(X_train)==y_train))
        print('%s test accuracy:' % model_name, np.mean(y_test_hat==y_test))
        print()

### Classifiers to compare (SVM.SVC kernels, KNN, tree-based models, Gaussian process):

In [7]:
# Candidate classifiers, keyed by the label that test_models prints for each.
# The tree-based estimators are given a fixed random_state so that a
# Restart & Run All reproduces the same fitted models and accuracies.
models = {
    'linear_SVM': svm.SVC(gamma='scale', kernel='linear'),
    'SVM_with_RBF_kernel': svm.SVC(gamma='scale', kernel='rbf'),
    'SVM_with_sigmoid_kernel': svm.SVC(gamma='scale', kernel='sigmoid'),
    'KNN_with_k=5': KNeighborsClassifier(n_neighbors=5),
    'Decision_Tree_Classifier': DecisionTreeClassifier(random_state=0),
    'Random_Forest_Classifier_n=100_maxDepth=3': RandomForestClassifier(
        n_estimators=100, max_depth=3, random_state=4
    ),
    # Previously unseeded, unlike the other tree-based models above.
    'AdaBoost_n=100': AdaBoostClassifier(n_estimators=100, random_state=0),
    'Gaussian_Process_Classifier': GaussianProcessClassifier(1.0 * RBF(1.0)),
}

In [8]:
# Fit every classifier in `models` and print its train and test accuracy.
test_models(models)

Random_Forest_Classifier_n=100_maxDepth=3 train accuracy: 0.8679867986798679
Random_Forest_Classifier_n=100_maxDepth=3 test accuracy: 0.6764705882352942

AdaBoost_n=100 train accuracy: 1.0
AdaBoost_n=100 test accuracy: 0.6862745098039216

SVM_with_RBF_kernel train accuracy: 0.8943894389438944
SVM_with_RBF_kernel test accuracy: 0.6176470588235294

Decision_Tree_Classifier train accuracy: 1.0
Decision_Tree_Classifier test accuracy: 0.6568627450980392

KNN_with_k=5 train accuracy: 0.7491749174917491
KNN_with_k=5 test accuracy: 0.5196078431372549

SVM_with_sigmoid_kernel train accuracy: 0.6270627062706271
SVM_with_sigmoid_kernel test accuracy: 0.5784313725490197

Gaussian_Process_Classifier train accuracy: 1.0
Gaussian_Process_Classifier test accuracy: 0.6176470588235294

linear_SVM train accuracy: 0.8976897689768977
linear_SVM test accuracy: 0.6176470588235294

