In [1]:
# Load dependencies for this Jupyter Notebook
import pandas as pd
import time
import numpy as np
from functools import reduce
from lib.util import fetch_tweets, to_unix_tmsp, fetch_X

#Train and Test preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#Classifiers:
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

### Read the CSV file of thread-level features and separate the `is_rumor` label from the feature columns:

In [2]:
# Load the thread-level table through the project helper fetch_X.
fn = "data/threads/germanwings-crash.csv"
gw_thrds = fetch_X(fn)

# Keep the is_rumor column on its own and build a second frame without it.
gw_thrds_rumortags = gw_thrds["is_rumor"]
gw_thrds_without_rumor_tag = gw_thrds.drop(columns=["is_rumor"])

# List the columns that remain once is_rumor has been removed.
print(gw_thrds_without_rumor_tag.columns.values)


['Unnamed: 0' 'thread' 'user.verified' 'thread_length' 'hashtags_count'
 'user.default_pic' 'urls_count' 'favorite_count' 'has_smile_emoji'
 'retweet_count' 'user.has_bg_img' 'user.tweets_count'
 'src.followers_count' 'src.listed_count' 'src.user_verified' 'created'
 'src.created_at' 'src.tweets_total' 'first_resp' 'last_resp' 'resp_var'
 'time_to_first_resp' 'time_to_last_resp']


### Helper functions:

In [3]:
def convertTrueFalseTo01(X):
    """Return a copy of ``X`` with True replaced by 1.0 and False by 0.0.

    The replacement is driven by elementwise ``==`` comparison, so any
    element that compares equal to ``True`` (which includes the number 1)
    becomes 1.0 and any element that compares equal to ``False`` (which
    includes the number 0) becomes 0.0. All other elements are left as-is.

    Parameters
    ----------
    X : array-like supporting ``.copy()`` and boolean-mask assignment
        (e.g. a NumPy array).

    Returns
    -------
    A new object of the same kind as ``X``; the argument itself is not
    modified, so data shared with the caller (for example the array behind
    ``Series.values``) is never overwritten as a side effect.
    """
    # Work on a copy: the mask assignments below would otherwise write
    # straight into the caller's data.
    converted = X.copy()
    converted[converted == True] = 1.0   # noqa: E712 - elementwise comparison intended
    converted[converted == False] = 0.0  # noqa: E712
    return converted

def standardize_cols(X, mu=None, sigma=None):
    """Standardize each column of ``X`` to mean 0 and variance 1.

    Parameters
    ----------
    X : array of shape (n_rows, n_cols)
        Data to standardize.
    mu : array-like, optional
        Per-column offsets to subtract. Computed as the column means of
        ``X`` when omitted.
    sigma : array-like, optional
        Per-column scales to divide by. Computed as the column standard
        deviations of ``X`` when omitted.

    Returns
    -------
    (X_std, mu, sigma)
        The standardized data together with the offsets and scales that
        were actually applied, so they can be passed back in to transform
        other data the same way.

    Notes
    -----
    Scales below 1e-8 are replaced by 1 so that (near-)constant columns do
    not cause a division by zero. This guard is applied to a caller-supplied
    ``sigma`` as well; the caller's own array is never modified.
    """
    if mu is None:
        mu = np.mean(X, axis=0)

    if sigma is None:
        sigma = np.std(X, axis=0)
        # Freshly computed here, so masking in place cannot affect the caller.
        sigma[sigma < 1e-8] = 1.
    elif np.any(np.asarray(sigma) < 1e-8):
        # Build a guarded copy instead of writing into the caller's sigma.
        sigma = np.where(np.asarray(sigma) < 1e-8, 1., sigma)

    return (X - mu) / sigma, mu, sigma


### Data Preprocessing:

In [4]:
# Take the feature frame as a NumPy array, skip its first column (the first
# name in the column listing printed earlier is 'Unnamed: 0') and turn
# True/False entries into 1.0/0.0.
gw_thrds_values = convertTrueFalseTo01(gw_thrds_without_rumor_tag.values[:, 1:])

# Same True/False -> 1.0/0.0 conversion for the is_rumor values.
gw_thrds_rumortags_values = convertTrueFalseTo01(gw_thrds_rumortags.values)

# NOTE(review): the standardization statistics are computed over every row,
# i.e. before the train/test split performed in the next cell - confirm this
# is intended.
gw_thrds_values, _, _ = standardize_cols(gw_thrds_values.astype(float))

# n rows and d columns of the final feature matrix.
n, d = gw_thrds_values.shape
print((n, d))

(405, 22)


## Running some classifiers
### Splitting the data into training and test sets:

In [5]:
# Hold out 25% of the rows as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    gw_thrds_values,
    gw_thrds_rumortags_values,
    test_size=0.25,
    random_state=45,
)

# Encode the labels as integers, fitting the encoder on the training labels only.
le = preprocessing.LabelEncoder()
y_train = le.fit(y_train).transform(y_train)
y_test = le.transform(y_test)

print(X_train.shape, X_test.shape, y_train.shape)

# Fraction of each encoded label in the two splits.
print('y_train bincount:', np.bincount(y_train) / len(y_train))
print('y_test bincount:', np.bincount(y_test) / len(y_test))

(303, 22) (102, 22) (303,)
y_train bincount: [0.49834983 0.50165017]
y_test bincount: [0.50980392 0.49019608]


In [6]:
def test_model(model):
    """Fit ``model`` on the training split and print its train/test accuracy.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. ``model`` must provide ``fit(X, y)`` and ``predict(X)``.

    The printed values are the fraction of predictions that equal the true
    label (higher is better), so they are labelled as accuracy rather than
    error. Nothing is returned.
    """
    model.fit(X_train, y_train)
    y_test_hat = model.predict(X_test)
    y_train_hat = model.predict(X_train)
    # mean(prediction == truth) is the share of correct predictions.
    print('train accuracy:', np.mean(y_train_hat == y_train))
    print('test accuracy:', np.mean(y_test_hat == y_test))

### SVM.SVC:

In [7]:
# Support vector classifier with a linear kernel, scored via test_model.
model = svm.SVC(kernel='linear', gamma='scale')
test_model(model)

train error: 0.6798679867986799
test error: 0.5980392156862745


In [8]:
# Support vector classifier with an RBF kernel, scored via test_model.
model = svm.SVC(kernel='rbf', gamma='scale')
test_model(model)

train error: 0.7227722772277227
test error: 0.5882352941176471


In [9]:
# Support vector classifier with a sigmoid kernel, scored via test_model.
model = svm.SVC(kernel='sigmoid', gamma='scale')
test_model(model)

train error: 0.5742574257425742
test error: 0.6372549019607843


In [10]:
# Support vector classifier with a polynomial kernel, scored via test_model.
model = svm.SVC(kernel='poly', gamma='scale')
test_model(model)

train error: 0.6567656765676567
test error: 0.5588235294117647


In [11]:
# k-nearest-neighbours classifier using 5 neighbours, scored via test_model.
model = KNeighborsClassifier(
    n_neighbors=5,
)
test_model(model)

train error: 0.7458745874587459
test error: 0.5980392156862745


In [12]:
# Decision tree with a fixed seed, scored via test_model.
model = DecisionTreeClassifier(
    random_state=0,
)
test_model(model)

train error: 1.0
test error: 0.5980392156862745


In [13]:
# Random forest of 100 trees limited to depth 3, with a fixed seed,
# scored via test_model.
model = RandomForestClassifier(
    random_state=4,
    max_depth=3,
    n_estimators=100,
)
test_model(model)

train error: 0.8283828382838284
test error: 0.6568627450980392


In [14]:
# AdaBoost with 100 boosting rounds. The seed is pinned, as in the decision
# tree and random forest cells, so that re-running the notebook gives the
# same scores.
model = AdaBoostClassifier(n_estimators=100, random_state=0)
test_model(model)

train error: 0.9834983498349835
test error: 0.6764705882352942


In [15]:
# Gaussian process classifier with a 1.0 * RBF(1.0) kernel, scored via test_model.
model = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
test_model(model)

train error: 0.7557755775577558
test error: 0.5
