## Import Libraries and dataset



In [2]:
import pandas as pd
import numpy as np
import pipeline as p

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
df = pd.read_csv("../data_consolidation/consolidated_version2.csv")
#df = pd.read_csv("C:/Users/Maca/Documents/project_ml/Project-Machine-Learning-CAPP/data_consolidation/consolidated_version2.csv")

In [3]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,bot,description,probe_timestamp,created_at,lang,protected,verified,geo_enabled,default_profile,followers_count,friends_count,listed_count,favourites_count,statuses_count,source
0,0,3039154799,human,••TEEN WOLF//SKAM//SHAMELESS••Il mio livello d...,Thu May 16 13:57:12 +0000 2019,Sun Feb 15 14:56:36 +0000 2015,it,False,0.0,0.0,1.0,163,407,0,4193,5761,cresci-rtbust-2019
1,1,390617262,bot,,Tue Apr 16 13:51:17 +0000 2019,Fri Oct 14 08:00:55 +0000 2011,it,False,0.0,0.0,1.0,289,401,1,213,3210,cresci-rtbust-2019


# Data Wrangling

In [4]:
df.columns

Index(['Unnamed: 0', 'id', 'bot', 'description', 'probe_timestamp',
       'created_at', 'lang', 'protected', 'verified', 'geo_enabled',
       'default_profile', 'followers_count', 'friends_count', 'listed_count',
       'favourites_count', 'statuses_count', 'source'],
      dtype='object')

In [5]:
p.describe(df)

Unnamed: 0.1,Unnamed: 0,id,verified,geo_enabled,default_profile,followers_count,friends_count,listed_count,favourites_count,statuses_count
count,63264.0,63264.0,55721.0,56860.0,56166.0,63264.0,63264.0,63264.0,63264.0,63264.0
mean,20393.214356,7.179809e+17,0.057267,0.155575,0.816704,51401.14,1220.452,160.520407,3030.690709,6684.949
std,16265.554666,4.80538e+17,0.232355,0.362455,0.386912,1389009.0,19959.91,3696.298209,15509.481248,39389.06
min,0.0,586.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3273.0,2369773000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,11.0
50%,18905.5,1.050035e+18,0.0,0.0,1.0,2.0,36.0,0.0,0.0,45.0
75%,34721.25,1.056234e+18,0.0,0.0,1.0,100.0,267.25,1.0,125.0,318.25
max,50537.0,1.079456e+18,1.0,1.0,1.0,106938000.0,2141379.0,606500.0,886115.0,2766520.0


In [6]:
#Found the NaN values for each column
for i in df.columns:
    print("Found {} NaN {} records.".format(df[i].isna().sum(), i))

Found 0 NaN Unnamed: 0 records.
Found 0 NaN id records.
Found 7543 NaN bot records.
Found 32995 NaN description records.
Found 0 NaN probe_timestamp records.
Found 0 NaN created_at records.
Found 2987 NaN lang records.
Found 11086 NaN protected records.
Found 7543 NaN verified records.
Found 6404 NaN geo_enabled records.
Found 7098 NaN default_profile records.
Found 0 NaN followers_count records.
Found 0 NaN friends_count records.
Found 0 NaN listed_count records.
Found 0 NaN favourites_count records.
Found 0 NaN statuses_count records.
Found 0 NaN source records.


In [7]:
#Target value is column 'bot' I dropped all the NaN bot column
df = df.dropna(subset=['bot'])
for i in df.columns:
    print("Found {} NaN {} records.".format(df[i].isna().sum(), i))

Found 0 NaN Unnamed: 0 records.
Found 0 NaN id records.
Found 0 NaN bot records.
Found 28703 NaN description records.
Found 0 NaN probe_timestamp records.
Found 0 NaN created_at records.
Found 1987 NaN lang records.
Found 3543 NaN protected records.
Found 0 NaN verified records.
Found 0 NaN geo_enabled records.
Found 0 NaN default_profile records.
Found 0 NaN followers_count records.
Found 0 NaN friends_count records.
Found 0 NaN listed_count records.
Found 0 NaN favourites_count records.
Found 0 NaN statuses_count records.
Found 0 NaN source records.


In [8]:
#Fill NaN description, if no descrption is 0 otherwise 1
df['description'] = df.description.fillna(0)
df['has_description'] = df.loc[:, 'description'].apply(lambda x: 0 if x == 0 else 1)
df = df.drop(['description'], axis=1)

In [9]:
#if no lenguage is English then attribute is 0, 1 otherwise
df['len_en'] = df.loc[:, 'lang'].apply(lambda x: 1 if x == 'en' else 0)
df = df.drop(['lang'], axis=1)

In [10]:
#Transform bot column into dummy
df['bot'] = df.loc[:, 'bot'].apply(lambda x: 0 if x == 'human' else 1)

# Modeling
## 1. Decision Tree

In [11]:
#Model only with to features that we need
df_to_model = df[['bot', 'verified', 'geo_enabled', 'default_profile', 'has_description', 'len_en', 'followers_count', 
                  'friends_count', 'listed_count', 'favourites_count', 'statuses_count']]

In [12]:
#Split
df_train, df_test = p.split(df_to_model)

Total data before split: 55721
Train size data: 44576
Test size data: 11145


In [13]:
df_train, df_test = p.replace_missing(df_train, df_test, ['followers_count', 
                  'friends_count', 'listed_count', 'favourites_count', 'statuses_count'])

[4m"Train data before fillna with median value:"
[0m followers_count     0
friends_count       0
listed_count        0
favourites_count    0
statuses_count      0
dtype: int64
[4m"Test data before fillna with median value:"
[0m followers_count     0
friends_count       0
listed_count        0
favourites_count    0
statuses_count      0
dtype: int64

[1m[92m"Median Values to fill" {'followers_count': 1.0, 'friends_count': 21.0, 'listed_count': 0.0, 'favourites_count': 0.0, 'statuses_count': 40.0} [0m

[4m[94m"Sanity check: Train data after fillna with median value"
[0m followers_count     0
friends_count       0
listed_count        0
favourites_count    0
statuses_count      0
dtype: int64
[4m[94m"Sanity check: Test data after fillna with median value"
[0m followers_count     0
friends_count       0
listed_count        0
favourites_count    0
statuses_count      0
dtype: int64


In [23]:
#Define features and target
labels = df_train.loc[:, df_train.columns != 'bot'].columns.values
train_target = df_train.loc[:, 'bot']
train_features = df_train.loc[:, df_train.columns != 'bot']
test_target = df_test.loc[:, 'bot']
test_features = df_test.loc[:, df_train.columns != 'bot']

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
seed=0
k=10
# Decision Tree by the function
dt = DecisionTreeClassifier(random_state=seed)

params = {'criterion':['entropy', 'gini'], 'max_depth':[1,3,5], 'min_samples_split':[2,5,10,20]}

grid_tree = GridSearchCV(estimator=dt, param_grid=params,
                   cv=k, return_train_score=True,
                   scoring = ['accuracy', 'precision', 'recall'],
                   refit='accuracy')

grid_tree.fit(train_features, train_target)
cv_results = pd.DataFrame(grid_tree.cv_results_)

KeyboardInterrupt: 

In [None]:
columns = ['param_criterion','param_max_depth', 'param_min_samples_split', 
                   'rank_test_accuracy', 'mean_test_accuracy', 'mean_test_precision',
                   'mean_test_recall']
results = cv_results.sort_values(by='rank_test_accuracy', ascending=True)
results = results[columns]
results.head()

In [None]:
from sklearn import tree
import pydotplus
from IPython.display import Image

best = grid_tree.best_estimator_
dot_data = tree.export_graphviz(best, out_file=None, feature_names=labels, class_names=True)

graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
importances = best.feature_importances_

# Sort in descending order
indices = np.argsort(importances)[::-1]

# Sort the labels in a corresponding fashion
names = [labels[i] for i in indices]

# Plot
sns.set(rc={'figure.figsize':(10, 5)})
sns.set_style("white")
plt.figure()
plt.title('Figure 1 - Most important feature best accuracy model, Decision Tree', fontsize=22)
plt.ylabel('Percentage importance', fontsize=15, fontweight='bold')
plt.xlabel('Feature', fontsize=15, fontweight='bold')
plt.bar(range(train_features.shape[1]),importances[indices])
plt.xticks(range(train_features.shape[1]), names, rotation=90)
plt.show()

In [None]:
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, classification_report, plot_precision_recall_curve, precision_recall_curve

#Predict the values of the train
test_pred=best.predict(test_features)

#Plot the confusion matrix
plot_confusion_matrix(best,train_features,train_target)
plt.title('Figure 2 - Confusion matrix best accuracy model, Decision Tree', fontsize=22)
print(metrics.confusion_matrix(test_target, test_pred))

In [None]:
#Get the metrics for the Decision Tree model
print("Accuracy:\t{}\nPrecision:\t{}\nRecall:\t\t{}\nF1 Score:\t{}\n".format(metrics.accuracy_score(test_target, test_pred),
                                                                           metrics.precision_score(test_target, test_pred),
                                                                           metrics.recall_score(test_target, test_pred),
                                                                           metrics.f1_score(test_target, test_pred)
                                                                          ))

In [None]:
#Plot Precision Recall curve best model decision tree
plot_precision_recall_curve(best,test_features,test_target)
plt.title('Figure 3 - Precision Recall Curve Best Model Decision Tree', fontsize=22)
plt.show()

## 2. Random Forest

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=seed)

params2 = {'criterion':['entropy', 'gini'], 'max_depth':[1,3,5], 'min_samples_split':[2,5,10], 'n_estimators':[100,1000,5000]}

grid_rf = GridSearchCV(estimator=rf, param_grid=params2,
                   cv=k, return_train_score=True,
                   scoring = ['accuracy', 'precision', 'recall'],
                   refit='accuracy')

grid_rf.fit(train_features, train_target)
cv_results_rf = pd.DataFrame(grid_rf.cv_results_)

In [None]:
results_rf = cv_results_rf.sort_values(by='rank_test_accuracy', ascending=True)
results_rf = results_rf[['param_criterion','param_max_depth', 
                                'param_min_samples_split', 'param_n_estimators', 
                                'mean_test_accuracy', 'rank_test_accuracy',
                                'mean_test_precision', 'mean_test_recall']]
results_rf.head()

In [None]:
bestrf = grid_rf.best_estimator_
bestrf

In [None]:
importances_rf = bestrf.feature_importances_

# Sort in descending order
indices_rf = np.argsort(importances_rf)[::-1]

# Sort the labels in a corresponding fashion
names_rf = [labels[i] for i in indices_rf]

# Plot
sns.set(rc={'figure.figsize':(10, 5)})
sns.set_style("white")
plt.figure()
plt.title('Figure 8 - Most important feature best accuracy model Random Forest', fontsize=22)
plt.ylabel('Percentage importance', fontsize=15, fontweight='bold')
plt.xlabel('Feature', fontsize=15, fontweight='bold')
plt.bar(range(train_features.shape[1]),importances_rf[indices])
plt.xticks(range(train_features.shape[1]), names_rf, rotation=90)
plt.show()

In [None]:
test_pred_rf = bestrf.predict(test_features)
plot_confusion_matrix(bestrf,train_features,train_target)
plt.title('Figure 9 - Confusion Matrix Best Model of Random Forest', fontsize=22)
print(metrics.confusion_matrix(test_target, test_pred_rf))

In [None]:
#Complete preicision recall curve for random forst
plot_precision_recall_curve(bestrf,test_features,test_target)
plt.title('Figure 10 - Precision Recall Curve Random Forest', fontsize=22)
plt.show()

## 3. Naive Bayes
#### Challenges:
Scikit Learn's Naive Bayes Model assumes same type of distribution for all features. However, there are multiple types of features in our dataset, including binomial variables(verified, geoenabled, default_profile, has_description, len_en) and continuous variables(followers_count, friends_count, listed_count, favourites_count, statuses_count). 

To proceed, we decide to assume multinomial distribution for our features for two reasons. First, it is reasonable to transform continuous variables into categorical variables through discretization. Second,binomial distribution is simply a special case of multinomial distribution. 

#### Transform continuous variables to categorical variables

In [40]:
# We are patitioning each continuous variables into 5 bins (Hey Maca and Rukhshan, I think we should take care of outliers earlier in the dataset)
continuous_var = ["followers_count","friends_count","listed_count","favourites_count","statuses_count"]
nb_train = df_train.copy()
nb_test = df_test.copy()
datasets = [nb_train, nb_test]
bin_labels_5 = [1,2,3,4,5] #five levels
        
for df_now in datasets:
    for variable in continuous_var :
        new_name = "cat_"+ variable
        cut_bins = [0,10,100,1000,10000, df_now[variable].max()]
        print(cut_bins)
        df_now[new_name] = pd.Series(pd.cut(df_now[variable], bins=cut_bins, labels=bin_labels_5, include_lowest=True))
        
nb_train.head()
#nb_train["followers_count"].quantile(0.8)

[0, 10, 100, 1000, 10000, 106938028]
[0, 10, 100, 1000, 10000, 2141379]
[0, 10, 100, 1000, 10000, 224105]
[0, 10, 100, 1000, 10000, 886115]
[0, 10, 100, 1000, 10000, 2766520]
[0, 10, 100, 1000, 10000, 105043889]
[0, 10, 100, 1000, 10000, 1250468]
[0, 10, 100, 1000, 10000, 606500]
[0, 10, 100, 1000, 10000, 577666]
[0, 10, 100, 1000, 10000, 1101318]


Unnamed: 0,bot,verified,geo_enabled,default_profile,has_description,len_en,followers_count,friends_count,listed_count,favourites_count,statuses_count,cat_followers_count,cat_friends_count,cat_listed_count,cat_favourites_count,cat_statuses_count
11658,1,0.0,0.0,1.0,1,1,320,480,0,473,488,3,3,1,3,3
10293,1,0.0,0.0,1.0,0,0,0,21,0,0,5,1,2,1,1,1
47061,0,0.0,1.0,0.0,1,1,752,765,72,10498,41776,3,3,2,5,5
53260,0,0.0,1.0,0.0,1,1,1004,1595,46,10804,32873,4,4,2,5,5
21949,1,0.0,0.0,1.0,0,1,0,0,0,0,45,1,1,1,1,2


In [41]:
# Only keep the needed columns
nb_model_train= nb_train[['bot', 'verified', 'geo_enabled', 'default_profile', 'has_description', 'len_en', 'cat_followers_count', 
                  'cat_friends_count', 'cat_listed_count', 'cat_favourites_count', 'cat_statuses_count']]
nb_model_test = nb_test[['bot', 'verified', 'geo_enabled', 'default_profile', 'has_description', 'len_en', 'cat_followers_count', 
                  'cat_friends_count', 'cat_listed_count', 'cat_favourites_count', 'cat_statuses_count']]

Since all variables are categorical/binomial now, we don't need to normalize them.

#### Modeling

In [42]:
# Getting features and target
labels = nb_model_train.loc[:, nb_model_train.columns != 'bot'].columns.values
nb_train_target = nb_model_train.loc[:, 'bot']
nb_train_features = nb_model_train.loc[:, nb_model_train.columns != 'bot']
nb_test_target = nb_model_test .loc[:, 'bot']
nb_test_features = nb_model_test .loc[:, nb_model_test .columns != 'bot']

In [72]:
# Fit the Model using randomized search
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

naive_bayes = MultinomialNB()
#params = {'class_prior': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,None]}
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
params= {'alpha': alphas, 'fit_prior' : [True, False], 'class_prior' : [None, [.1,.9],[.2, .8]]}
#random_search = RandomizedSearchCV(estimator = naive_bayes, param_distributions= params, scoring='accuracy',n_iter=10,cv=10,random_state=0)
grid_search = GridSearchCV(estimator = naive_bayes, param_grid = params, scoring = ["accuracy", "precision","recall"],cv=10, refit='accuracy')
# train the model
nb_model = random_search.fit(nb_train_features, nb_train_target)
grid_search.fit(nb_train_features, nb_train_target)
# scores
#random_search.grid_scores_

GridSearchCV(cv=10, estimator=MultinomialNB(),
             param_grid={'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
                                   1.0],
                         'class_prior': [None, [0.1, 0.9], [0.2, 0.8]],
                         'fit_prior': [True, False]},
             refit='accuracy', scoring=['accuracy', 'precision', 'recall'])

In [73]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_alpha', 'param_class_prior', 'param_fit_prior', 'params',
       'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy',
       'split3_test_accuracy', 'split4_test_accuracy', 'split5_test_accuracy',
       'split6_test_accuracy', 'split7_test_accuracy', 'split8_test_accuracy',
       'split9_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy',
       'rank_test_accuracy', 'split0_test_precision', 'split1_test_precision',
       'split2_test_precision', 'split3_test_precision',
       'split4_test_precision', 'split5_test_precision',
       'split6_test_precision', 'split7_test_precision',
       'split8_test_precision', 'split9_test_precision', 'mean_test_precision',
       'std_test_precision', 'rank_test_precision', 'split0_test_recall',
       'split1_test_recall', 'split2_test_recall', 'split3_test_recall',
       'split4_test_recall', 'split5_test_recall', 'split6_test

In [77]:
cv_results = pd.DataFrame(grid_search.cv_results_)
columns = ['param_alpha','param_class_prior','param_fit_prior' , 
                   'rank_test_accuracy', 'mean_test_accuracy','mean_test_precision','mean_test_recall']
results = cv_results.sort_values(by='rank_test_accuracy', ascending=True)
results = results[columns]
results.head()

Unnamed: 0,param_alpha,param_class_prior,param_fit_prior,rank_test_accuracy,mean_test_accuracy,mean_test_precision,mean_test_recall
0,0.1,,True,1,0.930523,0.940457,0.973461
54,1.0,,True,1,0.930523,0.940457,0.973461
24,0.5,,True,1,0.930523,0.940457,0.973461
36,0.7,,True,1,0.930523,0.940457,0.973461
6,0.2,,True,1,0.930523,0.940457,0.973461


In [79]:
# Examine the best model
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.9305231898889673
{'alpha': 0.1, 'class_prior': None, 'fit_prior': True}
MultinomialNB(alpha=0.1)


In [82]:
from sklearn import metrics
# Use the best model to predict test data
bestnb = grid_search.best_estimator_
test_pred_nb = bestnb.predict(nb_test_features)
metrics.confusion_matrix(nb_test_target, test_pred_nb)
print("Accuracy:\t{}\nPrecision:\t{}\nRecall:\t\t{}\nF1 Score:\t{}\n".format(metrics.accuracy_score(nb_test_target, test_pred_nb),
                                                                           metrics.precision_score(nb_test_target, test_pred_nb),
                                                                           metrics.recall_score(nb_test_target, test_pred_nb),
                                                                           metrics.f1_score(nb_test_target, test_pred_nb)
                                                                          ))

Accuracy:	0.9329744279946164
Precision:	0.9415584415584416
Recall:		0.9753733895792954
F1 Score:	0.9581676653413227

