# Twitter Bot or Not?

Model Comparison: 
* RandomForest 
* XGBoost

Oversampling Method Comparison: 
* None
* RandomOversampler
* SMOTE
* ADASYN

In [1]:
# Basics
import pandas as pd
import numpy as np
from collections import Counter

# Visuals
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from visualize import *

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Model support
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, f1_score, 
                             plot_confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve)
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

## Data import and setup

In [2]:
raw_df = pd.read_csv('data_files/twitter_human_bots_dataset.csv.zip', index_col=False)

# drop funny index column
raw_df.drop(columns=['Unnamed: 0'], inplace=True)

# Binary classifications for bots and boolean values
raw_df['bot'] = raw_df['account_type'].apply(lambda x: 1 if x == 'bot' else 0)
raw_df['default_profile'] = raw_df['default_profile'].astype(int)
raw_df['default_profile'] = raw_df['default_profile'].astype(int)
raw_df['default_profile_image'] = raw_df['default_profile_image'].astype(int)
raw_df['geo_enabled'] = raw_df['geo_enabled'].astype(int)
raw_df['verified'] = raw_df['verified'].astype(int)

# datetime conversion
raw_df['created_at'] = pd.to_datetime(raw_df['created_at'])
# hour created
raw_df['hour_created'] = pd.to_datetime(raw_df['created_at']).dt.hour



In [3]:
# usable df setup
df = raw_df[['bot', 'screen_name', 'created_at', 'hour_created', 'verified', 'location', 'geo_enabled', 'lang', 'default_profile', 
              'default_profile_image', 'favourites_count', 'followers_count', 'friends_count', 'statuses_count',
             'average_tweets_per_day', 'account_age_days']]

In [4]:
# Interesting features to look at: 
df['avg_daily_followers'] = np.round(df['followers_count'] / df['account_age_days'])
df['avg_daily_friends'] = np.round(df['followers_count'] / df['account_age_days'])
df['avg_daily_favorites'] = np.round(df['followers_count'] / df['account_age_days'])

# Log transformations for highly skewed data
df['friends_log'] = np.round(np.log(1 + df['friends_count']), 3)
df['followers_log'] = np.round(np.log(1 + df['followers_count']), 3)
df['favs_log'] = np.round(np.log(1 + df['favourites_count']), 3)
df['avg_daily_tweets_log'] = np.round(np.log(1+ df['average_tweets_per_day']), 3)

# Possible popularity metrics
df['popularity'] = np.round(df['friends_log'] * df['followers_log'], 3)
df['tweet_to_followers'] = np.round(np.log( 1+ df['statuses_count']) * np.log(1+ df['followers_count']), 3)

# Log-transformed daily acquisition metrics for dist. plots
df['follower_acq_rate'] = np.round(np.log(1 + (df['followers_count'] / df['account_age_days'])), 3)
df['friends_acq_rate'] = np.round(np.log(1 + (df['friends_count'] / df['account_age_days'])), 3)
df['favs_rate'] = np.round(np.log(1 + (df['friends_count'] / df['account_age_days'])), 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_daily_followers'] = np.round(df['followers_count'] / df['account_age_days'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_daily_friends'] = np.round(df['followers_count'] / df['account_age_days'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_daily_favorites'] = np.round(

In [5]:
del raw_df

In [6]:
num_bots = len(df[df['bot'] == 1])
num_humans = len(df[df['bot'] == 0])

print("Number of bots: ", num_bots)
print("Number of humans: ", num_humans)
print("Bots / Total %: ", (num_bots / len(df)) * 100)

Number of bots:  12425
Number of humans:  25013
Bots / Total %:  33.18820449810353


In [7]:
features = ['verified', 
            #'created_at',
            #'hour_created',
            'geo_enabled', 
            'default_profile', 
            'default_profile_image', 
            'favourites_count', 
            'followers_count', 
            'friends_count', 
            'statuses_count', 
            'average_tweets_per_day',
            #'avg_daily_followers', 
            #'avg_daily_friends',
            #'avg_daily_favorites',
            'popularity', 
            'tweet_to_followers', 
            'follower_acq_rate', 
            'friends_acq_rate', 
            'favs_rate'
           ]

X = df[features]
y = df['bot']

In [8]:
def kf_model_compare(X, y, n_splits=3, random_state=3):
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    forest_acc, forest_prec, forest_rec, forest_f1, forest_roc_auc = [], [], [], [], []
    xgb_acc, xgb_prec, xgb_rec, xgb_f1, xgb_roc_auc = [], [], [], [], []

    X_kf, y_kf = np.array(X), np.array(y)
    
    for train_ind, val_ind in kf.split(X, y):

        X_train, y_train = X_kf[train_ind], y_kf[train_ind]
        X_val, y_val = X_kf[val_ind], y_kf[val_ind]

        # Random Forest
        forest = RandomForestClassifier()
        forest.fit(X_train, y_train)
        forest_pred = forest.predict(X_val)

        forest_acc.append(accuracy_score(y_val, forest_pred))
        forest_prec.append(precision_score(y_val, forest_pred))
        forest_rec.append(recall_score(y_val, forest_pred))
        forest_f1.append(f1_score(y_val, forest_pred))
        forest_roc_auc.append(roc_auc_score(y_val, forest.predict_proba(X_val)[:,1]))

        # XGBoost
        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        xgb_pred = xgb.predict(X_val)

        xgb_acc.append(accuracy_score(y_val, xgb_pred))
        xgb_prec.append(precision_score(y_val, xgb_pred))
        xgb_rec.append(recall_score(y_val, xgb_pred))
        xgb_f1.append(f1_score(y_val, xgb_pred))
        xgb_roc_auc.append(roc_auc_score(y_val, xgb.predict_proba(X_val)[:,1]))


    print("RandomForest")
    print(f'Accuracy: {np.mean(forest_acc):.5f} +- {np.std(forest_acc):5f}')
    print(f'Precision: {np.mean(forest_prec):.5f} +- {np.std(forest_prec):5f}')
    print(f'Recall: {np.mean(forest_rec):.5f} +- {np.std(forest_rec):5f}')
    print(f'F1 Score: {np.mean(forest_f1):.5f} +- {np.std(forest_f1):5f}')
    print(f'ROC AUC: {np.mean(forest_roc_auc):.5f} +- {np.std(forest_roc_auc):5f}')
    print("")
    print("XGBoost")
    print(f'Accuracy: {np.mean(xgb_acc):.5f} +- {np.std(xgb_acc):5f}')
    print(f'Precision: {np.mean(xgb_prec):.5f} +- {np.std(xgb_prec):5f}')
    print(f'Recall: {np.mean(xgb_rec):.5f} +- {np.std(xgb_rec):5f}')
    print(f'F1 Score: {np.mean(xgb_f1):.5f} +- {np.std(xgb_f1):5f}')
    print(f'ROC AUC: {np.mean(xgb_roc_auc):.5f} +- {np.std(xgb_roc_auc):5f}')

## No Oversampling

In [9]:
kf_model_compare(X, y, n_splits=4, random_state=37)

RandomForest
Accuracy: 0.87336 +- 0.002599
Precision: 0.85314 +- 0.004486
Recall: 0.74710 +- 0.008123
F1 Score: 0.79656 +- 0.003321
ROC AUC: 0.92754 +- 0.002403

XGBoost
Accuracy: 0.87417 +- 0.003397
Precision: 0.84895 +- 0.004499
Recall: 0.75523 +- 0.007839
F1 Score: 0.79933 +- 0.004919
ROC AUC: 0.92904 +- 0.002306
