
## Importing Packages for the Project

In [None]:
# Basic packages required for data manipulation and Feature Engineering

import pandas as pd
import numpy as np
# for plotting graphs
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import pylab as py

# statistical package for conducting hypothesis tests and other tasks 
import statsmodels.api as stm
from scipy.stats import chi2_contingency
from scipy.stats import norm

#package for model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 

#package for Feature Engineering/Categorical Data Transformation
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


# import SMOTE module from imblearn library 
# pip install imblearn (if you don't have imblearn in your system) 
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# to allow multiple outputs be visible in the same output cell

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


from scipy import stats


## Reading the data utilised for training the model

In [None]:
# Directory that holds the project's CSV exports. Keeping the location in one
# constant means a different checkout only needs a single edit.
DATA_DIR = "~/virtualenv/Twitter_Project/Twitter_Bot_Classification"

# Raw account-level data.
df = pd.read_csv(f"{DATA_DIR}/finaldata.csv")

# Pre-computed model features; within this notebook only its 'age' column is read.
# TODO: derive these features from `df` directly so that a single file is loaded.
df_1 = pd.read_csv(f"{DATA_DIR}/model_data.csv")
df_1 = df_1.drop('Unnamed: 0', axis =1)

# Drop the CSV index column and the account fields that are not used below.
df = df.drop(columns = ['Unnamed: 0', 'crawled_at', 'contributors_enabled','testset', 'random', 'default_profile_image', 'notifications', 'following', 'follow_request_sent', 'is_translator'])

#### Variable Modification for use in the model

#### Modifying certain columns to use in the model

In [None]:
# Builds the target label and the "0"/"1" flag columns used by the model.
# Every `value_counts()` call is kept as a top-level expression so that it is
# displayed (the notebook enables ast_node_interactivity = "all").

# Target label: "0" when Category is 'genuine', "1" for every other category (SPAM/bot).
df["class"] = np.where(df["Category"] == 'genuine', "0", "1")
#class count
df['class'].value_counts()

# url_present: "1" when the account has a url, "0" when the field is missing.
df["url_present"] = np.where(df["url"].isnull(), "0", "1")
#count of accounts
df['url_present'].value_counts()
#bar plot to visualise the class of account based on url_present label
#fig_1 = sns.countplot(x = 'url_present', data = df, hue = 'class')

# The columns below already exist in `df`. They are cast to object before the
# "0" marker is written, because assigning a string directly into a numeric
# column relies on an implicit dtype upcast that newer pandas versions
# deprecate. The stored values are the same as before; the modelling section
# converts them to float.

#if the profile is default or not, bot/SPAM accounts are likely to be default
df["default_profile"] = df["default_profile"].astype(object).where(df["default_profile"].notnull(), "0")
#default account value count based on the label
df['default_profile'].value_counts()
#bar plot to visualise distribution of default_profile on class
#fig_1 = sns.countplot(x = 'default_profile', data = df, hue = 'class')

#if the account has enabled its geo location
df["geo_enabled"] = df["geo_enabled"].astype(object).where(df["geo_enabled"].notnull(), "0")
#count of accounts that have enabled their geo_location
df['geo_enabled'].value_counts()
#visualisation
#fig_1 = sns.countplot(x = 'geo_enabled', data = df, hue = 'class')

# profile_use_background_image: a missing value is recorded as "0".
df["profile_use_background_image"] = df["profile_use_background_image"].astype(object).where(df["profile_use_background_image"].notnull(), "0")
df['profile_use_background_image'].value_counts()
#fig_1 = sns.countplot(x = 'profile_use_background_image', data = df, hue = 'class')

# description_present: "1" when the account has a description, otherwise "0".
df["description_present"] = np.where(df["description"].isnull(), "0", "1")
df['description_present'].value_counts()
#fig_1 = sns.countplot(x = 'description_present', data = df, hue = 'class')

# verified / protected: anything that is not exactly 1 (including missing) becomes "0".
df["verified"] = df["verified"].astype(object).where(df["verified"] == 1, "0")
df['verified'].value_counts()
df["protected"] = df["protected"].astype(object).where(df["protected"] == 1, "0")
df['protected'].value_counts()


##### Ignore warning should be implemented or not?

In [None]:
# Take an explicit copy of the modelling columns. Without `.copy()` the frame is
# a slice of `df`, and adding columns to it (here and in later cells) triggers
# pandas' SettingWithCopyWarning.
df_model = df[['screen_name','statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'url_present', 'default_profile', 'geo_enabled', 'profile_use_background_image','description_present', 'verified', 'protected', 'class']].copy()

#implement age variable function in the above portion only directly from the df['timestamp']
# The assignment aligns on the row index.
# NOTE(review): assumes df_1 has the same index/row order as df — confirm.
df_model['age'] = df_1['age']

## Using EDA to understand data and create new Features

In [None]:
pd.options.display.float_format = '{:.5f}'.format
df_model.describe()

In [None]:
# Correlation heat map of the numeric columns of `df`.
# Only numeric/boolean columns are passed to corr(): `df` also holds string
# columns, which recent pandas versions refuse to correlate.
corrMatrix = df.select_dtypes(include = ['number', 'bool']).corr()
sns.set(rc = {'figure.figsize':(10, 10)} )
sns.set(font_scale = 2)
sns.heatmap(corrMatrix, vmin = -1, vmax = 1, center = 0, cmap = sns.diverging_palette(40, 420, n = 200), square = True, cbar_kws = {'shrink': 1}, annot = True)
# `plt.show` without parentheses only referenced the function; call it.
plt.show()

In [None]:
# One box plot per numeric feature.
# The feature names are kept in `numeric_features` rather than `list`:
# rebinding the builtin breaks every later `list(...)` call in the notebook.
numeric_features = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'age']
for feature in numeric_features:
    # plt.subplots places the axes inside the figure; the previous
    # add_axes([1, 1, 1, 1]) rectangle started at the figure's top-right corner.
    fig, ax = plt.subplots(figsize =(7, 7))
    #change of origin by adding 1 to the original data
    ax.boxplot(df_model[feature] + 1 )
    ax.set_title(feature)

In [None]:
# Histogram (with KDE) of every numeric feature on the original scale.
# An ordered list is used instead of a set so the plots appear in the same
# order on every run, and the builtin `list` is no longer shadowed.
numeric_features = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'age']
for feature in numeric_features:
    fig, ax = plt.subplots(1)
    sns.histplot(df_model[feature], bins = 100, log_scale= False, kde = True, ax = ax)
    #np.log(df[i]+1).plot.hist(bins = 100)


In [None]:
# Histogram (with KDE) of every numeric feature on a log scale; 1 is added
# first so that zero counts stay plottable.
# An ordered list is used instead of a set so the plots appear in the same
# order on every run, and the builtin `list` is no longer shadowed.
numeric_features = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'age']
for feature in numeric_features:
    fig, ax = plt.subplots(1)
    sns.histplot(df_model[feature]+1, bins = 100, log_scale= True, kde = True, ax = ax)
    #np.log(df[i]+1).plot.hist(bins = 100)


#look for details
# Box-Cox transform of statuses_count (+1 keeps the input strictly positive);
# element [0] of the result is the transformed data.
crim_boxcox = stats.boxcox(df['statuses_count']+1)[0]

In [None]:
pd.crosstab(df['verified'], df['protected'])

In [None]:
pd.crosstab(df['default_profile'], df['protected'])

In [None]:
pd.crosstab(df['default_profile'], df['protected'])

In [None]:
# not working with crosstab to assess the suitability of variables
#pd.crosstab(df['default_profile'], df['default_profile_image'])

In [None]:
pd.crosstab(df['class'], df['verified'])

In [None]:
pd.crosstab(df['class'], df['protected'], margins = True)

In [None]:
pd.crosstab(df['protected'], df['verified'], margins = True)

In [None]:
pd.crosstab(df['verified'], df['class'], margins = True)

In [None]:
pivot_df = pd.crosstab(index = df_model['protected'], columns = df_model['class'])

In [None]:
plt.clf()
cross = pd.crosstab(index = [df_model['verified'], df_model['protected'], df_model['default_profile'], df_model['profile_use_background_image'], df_model['description_present'], df_model['geo_enabled'], df_model['url_present']], columns = df['class'])
plt.subplots(figsize=(10,10))
plt.tick_params(labelsize = 10)
sns.heatmap(cross, cmap = 'YlOrBr', cbar_kws = {'shrink': 0.8})
cross

In [None]:
df_model.info()

In [None]:
# Independent copy of the count columns plus the label. The next cell overwrites
# these columns, which would otherwise write into a slice of `df`
# (SettingWithCopyWarning).
df_pairplot = df[['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count', 'class']].copy()

In [None]:
# Replace every count column of the pair-plot frame by log(count + 1).
for count_col in ('statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count'):
    shifted = df_pairplot[count_col] + 1
    df_pairplot[count_col] = np.log(shifted)

In [None]:
sns.pairplot(df_pairplot)

In [None]:
sns.pairplot(df_pairplot, hue = 'class')

In [None]:
#3D Plot
# Scatter of three log-transformed counts; the axis labels name the plotted
# quantity instead of the placeholder 'X/Y/Z Label' text.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(np.log(df['followers_count']+1), np.log(df['listed_count']+1), np.log(df['favourites_count']+1), c='r', marker='o')

ax.set_xlabel('log(followers_count + 1)')
ax.set_ylabel('log(listed_count + 1)')
ax.set_zlabel('log(favourites_count + 1)')

In [None]:
# 3D plot
# The transformed columns are collected in a small frame with descriptive
# names so plotly can label the axes itself. The previous `labels=True`
# argument was not a column-to-label mapping.
scatter_3d_data = pd.DataFrame({
    'log(statuses_count + 1)': np.log(df['statuses_count']+1),
    'log(followers_count + 1)': np.log(df['followers_count']+1),
    'log(listed_count + 1)': np.log(df['listed_count']+1),
    'Category': df['Category'],
})
fig = px.scatter_3d(scatter_3d_data, x='log(statuses_count + 1)', y='log(followers_count + 1)', z='log(listed_count + 1)', color='Category')
fig.show()

In [None]:
# Normal Q-Q plots of the log-transformed counts.
# fit = True standardises each sample with its fitted location and scale, so
# the 45-degree reference line is meaningful; on the raw log counts the line
# only compared the data against a standard normal.
fig_5 = stm.qqplot(np.log(df['favourites_count']+1), fit = True, line = '45')
fig_4 = stm.qqplot(np.log(df['listed_count']+1), fit = True, line = '45')
fig_3 = stm.qqplot(np.log(df['followers_count']+1), fit = True, line = '45')
fig_2 = stm.qqplot(np.log(df['statuses_count']+1), fit = True, line = '45')
fig_1 = stm.qqplot(np.log(df['friends_count']+1), fit = True, line = '45')
plt.show()

In [None]:
plt.scatter(np.log(df['followers_count']+1), np.log(df['friends_count']+1))

# Work on this section required

## Feature Creation using Color Schemes.

In [None]:
# Group the profile background colour into a small categorical feature: the
# four colour codes listed below keep their own label, every other value
# (including missing) is labelled "others".
df['profile_background_color'].value_counts()

kept_background_colors = ['0000FF', 'C0DEED', '131516', '000000']
background_color = df["profile_background_color"]
df["background_color_class"] = background_color.where(background_color.isin(kept_background_colors), "others")
df['background_color_class'].value_counts()
fig_1 = sns.countplot(x = 'background_color_class', data = df, hue = 'class')

In [None]:
df['profile_text_color'].value_counts()

In [None]:
### Need to change variable encoding for 0  and 000000, since 0 represents missing value and is category in itself

In [None]:
# Group the profile text colour into a small categorical feature. Rows that
# match none of the rules stay missing here; they are labelled in the next cell.
text_color = df["profile_text_color"]
df.loc[text_color == '333333', "text_color_class"] = "333333"
# '0' and '000000' are treated as the same colour.
df.loc[text_color.isin(['0', '000000']), "text_color_class"] = "000000"
# Labelled with its hex code like the other classes (previously the stray label "3").
df.loc[text_color == '3D1957', "text_color_class"] = "3D1957"

In [None]:
# Label every remaining (unmatched) text colour as "other" and inspect the result.
df['text_color_class'] = df['text_color_class'].fillna("other")
df['text_color_class'].value_counts()
# Plot the feature built in this section; the previous call re-plotted
# 'background_color_class'.
fig_1 = sns.countplot(x = 'text_color_class', data = df, hue = 'class')

In [None]:
df_model['text_color_class'] = df['text_color_class']

In [None]:
print(df['profile_sidebar_fill_color'].value_counts())

In [None]:
# Group the sidebar fill colour into a small categorical feature. Rows that
# match none of the rules are left untouched here; the next cell labels them.
sidebar_color = df["profile_sidebar_fill_color"]
for color_code in ('DDEEF6', '407DB0', 'EFEFEF'):
    df.loc[sidebar_color == color_code, "sidebar_color_class"] = color_code
# '0' and '000000' are treated as the same colour.
df.loc[sidebar_color.isin(['0', '000000']), "sidebar_color_class"] = "000000"

In [None]:
df['sidebar_color_class'] = df['sidebar_color_class'].fillna('others')
#fig_1 = sns.countplot(x = 'sidebar_color_class', data = df, hue = 'class')

In [None]:
df_model['sidebar_color_class'] = df['sidebar_color_class']

## Hypothesis Testing

#### Hypothesis Testing and significance of Variable, assumptions of Hypothesis Tests are to be met before performing them

#### Testing of hypothesis that newly created variables have significant impact on the class

In [None]:
# Since we are using training data to set variables, we may not find the variable highly effective in the testing set.

In [None]:
# https://towardsdatascience.com/how-to-test-for-statistically-significant-relationships-between-categorical-variables-with-chi-66c3ebeda7cc

In [None]:
### create column with value count of class for the pivot table

###### Hypothesis: the variables 'protected' and 'class' are independent

In [None]:
# Chi-square test of independence on the contingency table held in `pivot_df`,
# run without the continuity correction.
# NOTE(review): `pivot_df` is presumably still the protected-by-class crosstab
# built earlier in the notebook — confirm no cell has re-bound it in between.
chi2, p, dof, ex = chi2_contingency(pivot_df, correction=False)
# Print the test statistic and the p-value with 10 decimal places.
print(chi2, '{:.10f}'.format(p))

###### Hypothesis: the variables 'background_color_class' and 'class' are independent

In [None]:
# Contingency table of background colour class against the label.
# 'background_color_class' is created on `df` and never copied into
# `df_model`, so the table is built from `df` (indexing `df_model` raised KeyError).
pivot_df = pd.crosstab(index = df['background_color_class'], columns = df['class'])

# Chi-square test of independence, with the continuity correction enabled.
chi2, p, dof, ex = chi2_contingency(pivot_df, correction=True)
print(pivot_df)
print(chi2, '{:.10f}'.format(p))

##  Variable Encoding for Categorical Variables and Feature Creation

In [None]:
### https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
# Integer-encode the nominal sidebar colour class and show which integer
# stands for which colour label.
gle = LabelEncoder()
s_color_labels = gle.fit_transform(df_model['sidebar_color_class'])
s_color_mappings = dict(enumerate(gle.classes_))
s_color_mappings

In [None]:
df_model['s_color_labels'] = s_color_labels

In [None]:
# One-hot encode the integer sidebar colour labels; each output column is
# named after the colour class it represents.
gen_ohe = OneHotEncoder()
s_color_f_arr = gen_ohe.fit_transform(
                              df_model[['s_color_labels']]).toarray()
# Use ndarray.tolist() rather than list(...): earlier cells rebind the name
# `list` to a collection of column names, so calling it here raised TypeError.
feature_labels = gle.classes_.tolist()
gen_features = pd.DataFrame(s_color_f_arr,
                            columns=feature_labels)

In [None]:
gen_features.head()

In [None]:
df_model_1 = pd.concat([df_model.reset_index(drop=True), gen_features], axis = 1)

In [None]:
# Remove the categorical helper columns now that the one-hot columns exist.
# errors = 'ignore' is needed because 'background_color_class' is never added
# to df_model, so a strict drop raised KeyError.
df_model_1 = df_model_1.drop(columns = ['s_color_labels', 'sidebar_color_class', 'text_color_class', 'background_color_class'], errors = 'ignore')

## Logistic Regression Model Training and Testing

In [None]:
# Start the modelling frame from df_model, as before, which means the one-hot
# colour columns built in the encoding section are not used.
# The frame is now a separate object with the string/label colour columns
# removed: the previous plain alias carried them into the feature matrix, where
# they cannot be converted to float for the model.
df_model_1 = df_model.drop(columns = ['s_color_labels', 'sidebar_color_class', 'text_color_class', 'background_color_class'], errors = 'ignore')

In [None]:
# Convert the flag columns and the label to float so that they can be used as
# numeric model inputs.
columns_to_float = (
    'url_present',
    'default_profile',
    'geo_enabled',
    'profile_use_background_image',
    'description_present',
    'verified',
    'protected',
    'class',
)
for flag_col in columns_to_float:
    df_model_1[flag_col] = df_model_1[flag_col].astype(float)

In [None]:
#features
# Drop the label and the account name, then any categorical colour helper
# column that may still be present; those hold strings/labels and are not
# model inputs.
X = df_model_1.drop(columns = ['class', 'screen_name'])
X = X.drop(columns = ['s_color_labels', 'sidebar_color_class', 'text_color_class', 'background_color_class'], errors = 'ignore')
#target variable
# 1-D array: scikit-learn estimators expect y with shape (n_samples,), and a
# column vector triggers a DataConversionWarning on every fit.
y = df_model_1['class'].values

#### Training the model on raw data

In [None]:
# split into 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

# describes info about train and test set 
print("Number transactions X_train dataset: ", X_train.shape) 
print("Number transactions y_train dataset: ", y_train.shape) 
print("Number transactions X_test dataset: ", X_test.shape) 
print("Number transactions y_test dataset: ", y_test.shape) 

In [None]:
# train the model on train set
# max_iter is raised from the default of 100: the count features are unscaled
# and span several orders of magnitude, which makes the solver slow to converge.
lr1 = LogisticRegression(max_iter = 1000)
# np.ravel guarantees the 1-D target shape scikit-learn expects.
fit_model = lr1.fit(X_train, np.ravel(y_train))
predictions = lr1.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

#### Using Oversampling technique to increase the label count for training data

In [None]:
# Class counts in the training split before resampling.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

# Resample the training split only with SMOTE; X_test / y_test are not touched.
# random_state fixes the resampling so that the cell is repeatable.
sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) 

# Shapes of the resampled training data.
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 

# Class counts after resampling.
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0))) 

In [None]:
# train the model on the SMOTE-resampled train set
# max_iter is raised from the default of 100: the count features are unscaled
# and span several orders of magnitude, which makes the solver slow to converge.
lr1 = LogisticRegression(max_iter = 1000)
fit_model = lr1.fit(X_train_res, y_train_res)
# Evaluation still uses the original (not resampled) test split.
predictions = lr1.predict(X_test)


# print classification report
print(classification_report(y_test, predictions))

#### Undersampling to assess the model accuracy when less data is present 

In [None]:
# Class counts in the training split before resampling.
print("Before Undersampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before Undersampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

# apply near miss 
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from imblearn.under_sampling import NearMiss 
nr = NearMiss() 

# Resample the training split only; X_test / y_test are not touched.
X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train.ravel()) 

# Shapes of the resampled training data.
print('After Undersampling, the shape of train_X: {}'.format(X_train_miss.shape)) 
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_miss.shape)) 

# Class counts after resampling.
print("After Undersampling, counts of label '1': {}".format(sum(y_train_miss == 1))) 
print("After Undersampling, counts of label '0': {}".format(sum(y_train_miss == 0))) 

In [None]:
# train the model on the NearMiss-undersampled train set
# max_iter is raised from the default of 100: the count features are unscaled
# and span several orders of magnitude, which makes the solver slow to converge.
lr2 = LogisticRegression(max_iter = 1000)
lr2.fit(X_train_miss, y_train_miss)
# Evaluation still uses the original (not resampled) test split.
predictions = lr2.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

#### Fitting the model on log transformed data (variables with large range) and checking if it improves the accuracies

In [None]:
X.info()

In [None]:
# log transformation of variables with large values for better fitting of model.
# The count columns are replaced by log(count + 1); the flag columns and 'age'
# are passed through unchanged. Column order matches the previous construction.
# Building the frame column-wise avoids the list-of-Series + transpose round
# trip, which forces every column through one common dtype.
# NOTE: this cell overwrites X, so running it twice applies the log twice.
log_scaled_columns = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count']
passthrough_columns = ['default_profile', 'geo_enabled', 'verified', 'protected', 'profile_use_background_image', 'url_present', 'description_present', 'age']
X = pd.concat([np.log(X[log_scaled_columns] + 1), X[passthrough_columns]], axis = 1)


In [None]:
# split into 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

# describes info about train and test set 
print("Number transactions X_train dataset: ", X_train.shape) 
print("Number transactions y_train dataset: ", y_train.shape) 
print("Number transactions X_test dataset: ", X_test.shape) 
print("Number transactions y_test dataset: ", y_test.shape) 

In [None]:
# train the model on the log-transformed train set
# max_iter is raised from the default of 100 so the solver has room to
# converge; the same setting is appropriate for the other logistic fits.
lr1 = LogisticRegression(max_iter = 1000)
# np.ravel guarantees the 1-D target shape scikit-learn expects.
fit_model = lr1.fit(X_train, np.ravel(y_train))
predictions = lr1.predict(X_test)

# print classification report
print(classification_report(y_test, predictions))

#### Using the statsmodels package to fit the model

In [None]:
# https://www.geeksforgeeks.org/logistic-regression-using-statsmodels/

In [None]:
# Logistic regression with statsmodels on the training split.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; without it the model is forced through the origin and is
# not comparable with scikit-learn's LogisticRegression, which fits one by default.
log_reg = stm.Logit(y_train, stm.add_constant(X_train)).fit()
print(log_reg.summary())

In [None]:
# Fit a statsmodels GLM on the training split and print its summary.
# NOTE(review): no `family` is passed, so despite the `log_reg_1` name this is
# presumably not a logistic fit (statsmodels documents Gaussian as the GLM
# default) — confirm whether the Binomial family was intended here.
log_reg_1 = stm.GLM(y_train, X_train).fit()
print(log_reg_1.summary())

#### RFE
##### RFE for the top features

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination down to a single feature, which yields a full
# ranking of all features.
logreg = LogisticRegression()

# n_features_to_select must be passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(logreg, n_features_to_select = 1)

# np.ravel guarantees the 1-D target shape scikit-learn expects.
rfe = rfe.fit(X, np.ravel(y))
print(rfe.support_)
print(rfe.ranking_)

In [None]:
predictors = X_train
selector = RFE(lr1, n_features_to_select= 1)
selector = selector.fit(predictors, y_train)

In [None]:
order = selector.ranking_
order

In [None]:
X_train.info()

In [None]:
# Human-readable "<rank>.<feature>" labels, ordered from best (rank 1) to worst.
# selector.ranking_[j] is the rank of column j, so each rank is paired with
# the column at the same position. The previous loop used the rank value as a
# column index and therefore attached ranks to the wrong feature names.
feature_ranks = [
    f"{rank}.{column}"
    for rank, column in sorted(zip(order, X_train.columns))
]

In [None]:
feature_ranks

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed (0, as in the train/test split) so that the forest and
# every metric derived from it are reproducible across runs.
rf = RandomForestClassifier(max_depth = 5, bootstrap=True, random_state = 0)
# np.ravel guarantees the 1-D target shape scikit-learn expects.
model = rf.fit(X_train, np.ravel(y_train))


In [None]:
# Use the forest's predict method on the test data
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix

In [None]:
# ROC AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class (second column of predict_proba) rather than from the
# hard 0/1 predictions, which reduce the curve to a single operating point.
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

In [None]:
confusion_matrix(y_test, predictions)

### Cross Validation Score

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score(rf, X, y, cv = 6, scoring='roc_auc')

In [None]:
model.feature_importances_

#### Binomial Family Model with Default Link Function

In [None]:
bin_model = stm.GLM(y_train, X_train, family = stm.families.Binomial())

In [None]:
bin_results = bin_model.fit()

In [None]:
print(bin_results.summary())

In [None]:
df.head()

### Work in Progress

#### Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA
# Project the feature matrix onto its first two principal components.
# Two components are requested to match the two column names below
# (n_components=1 produced a single column and the DataFrame call failed).
# The feature matrix is `X`; the lower-case `x` used before is not defined.
# NOTE(review): PCA is scale-sensitive; consider standardising X first.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

#### Hosmer Lemeshow Test Function 

In [None]:
def hl_test(data, g, target_col='ViolentCrimesPerPop', prob_col='prob'):
    '''
    Hosmer-Lemeshow test to judge the goodness of fit for binary data

    The rows are sorted by predicted probability and cut into `g` quantile
    groups; observed and expected event / non-event counts are compared per
    group and the statistic is referred to a chi-square distribution with
    g - 2 degrees of freedom.

    Input:
        data       : dataframe with the observed 0/1 outcome and the predicted
                     probability of every row (not modified)
        g          : integer, number of quantile groups
        target_col : name of the observed-outcome column
                     (default keeps the original 'ViolentCrimesPerPop')
        prob_col   : name of the predicted-probability column (default 'prob')

    Output: float, the p-value of the test (the statistic and p-value are
            also printed)
    '''
    # sort_values returns a new frame, so adding 'dcl' leaves `data` untouched.
    data_st = data.sort_values(prob_col)
    data_st['dcl'] = pd.qcut(data_st[prob_col], g)
    # observed=True skips empty quantile bins; they contributed only NaN terms,
    # which the final sum ignored anyway.
    by_group = data_st.groupby('dcl', observed=True)

    # Observed events, group sizes and observed non-events per group.
    ys = by_group[target_col].sum()
    yt = by_group[target_col].count()
    yn = yt - ys

    # Expected events and expected non-events per group.
    yps = by_group[prob_col].sum()
    ypt = by_group[prob_col].count()
    ypn = ypt - yps

    hltest = ( ((ys - yps)**2 / yps) + ((yn - ypn)**2 / ypn) ).sum()

    dof = g - 2
    # Use the chi-square distribution from scipy.stats explicitly: the bare
    # name `chi2` is not a distribution in this notebook (it is rebound to a
    # test statistic by the chi-square cells). sf() is the upper-tail
    # probability, i.e. 1 - cdf.
    pval = float(stats.chi2.sf(hltest, dof))

    print('\n HL-chi2({}): {}, p-value: {}\n'.format(dof, hltest, pval))
    return pval