Disclaimer: This script was adapted from HDS-5230, taught in Spring 2019 by Dr. Evan Carey.

In [3]:
import numpy as np
import pandas as pd
from patsy import dmatrices
import sklearn
from sklearn import datasets

In [1]:
# The original dataset doesn't have column names, so let's go and create them.
col_names = ['username','bot_or_not', 'followersCount', 'friendsCount', 'tweetsCount', 'favoritesCount', 'tweetsPerDay', 'followersToFriends', 'favoritesPerTweet']

In [4]:
# datasets are located at /data/clean/labeled/concatenated/ML folder within this repository
data_equal = pd.read_csv('dataset_equal_class.csv', names = col_names) # pass column names as an optional parameter
data_all = pd.read_csv('dataset_final.csv', names = col_names)

In [5]:
data_equal.shape

(1450, 9)

In [5]:
data_all.shape

(5042, 9)

In [6]:
# Detect variables that we will need for our model
vars_to_remove = ['username', 'bot_or_not']
vars_left = set(data_equal.columns) - set(vars_to_remove)
formula = "bot_or_not ~ " + " + ".join(vars_left)
formula

'bot_or_not ~ tweetsPerDay + friendsCount + favoritesCount + favoritesPerTweet + followersCount + tweetsCount + followersToFriends'

In [7]:
# Use Patsy to create design matrices which are sort of enhanced pandas dataframes.
Y_all,X_all = dmatrices(formula, data_all)
Y_equal,X_equal = dmatrices(formula, data_equal)

In [8]:
from sklearn import preprocessing
X_all_normalized = preprocessing.normalize(X_all)
X_equal_normalized = preprocessing.normalize(X_equal)

In [9]:
# Randomly splitting data into 80/20.
from sklearn.model_selection import train_test_split
X_all_train, X_all_test, y_all_train, y_all_test = \
    train_test_split(X_all,
                     np.ravel(Y_all), # prevents dimensionality error later
                     test_size=0.2,
                     random_state=42)

X_equal_train, X_equal_test, y_equal_train, y_equal_test = \
    train_test_split(X_equal,
                     np.ravel(Y_equal),
                     test_size=0.2,
                     random_state=42)

X_all_normalized_train, X_all_normalized_test, y_all_normalized_train, y_all_normalized_test = \
    train_test_split(X_all_normalized,
                     np.ravel(Y_all),
                     test_size=0.2,
                     random_state=42)

X_equal_normalized_train, X_equal_normalized_test, y_equal_normalized_train, y_equal_normalized_test = \
    train_test_split(X_equal_normalized,
                     np.ravel(Y_equal),
                     test_size=0.2,
                     random_state=42)


In [10]:
# Simple logistic regression without regularization.
from sklearn import linear_model

clf_all = linear_model.LogisticRegressionCV(fit_intercept=True, cv = 5, Cs = 20, solver='liblinear', max_iter = 1000)
clf_all.fit(X_all_train,y_all_train)

clf_equal = linear_model.LogisticRegressionCV(fit_intercept=True, cv = 5, Cs = 20, solver='liblinear',max_iter = 1000)
clf_equal.fit(X_equal_train,y_equal_train)

clf_all_normalized = linear_model.LogisticRegressionCV(fit_intercept=True, cv = 5, Cs = 20, solver='liblinear',max_iter = 1000)
clf_all_normalized.fit(X_all_normalized_train,y_all_normalized_train)

clf_equal_normalized = linear_model.LogisticRegressionCV(fit_intercept=True, cv = 5, Cs = 20, solver='liblinear',max_iter = 1000)
clf_equal_normalized.fit(X_equal_normalized_train,y_equal_normalized_train)


LogisticRegressionCV(Cs=20, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='liblinear',
           tol=0.0001, verbose=0)

In [11]:
# Create function to print results
def get_results(x1):
    print("\n{0:30}   {1:4}    {2:4}".format('Model','Train','Test'))
    print('------------------------------------------------')
    for i in x1.keys():
        print("{0:30}   {1:<6.4}   {2:<6.4}".format(i,x1[i][0],x1[i][1]))

In [12]:
# Create dictionary to store all the results:
result_scores = {}
# Score the Model on Training and Testing Set
result_scores['Logistic_noReg_all'] = \
            (sklearn.metrics.accuracy_score(y_all_train,clf_all.predict(X_all_train)),
             sklearn.metrics.accuracy_score(y_all_test,clf_all.predict(X_all_test)))

In [13]:
result_scores['Logistic_noReg_equal'] = \
            (sklearn.metrics.accuracy_score(y_equal_train,clf_equal.predict(X_equal_train)),
             sklearn.metrics.accuracy_score(y_equal_test,clf_equal.predict(X_equal_test)))

In [14]:
result_scores['Logistic_noReg_all_norm'] = \
            (sklearn.metrics.accuracy_score(y_all_normalized_train,clf_all_normalized.predict(X_all_normalized_train)),
             sklearn.metrics.accuracy_score(y_all_normalized_test,clf_all_normalized.predict(X_all_normalized_test)))

In [15]:
result_scores['Logistic_noReg_equal_norm'] = \
            (sklearn.metrics.accuracy_score(y_equal_normalized_train,clf_equal_normalized.predict(X_equal_normalized_train)),
             sklearn.metrics.accuracy_score(y_equal_normalized_test,clf_equal_normalized.predict(X_equal_normalized_test)))

In [16]:
get_results(result_scores)


Model                            Train    Test
------------------------------------------------
Logistic_noReg_all               0.8574   0.8692
Logistic_noReg_equal             0.5491   0.5345
Logistic_noReg_all_norm          0.854    0.8712
Logistic_noReg_equal_norm        0.6569   0.6069
