# Classification Model

## Contains code for all pre-processing, model construction and model evaluation stages

In [2]:
# Import the libraries this cell needs
import pandas as pd

# Load the prepared CSV file as a pandas DataFrame
friends_data = pd.read_csv('only_friends.csv')
print(friends_data.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
friends_data.info()

# Separate the features (x) from the target (Y)
x = friends_data[['user_followers_count','user_friends_count',
                  'user_favourites_count', 'user_statuses_count',
                  'days_laststatus']]
Y = friends_data['am_following']

# Quick visual check of what goes into the model
print(x.head())
print(Y)

        user_id user_screenname          user_name      user_location  \
0  8.536820e+17       Xxcept1on      Mike_Official                NaN   
1  1.829906e+07      badgermind       Robin Taylor  Stromness, Orkney   
2  2.148446e+09         najarvg            Rajan G            Chicago   
3  3.569359e+08         bolyche  Chris Bolychevsky    London, England   
4  7.812613e+08        amir_hk_              amiir                NaN   

                                            user_bio user_createdat  \
0  Founder  of CallOfTechies\r\n#Entrepreneur #Pr...     04-16-2017   
1  Photographer, pagan explorer of self and natur...     12-22-2008   
2  #cancerinformatics @uccancercenter. Science ge...     10-22-2013   
3  Works at a startup, former neuroscience PhD st...     08-17-2011   
4  #BigData, #DataMining, #DataScience, Combinato...     08-25-2012   

   user_followers_count  user_friends_count  user_favourites_count  \
0                    43                 202                    6

## Steps: 
## 1) Train-Test-Split
## 2) Normalize 
## 3) Minimize imbalance 
## 4) Model (that suits imbalanced data)

In [3]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class proportions of
# the two partitions are not forced to match -- consider whether that is intended.
from sklearn.model_selection import train_test_split

x_train, x_test, Y_train, Y_test = train_test_split(
    x,
    Y,
    test_size=0.3,
    random_state=90,
)

# Report the dimensions of every partition as a sanity check
print('Shape of x_train and x_test:')
for features_part in (x_train, x_test):
    print(features_part.shape)
print('Shape of Y_train and Y_test:')
for target_part in (Y_train, Y_test):
    print(target_part.shape)

Shape of x_train and x_test:
(438, 5)
(189, 5)
Shape of Y_train and Y_test:
(438,)
(189,)


In [4]:
# Min-max normalisation: the scaler is fitted on the training split only and
# that same fitted scaler is then applied to both the training and test splits.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_norm, x_test_norm = (scaler.transform(part) for part in (x_train, x_test))

print("Train:", x_train_norm)
print("Test:", x_test_norm)

Train: [[0.00071016 0.00173439 0.0009947  0.00191569 0.00130208]
 [0.00098574 0.00069798 0.00479469 0.00016048 0.00390625]
 [0.00120833 0.0029823  0.01120996 0.00206614 0.00195312]
 ...
 [0.01132015 0.01552486 0.02252051 0.00637895 0.01627604]
 [0.00059357 0.00130079 0.0019894  0.00122363 0.01692708]
 [0.00051937 0.00483301 0.0023247  0.00070209 0.00130208]]
Test: [[1.56871058e-03 1.94589564e-03 1.12881955e-03 4.76414952e-03
  1.30208333e-03]
 [6.35963750e-05 2.78136170e-03 9.94702372e-04 2.10625558e-04
  6.51041667e-04]
 [4.31713392e-02 4.26722223e-02 3.92069204e-02 1.38812272e-02
  0.00000000e+00]
 [2.64984896e-04 2.01992428e-03 6.09115497e-03 3.88152814e-03
  1.95312500e-03]
 [2.33186708e-04 1.18445822e-03 3.38645864e-03 1.49443848e-03
  6.51041667e-04]
 [4.98171604e-04 1.26906237e-03 5.92350851e-04 3.00893654e-04
  1.30208333e-03]
 [9.00948646e-04 2.98229658e-03 5.91233207e-03 7.76305628e-03
  1.30208333e-03]
 [2.56505379e-03 6.05977284e-03 2.69687283e-02 1.34399166e-03
  3.9062500

In [7]:
# Baseline classification on the normalised (not yet resampled) data:
# first a KNN classifier, then a decision tree classifier.
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# (label, scoring function) pairs reported for each model, in print order
score_table = (
    ('Accuracy: ', metrics.accuracy_score),
    ('Precision: ', metrics.precision_score),
    ('Recall: ', metrics.recall_score),
    ('f1 Score', metrics.f1_score),
)

# K-nearest neighbours with 3 neighbours
knn = KNeighborsClassifier(n_neighbors=3)
knn_op = knn.fit(x_train_norm, Y_train)
Y_pred_knn = knn.predict(x_test_norm)

print('KNN Classifier:')
for score_label, score_fn in score_table:
    print(score_label, score_fn(Y_test, Y_pred_knn))

# Entropy-based decision tree, limited to depth 3 with at least 5 samples per leaf
dt = DecisionTreeClassifier(
    criterion="entropy",
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
)
dt_op = dt.fit(x_train_norm, Y_train)
Y_pred_dt = dt.predict(x_test_norm)

print('\nDecision Tree:')
for score_label, score_fn in score_table:
    print(score_label, score_fn(Y_test, Y_pred_dt))

KNN Classifier:
Accuracy:  0.8994708994708994
Precision:  0.5
Recall:  0.05263157894736842
f1 Score 0.09523809523809525

Decision Tree:
Accuracy:  0.873015873015873
Precision:  0.14285714285714285
Recall:  0.05263157894736842
f1 Score 0.07692307692307693


In [8]:
# Resampling: up-sample the training split with imblearn.over_sampling.SMOTE.
# Only the training data is passed to SMOTE; the test split is not touched here.
print("Before OverSampling, counts of label '1': {}".format(sum(Y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(Y_train==0)))

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=2)
# `fit_sample` is a legacy alias that current imbalanced-learn releases no
# longer provide. Use `fit_resample` when it exists and fall back to the old
# name so the cell runs on both old and new installs.
smote_resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
# `.values` hands SMOTE the labels as a plain 1-D NumPy array (the same data
# `Series.ravel()` produced), so Y_train_res comes back as an array as before.
x_train_res, Y_train_res = smote_resample(x_train_norm, Y_train.values)

print('After OverSampling, the shape of train_X: {}'.format(x_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(Y_train_res.shape))
print("After OverSampling, counts of label '1': {}".format(sum(Y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(Y_train_res==0)))


Before OverSampling, counts of label '1': 28
Before OverSampling, counts of label '0': 410 

After OverSampling, the shape of train_X: (820, 5)
After OverSampling, the shape of train_y: (820,) 

After OverSampling, counts of label '1': 410
After OverSampling, counts of label '0': 410


In [10]:
# Decision tree again, this time trained on the re-sampled training data.
# The fresh estimator `dt1` is the one that is fitted and queried, so the
# baseline tree `dt` keeps the fit it received on the un-resampled data
# instead of being silently overwritten.
dt1= DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=3, min_samples_leaf=5)
dt_op1 = dt1.fit(x_train_res, Y_train_res)
Y_pred_dt1 = dt1.predict(x_test_norm)

# Evaluation is done on the normalised test split
print('\nDecision Tree, after up-sampling:')
print('Accuracy: ', metrics.accuracy_score(Y_test, Y_pred_dt1))
print('Precision: ', metrics.precision_score(Y_test, Y_pred_dt1))
print('Recall: ', metrics.recall_score(Y_test, Y_pred_dt1))
print('f1 Score', metrics.f1_score(Y_test, Y_pred_dt1))

# Trying XGB (default hyper-parameters) on the up-sampled data
from xgboost import XGBClassifier
XGB = XGBClassifier()
XGB.fit(x_train_res,Y_train_res)
Y_pred_XGB = XGB.predict(x_test_norm)

print('\nXGB Classifier after upsampling:')
print("\nAccuracy: {} ".format(metrics.accuracy_score(Y_test, Y_pred_XGB)))
print("f1 Score : {}".format(metrics.f1_score(Y_test, Y_pred_XGB)))
print("Precision : {}".format(metrics.precision_score(Y_test, Y_pred_XGB)))
print("Recall : {}".format(metrics.recall_score(Y_test, Y_pred_XGB)))


Decision Tree, after up-sampling:
Accuracy:  0.6931216931216931
Precision:  0.14545454545454545
Recall:  0.42105263157894735
f1 Score 0.2162162162162162

XGB Classifier after upsampling:

Accuracy: 0.8095238095238095 
f1 Score : 0.25
Precision : 0.20689655172413793
Recall : 0.3157894736842105


  if diff:


In [11]:
# Text report of the per-class metrics for the XGB predictions on the test split
xgb_report = metrics.classification_report(Y_test, Y_pred_XGB)
print(xgb_report)

             precision    recall  f1-score   support

          0       0.92      0.86      0.89       170
          1       0.21      0.32      0.25        19

avg / total       0.85      0.81      0.83       189

