In [12]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): the CSV reads later in this notebook use bare relative file
# names, not paths under /content/gdrive — confirm where the data files live.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Importing Libraries

In [1]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix 

import os
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Algorithms
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Importing the dataset

In [2]:
# Read the training features, training labels and test features (in that
# order) from CSV files located in the current working directory.
df_trainX, df_trainY, df_testX = (
    pd.read_csv(csv_name)
    for csv_name in ("Train_Data_X.csv", "Train_Data_y.csv", "Test_Data_X.csv")
)

# Data Preprocessing

In [3]:
# Report the (rows, columns) shape of each loaded frame.
for template, frame in (
    ("\nThe train data X size is : {} ", df_trainX),
    ("\nThe train data Y size is : {} ", df_trainY),
    ("The test data X size is : {} ", df_testX),
):
    print(template.format(frame.shape))


The train data X size is : (2711, 23) 

The train data Y size is : (2711, 2) 
The test data X size is : (1163, 23) 


In [4]:
# Print the column dtypes and non-null counts of each frame, separated by a
# 25-character dashed rule.
separator = '-' * 25
df_trainX.info()
print(separator)
df_trainY.info()
print(separator)
df_testX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2711 entries, 0 to 2710
Data columns (total 23 columns):
Unnamed: 0             2711 non-null int64
X_follower_count       2711 non-null int64
X_following_count      2711 non-null int64
X_listed_count         2711 non-null int64
X_mentions_received    2711 non-null float64
X_retweets_received    2711 non-null float64
X_mentions_sent        2711 non-null float64
X_retweets_sent        2711 non-null float64
X_posts                2711 non-null float64
X_network_feature_1    2711 non-null int64
X_network_feature_2    2711 non-null float64
X_network_feature_3    2711 non-null float64
Y_follower_count       2711 non-null int64
Y_following_count      2711 non-null int64
Y_listed_count         2711 non-null int64
Y_mentions_received    2711 non-null float64
Y_retweets_received    2711 non-null float64
Y_mentions_sent        2711 non-null float64
Y_retweets_sent        2711 non-null float64
Y_posts                2711 non-null float64
Y_network

**What we observe from this:**
There are no missing values: each column's non-null count equals the number of entries in its frame.

# Data handling

In [5]:
# Remove the 'Unnamed: 0' column from all three frames.
df_trainX, df_trainY, df_testX = (
    frame.drop(columns=['Unnamed: 0'])
    for frame in (df_trainX, df_trainY, df_testX)
)

In [6]:
# Add ratio features for account X to both the train and test frames.
# Each entry is (new column, numerator column, denominator column).
# NOTE(review): nothing here guards against a zero denominator, which pandas
# turns into inf/NaN rather than raising.
x_ratio_specs = (
    ('follow_rateX', 'X_following_count', 'X_follower_count'),
    ('mentions_rateX', 'X_mentions_received', 'X_mentions_sent'),
    ('retweets_rateX', 'X_retweets_received', 'X_retweets_sent'),
)
for frame in (df_trainX, df_testX):
    for ratio_name, numerator, denominator in x_ratio_specs:
        frame[ratio_name] = frame[numerator] / frame[denominator]


In [7]:
# Add ratio features for account Y to both the train and test frames.
# Each entry is (new column, numerator column, denominator column).
# NOTE(review): nothing here guards against a zero denominator, which pandas
# turns into inf/NaN rather than raising.
y_ratio_specs = (
    ('follow_rateY', 'Y_following_count', 'Y_follower_count'),
    ('mentions_rateY', 'Y_mentions_received', 'Y_mentions_sent'),
    ('retweets_rateY', 'Y_retweets_received', 'Y_retweets_sent'),
)
for frame in (df_trainX, df_testX):
    for ratio_name, numerator, denominator in y_ratio_specs:
        frame[ratio_name] = frame[numerator] / frame[denominator]


In [8]:
# Preview the first rows of the training features, including the new ratio columns.
df_trainX.head()

Unnamed: 0,X_follower_count,X_following_count,X_listed_count,X_mentions_received,X_retweets_received,X_mentions_sent,X_retweets_sent,X_posts,X_network_feature_1,X_network_feature_2,...,Y_posts,Y_network_feature_1,Y_network_feature_2,Y_network_feature_3,follow_rateX,mentions_rateX,retweets_rateX,follow_rateY,mentions_rateY,retweets_rateY
0,4981,2026,170,57.856207,18.718991,3.52385,1.630532,5.137533,118,65.990909,...,1.635264,10,20.0,30.0,0.406746,16.418466,11.480295,0.828903,0.094898,0.293924
1,45639,902,2661,297.708558,73.997768,55.763536,6.666984,47.722563,561,147.404293,...,9.732542,2607,41.289101,1967.639388,0.019764,5.338768,11.099137,0.000259,109.900479,35.863433
2,1259726,377,6129,1817.298643,422.667342,24.043562,15.131286,25.584763,3376,30.048666,...,0.201007,10,20.0,30.0,0.000299,75.583585,27.933339,1.475362,1.0,1.0
3,33719,29367,903,256.829708,47.839261,13.11251,0.679044,24.584411,238,92.779221,...,1.645168,14,20.0,98.5,0.870933,19.586617,70.450871,1.159204,1.262177,1.074209
4,1834057,813,30642,3348.533818,776.852854,58.663704,14.616587,35.319419,6271,73.226014,...,7.547107,39,99.206897,2876.793103,0.000443,57.080164,53.148718,0.508216,1.493745,1.069563


In [9]:
# One-hot encode the target: 'X_social_than_Y?' is replaced by one indicator
# column per distinct value, named 'X_social_than_Y?_<value>'.
df_trainY = pd.get_dummies(df_trainY, columns=['X_social_than_Y?'])

In [10]:
# Discard the indicator column for class 0; the remaining columns stay as the target frame.
df_trainY = df_trainY.drop(columns=['X_social_than_Y?_0'])

In [11]:
# Preview the encoded target frame.
df_trainY.head()

Unnamed: 0,X_social_than_Y?_1
0,0
1,0
2,1
3,1
4,1


In [12]:
# Standardise the features: the scaler's statistics are learned from the
# training frame only and then applied unchanged to both train and test.
# Note that both names are rebound to the scaler's transformed output.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(df_trainX)
df_trainX = sc_X.transform(df_trainX)
df_testX = sc_X.transform(df_testX)

In [13]:
def timer(start_time=None):
    """Two-phase stopwatch helper.

    Called with no (or a falsy) ``start_time`` it returns the current
    ``datetime`` so the caller can keep it as the starting mark.  Called with
    that mark, it prints the time elapsed since then as hours, minutes and
    seconds (seconds rounded to two decimals) and returns ``None``.
    """
    # Phase 1: no mark supplied yet, so hand back "now" as the mark.
    if not start_time:
        return datetime.now()

    # Phase 2: break the elapsed seconds down into h / m / s and report them.
    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))


In [14]:
# Candidate values for each XGBoost hyperparameter explored by the search.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [15]:
# Base classifier handed to the hyperparameter search; these settings stay
# fixed while the search varies the values listed in `params`.
# NOTE(review): `silent` and `nthread` are legacy keyword names — confirm they
# are still accepted by the installed xgboost version.
xgb_fixed_settings = {
    'learning_rate': 0.02,
    'n_estimators': 600,
    'objective': 'binary:logistic',
    'silent': True,
    'nthread': 1,
}
xgb = XGBClassifier(**xgb_fixed_settings)

In [16]:
# Search budget: `folds` stratified CV folds for each of `param_comb`
# randomly sampled hyperparameter combinations.
folds = 10
param_comb = 10

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# The target frame holds a single indicator column; flatten it to a 1-D label
# vector instead of relying on an implicit column-vector conversion (whose
# warning is hidden by the global warnings filter).
y_train = np.asarray(df_trainY).ravel()

# Pass the splitter object itself rather than `skf.split(...)`: the latter is
# a one-shot generator, so the search object could not be re-fit once it has
# been consumed.  The splitter produces the same stratified folds from the
# `y` given to `fit`.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

start_time = timer(None)  # take the starting mark
random_search.fit(df_trainX, y_train)
timer(start_time)  # print the elapsed wall-clock time

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   45.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  2.6min finished



 Time taken: 0 hours 2 minutes and 39.52 seconds.


In [17]:
# Print each heading followed by its value, in a fixed order.  The "normalized
# gini" figure is derived from the best CV score as 2 * score - 1.
gini_heading = '\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb)
search_report = [
    ('\n All results:', random_search.cv_results_),
    ('\n Best estimator:', random_search.best_estimator_),
    (gini_heading, random_search.best_score_ * 2 - 1),
    ('\n Best hyperparameters:', random_search.best_params_),
]
for heading, value in search_report:
    print(heading)
    print(value)

# Persist the full per-candidate CV table for later inspection.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)



 All results:
{'mean_fit_time': array([ 5.90536511,  7.40701129,  7.0476368 ,  5.46484187,  6.97346287,
        4.67855167,  5.60468462,  5.17017438,  5.99307966,  5.16322429]), 'std_fit_time': array([ 0.59544726,  0.07782866,  0.06416983,  0.04935961,  0.0626642 ,
        0.0382237 ,  0.05214254,  0.24095729,  0.02897615,  0.1779636 ]), 'mean_score_time': array([ 0.01924574,  0.02084537,  0.01975102,  0.01236718,  0.01655927,
        0.01562419,  0.01213357,  0.01119342,  0.01767225,  0.01667531]), 'std_score_time': array([ 0.01536323,  0.00094086,  0.0010725 ,  0.00066222,  0.00091071,
        0.0007718 ,  0.00137846,  0.00303001,  0.00423375,  0.00180925]), 'param_subsample': masked_array(data = [1.0 0.6 0.8 1.0 0.8 1.0 1.0 0.8 0.8 0.8],
             mask = [False False False False False False False False False False],
       fill_value = ?)
, 'param_min_child_weight': masked_array(data = [5 1 5 5 1 10 1 1 1 1],
             mask = [False False False False False False False False F

In [18]:
# Predict class labels for the scaled test features via the fitted search object.
# NOTE(review): this yields hard labels, whereas the search was scored with
# ROC AUC — confirm labels (not probabilities) are what the submission needs.
df_testY = random_search.predict(df_testX)

In [19]:
# Wrap the test predictions in a single-column frame and write it to CSV
# without the index.
submission_columns = {'X_social_than_Y?': df_testY}
results_df = pd.DataFrame(submission_columns)
results_df.to_csv('Test_Data_y.csv', index=False)