## Importing the required libraries and packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [56]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,recall_score, precision_score,f1_score 
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


In [57]:
# Reading the CSV files
broadband_data = pd.read_csv('broadband_data.csv')
print(type(broadband_data))

outage_data = pd.read_csv('outage_data.csv')
print(type(outage_data))

report_data = pd.read_csv('report_data.csv')
print(type(report_data))

server_data = pd.read_csv('server_data.csv')
print(type(server_data))

sample_submission = pd.read_csv('sample_submission.csv')
print(type(sample_submission))

test_data = pd.read_csv('student_test.csv')
print(type(test_data))

train_data = pd.read_csv('train_data.csv')
print(type(train_data))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [58]:
train_data

Unnamed: 0,id,area_code,outage_duration
0,13366,area_415,1
1,6783,area_474,0
2,9519,area_931,1
3,10202,area_700,1
4,4555,area_600,2
...,...,...,...
5899,1910,area_403,1
5900,10475,area_821,0
5901,10675,area_798,2
5902,14714,area_210,0


In [59]:
outage_duration=train_data[["id","outage_duration"]]

In [60]:
train_data.drop("outage_duration",axis=1,inplace=True)

In [61]:
train_data

Unnamed: 0,id,area_code
0,13366,area_415
1,6783,area_474
2,9519,area_931
3,10202,area_700
4,4555,area_600
...,...,...
5899,1910,area_403
5900,10475,area_821
5901,10675,area_798
5902,14714,area_210


In [62]:
test_data

Unnamed: 0,id,area_code
0,3340,area_344
1,14067,area_933
2,1134,area_16
3,27,area_793
4,9793,area_344
...,...,...
1472,7975,area_124
1473,10664,area_821
1474,7753,area_476
1475,989,area_38


In [63]:
main_data=pd.concat([train_data,test_data],axis=0)

In [64]:
main_data

Unnamed: 0,id,area_code
0,13366,area_415
1,6783,area_474
2,9519,area_931
3,10202,area_700
4,4555,area_600
...,...,...
1472,7975,area_124
1473,10664,area_821
1474,7753,area_476
1475,989,area_38


In [65]:
data=pd.merge(main_data,broadband_data,how="left")
data=pd.merge(data,outage_data,how="left")
data=pd.merge(data,report_data,how="left")
data=pd.merge(data,server_data,how="left")

In [66]:
data

Unnamed: 0,id,area_code,broadband_type,outage_type,log_report_type,volume,transit_server_type
0,13366,area_415,broadband_type_2,outage_type_4,log_report_type_312,1,transit_server_type_35
1,6783,area_474,broadband_type_2,outage_type_2,log_report_type_312,2,transit_server_type_35
2,6783,area_474,broadband_type_2,outage_type_2,log_report_type_312,2,transit_server_type_34
3,6783,area_474,broadband_type_2,outage_type_2,log_report_type_233,1,transit_server_type_35
4,6783,area_474,broadband_type_2,outage_type_2,log_report_type_233,1,transit_server_type_34
...,...,...,...,...,...,...,...
61834,989,area_38,broadband_type_3,outage_type_4,log_report_type_153,1,transit_server_type_10
61835,989,area_38,broadband_type_3,outage_type_4,log_report_type_154,1,transit_server_type_27
61836,989,area_38,broadband_type_3,outage_type_4,log_report_type_154,1,transit_server_type_10
61837,3129,area_1067,broadband_type_8,outage_type_1,log_report_type_54,8,transit_server_type_11


In [13]:
for col in ['broadband_type', 'outage_type', 'log_report_type', 'transit_server_type', 'area_code']:
    data[col] = data[col].astype('category')

In [67]:
cat_attr = data.select_dtypes('float').columns
cat_attr

Index([], dtype='object')

In [68]:
num_attr=data.select_dtypes('int64').columns
num_attr

Index(['id', 'volume'], dtype='object')

In [69]:
fe_broaband = data.groupby('broadband_type').size()/len(data)
data.loc[:, 'broadband_type'] = data['broadband_type'].map(fe_broaband)

In [70]:
fe_outage = data.groupby('outage_type').size()/len(data)
data.loc[:, 'outage_type'] = data['outage_type'].map(fe_outage)

In [71]:
fe_log = data.groupby('log_report_type').size()/len(data)
data.loc[:, 'log_report_type'] = data['log_report_type'].map(fe_log)

In [72]:
fe_transit = data.groupby('transit_server_type').size()/len(data)
data.loc[:, 'transit_server_type'] = data['transit_server_type'].map(fe_transit)

In [73]:
fe_area = data.groupby('area_code').size()/len(data)
data.loc[:, 'area_code'] = data['area_code'].map(fe_area)

In [74]:
data

Unnamed: 0,id,area_code,broadband_type,outage_type,log_report_type,volume,transit_server_type
0,13366,0.000825,0.513365,0.014877,0.073319,1,0.193874
1,6783,0.002523,0.513365,0.392309,0.073319,2,0.193874
2,6783,0.002523,0.513365,0.392309,0.073319,2,0.184188
3,6783,0.002523,0.513365,0.392309,0.027248,1,0.193874
4,6783,0.002523,0.513365,0.392309,0.027248,1,0.184188
...,...,...,...,...,...,...,...
61834,989,0.000517,0.011708,0.014877,0.001714,1,0.009266
61835,989,0.000517,0.011708,0.014877,0.002749,1,0.003218
61836,989,0.000517,0.011708,0.014877,0.002749,1,0.009266
61837,3129,0.000598,0.323501,0.591391,0.028138,8,0.212245


In [75]:
X_train=data.set_index("id").loc[train_data["id"]]

In [76]:
X_train=X_train.reset_index("id")

In [77]:
train=pd.merge(X_train,outage_duration)

In [78]:
X_train=train.drop(["id","outage_duration"],axis=1)

In [79]:
y_train=train["outage_duration"]

In [80]:
y_train.shape

(48973,)

In [81]:
X_test=data.set_index("id").loc[test_data["id"]]

In [82]:
X_test=X_test.reset_index("id").drop_duplicates("id")

In [83]:
X_test.drop("id",axis=1,inplace=True)

In [84]:
num_attr="volume"

In [85]:
from sklearn.preprocessing import StandardScaler

In [86]:
scaler = StandardScaler()
scaler.fit(X_train[[num_attr]])

StandardScaler()

In [87]:
scaler.transform(X_train[[num_attr]])

array([[-0.32381984],
       [-0.28124413],
       [-0.28124413],
       ...,
       [-0.23866843],
       [-0.11094131],
       [-0.11094131]])

In [88]:
X_train[num_attr] = scaler.transform(X_train[[num_attr]])
X_test[num_attr] = scaler.transform(X_test[[num_attr]])

In [89]:
X_train

Unnamed: 0,area_code,broadband_type,outage_type,log_report_type,volume,transit_server_type
0,0.000825,0.513365,0.014877,0.073319,-0.323820,0.193874
1,0.002523,0.513365,0.392309,0.073319,-0.281244,0.193874
2,0.002523,0.513365,0.392309,0.073319,-0.281244,0.184188
3,0.002523,0.513365,0.392309,0.027248,-0.323820,0.193874
4,0.002523,0.513365,0.392309,0.027248,-0.323820,0.184188
...,...,...,...,...,...,...
48968,0.000194,0.513365,0.392309,0.067918,-0.281244,0.184188
48969,0.000194,0.513365,0.392309,0.073319,-0.238668,0.193874
48970,0.000194,0.513365,0.392309,0.073319,-0.238668,0.184188
48971,0.001003,0.323501,0.392309,0.012096,-0.110941,0.088536


In [90]:
X_test

Unnamed: 0,area_code,broadband_type,outage_type,log_report_type,volume,transit_server_type
0,0.000938,0.513365,0.014877,0.073319,-0.281244,0.193874
1,0.000016,0.513365,0.591391,0.032067,-0.323820,0.055046
2,0.000356,0.323501,0.591391,0.000388,0.059362,0.212245
4,0.001941,0.323501,0.591391,0.005805,-0.238668,0.212245
6,0.000938,0.513365,0.392309,0.073319,0.144513,0.184188
...,...,...,...,...,...,...
12839,0.012678,0.513365,0.392309,0.073319,-0.323820,0.184188
12847,0.019244,0.323501,0.591391,0.016866,-0.323820,0.212245
12848,0.008344,0.513365,0.392309,0.013002,-0.323820,0.193874
12852,0.000517,0.013115,0.014877,0.001746,-0.281244,0.003218


## Final Model

## 1. AdaBoost Classifier

In [137]:
# Create a AdaBoost classifier object
clf_ada = AdaBoostClassifier(DecisionTreeClassifier(criterion="gini", class_weight='balanced'), n_estimators=100, learning_rate = 0.4, random_state=123)

In [138]:
# Train the classifier
clf_ada.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced'),
                   learning_rate=0.4, n_estimators=100, random_state=123)

In [139]:
# Predict
y_pred_train = clf_ada.predict(X_train)
y_pred_test = clf_ada.predict(X_test)

In [110]:
y_pred_test.shape

(1477,)

In [111]:
submission_data = test_data.drop("area_code",axis=1)

In [112]:
submission_data["outage_duration"]=y_pred_test

In [113]:
submission_data

Unnamed: 0,id,outage_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0
...,...,...
1472,7975,0
1473,10664,2
1474,7753,0
1475,989,0


In [144]:
submission_data.to_csv("my_submission_3.csv",index=False)

## 2. KNN with Grid Search CV

In [38]:
# Create knn classifer object
clf_knn = KNeighborsClassifier()

In [39]:
param_grid = {
                'n_neighbors': range(1,51),
                'weights' : ['uniform', 'distance'],
                'metric':["euclidean", "manhattan"]
            }

In [40]:
# GridSearchCV
clf_knn_grid = GridSearchCV(clf_knn, param_grid, cv=5, scoring='f1_macro', return_train_score=False)

In [41]:
# Train the classifier
%time clf_knn_grid.fit(X_train, y_train)

Wall time: 25min 34s


GridSearchCV(cv=10, estimator=KNeighborsClassifier(n_neighbors=10),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': range(1, 51),
                         'weights': ['uniform', 'distance']},
             scoring='f1_macro')

In [42]:
# Find best model
clf_knn_grid_model = clf_knn_grid.best_estimator_
print(clf_knn_grid_model)
print (clf_knn_grid.best_score_, clf_knn_grid.best_params_) 

KNeighborsClassifier(metric='manhattan', n_neighbors=10, weights='distance')
0.5234250795820741 {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}


In [43]:
# Predict
y_pred_train = clf_knn_grid_model.predict(X_train)
y_pred_test = clf_knn_grid_model.predict(X_test)

In [44]:
y_pred_test.shape

(1477,)

In [45]:
submission_data = test_data.drop("area_code",axis=1)

In [46]:
submission_data["outage_duration"]=y_pred_test

In [49]:
submission_data

Unnamed: 0,id,outage_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0
...,...,...
1472,7975,0
1473,10664,2
1474,7753,0
1475,989,0


In [48]:
submission_data.to_csv("my_submission_4.csv",index=False)

## 3. AdaBoost with Grid Search CV

In [48]:
param_grid = {'n_estimators' : [25, 50, 100],
              'learning_rate' : [0.2, 0.3, 0.4]}

In [49]:
# GridSearchCV
clf_grid_ada = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier()), param_grid, scoring='f1_macro',n_jobs=-1)

In [None]:
# Train the classifier
%time clf_grid_ada.fit(X_train, y_train)

In [41]:
# Find best model
best_ada_model = clf_grid_ada.best_estimator_
print(best_ada_model)
print (clf_grid_ada.best_score_, clf_grid_ada.best_params_) 

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=0.4,
                   n_estimators=100)
0.5303837677047365 {'learning_rate': 0.4, 'n_estimators': 100}


In [42]:
# Predict
y_pred_train = best_ada_model.predict(X_train)
y_pred_test = best_ada_model.predict(X_test)

In [43]:
y_pred_test.shape

(1477,)

In [44]:
submission_data = test_data.drop("area_code",axis=1)

In [45]:
submission_data["outage_duration"]=y_pred_test

In [46]:
submission_data

Unnamed: 0,id,outage_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0
...,...,...
1472,7975,0
1473,10664,2
1474,7753,0
1475,989,0


In [47]:
submission_data.to_csv("my_submission_5.csv",index=False)

## 5. XGBoost with Grid Search CV

In [91]:
clf_XGB_grid = XGBClassifier()

In [92]:
# Use a grid over parameters of interest
param_grid = {
     'colsample_bytree': np.linspace(0.6, 0.8, 2),
     'n_estimators':[100, 200],
     'max_depth': [3, 4],
     'gamma': [0.2,0.3,0.4],
     'learning_rate': [0.001, 0.01, 0.1, 1, 10],
     'scale_pos_weight':[4]
}

In [93]:
# GridSearchCV
CV_XGB_grid = GridSearchCV(estimator=clf_XGB_grid, param_grid=param_grid, n_jobs=-1, scoring='f1_macro')

In [94]:
# Train the classifier
%time CV_XGB_grid.fit(X = X_train, y=y_train)

Parameters: { "scale_pos_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 18min 57s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [95]:
# Find best model
CV_XGB_grid_model = CV_XGB_grid.best_estimator_
print(CV_XGB_grid_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=4, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [96]:
# Predict
y_pred_train=CV_XGB_grid_model.predict(X_train)
y_pred_test=CV_XGB_grid_model.predict(X_test)

In [97]:
y_pred_test.shape

(1477,)

In [98]:
submission_data = test_data.drop("area_code",axis=1)

In [99]:
submission_data["outage_duration"]=y_pred_test

In [100]:
submission_data

Unnamed: 0,id,outage_duration
0,3340,0
1,14067,0
2,1134,2
3,27,0
4,9793,0
...,...,...
1472,7975,0
1473,10664,1
1474,7753,0
1475,989,0


In [102]:
submission_data.to_csv("my_submission_6.csv",index=False)