# Load Datasets

In [186]:
import numpy as np
import pandas as pd
import os
import math

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [187]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection 

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [188]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [189]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test
#X_train = X
#y_train = y
zeros = []
ones = []
for a in y_train:
  if a == 0:
    zeros.append(a)
  else:
    ones.append(1)
print(len(zeros), len(ones), len(ones)+ len(zeros), y_train.shape)

21377 8146 29523 (29523,)


In [190]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  #data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  data["TimeCos"] = data["Time"].apply(lambda x: math.cos(getTimeInMinutes(x)*math.pi/(12*60)))
  data["TimeSin"] = data["Time"].apply(lambda x: math.sin(getTimeInMinutes(x)*math.pi/(12*60)))
  del data["Time"]

  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  print(data.head())
  return data


In [191]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()


X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

       CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
14974        19      22.837       2.728  ...           12 -0.991445  0.130526
5318         27       7.377       1.011  ...            7 -0.442289  0.896873
31338        20       0.101       0.808  ...            4  0.442289  0.896873
22493        24       2.425       9.701  ...           10 -0.831470  0.555570
27949        15      22.938       3.335  ...           12 -0.923880  0.382683

[5 rows x 14 columns]
       CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
11936        17      0.2020      0.5050  ...            5 -0.555570  0.831470
23788        15      4.0413      1.2222  ...            7  0.500000  0.866025
29595         0      0.2020      0.8080  ...            3 -0.130526  0.991445
29759         9      3.3674      0.5049  ...            6  0.831470  0.555570
33580        16     12.4290      1.1120  ...            7  0.321439 -0.946930

[5 rows x 14 columns]
   CellName  PRBUs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

# Solving the Classification Problem

In [192]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from joblib import dump, load
from google.colab import files
from scipy.stats import uniform, randint

#if performVoting == 'y':
  
try:
  clf_GS = load('anomaly-4G-detection/voting_model.joblib') 

except:
  voting_clf = VotingClassifier( estimators=[ 
      ('xgb', XGBClassifier(random_state=1)),
      ('dt', DecisionTreeClassifier(random_state=1)),
      ('knn', KNeighborsClassifier())
      ], voting='soft')

  def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
    return 'f1',f1_score(t,y_bin)

  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                         ('voting_clf', XGBClassifier(random_state=1,
                                                      scale_pos_weight=7,
                                                      colsample_bytree= 0.053381469489678104,
                                                      eta= 0.20289460663803338,
                                                      gamma= 0.88723107873764,
                                                      learning_rate= 0.15455380920536027,
                                                      max_depth= 26,
                                                      min_child_weight= 1,
                                                      n_estimators= 565,
                                                      subsample= 0.9738168894035317))])
  
  parameters = {
  # 'voting_clf__eta'    : uniform(0.2, 0.35),
  # "voting_clf__colsample_bytree": uniform(0.05, 0.2),
  # "voting_clf__min_child_weight": randint(1, 5),
  # "voting_clf__gamma": uniform(0.35, 0.6),
  # "voting_clf__learning_rate": uniform(0.1, 0.3), # default 0.1 
  # "voting_clf__max_depth": randint(10, 30), # default 3
  # "voting_clf__n_estimators": randint(500, 1000), # default 100
  # "voting_clf__subsample": uniform(0.6, 0.99)
  }

  # grid_params = {
  # "voting_clf__colsample_bytree": [0.1, 0.2],
  # "voting_clf__min_child_weight": [1],
  # "voting_clf__gamma": [0.35, 0.4, 0.5],
  # "voting_clf__learning_rate": [0.15, 0.2, 0.3], # default 0.1 
  # "voting_clf__max_depth": [15, 19, 21], # default 3
  # "voting_clf__n_estimators": [600, 700, 800], # default 100
  # "voting_clf__subsample": [0.932]
  # }

  # parameters = {'voting_clf__colsample_bytree': 0.053381469489678104,
  # 'voting_clf__eta': 0.20289460663803338,
  # 'voting_clf__gamma': 0.88723107873764,
  # 'voting_clf__learning_rate': 0.15455380920536027,
  # 'voting_clf__max_depth': 26,
  # 'voting_clf__min_child_weight': 1,
  # 'voting_clf__n_estimators': 565,
  # 'voting_clf__subsample': 0.9738168894035317}

  #clf_GS = GridSearchCV(estimator=pipe, param_grid=grid_params, n_jobs=10, verbose=1, cv=3 )
  clf_GS = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3, n_iter= 1)
  clf_GS.fit(X_train, y_train)

  #Save the model in a file and download locally.
  dump(clf_GS, 'voting_model.joblib')
  files.download('voting_model.joblib') 

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   26.1s finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [193]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [194]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21377
           1       1.00      1.00      1.00      8146

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5344
           1       1.00      0.99      0.99      2037

    accuracy                           1.00      7381
   macro avg       1.00      0.99      1.00      7381
weighted avg       1.00      1.00      1.00      7381



In [195]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.003658 
test confusion matrix:
[[5340    4]
 [  23 2014]]


In [196]:
from sklearn.feature_selection import SelectFromModel
#print(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
#print(clf_GS.best_estimator_.named_steps["voting_clf"].get_booster().get_fscore())

thresholds = np.sort(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
print(thresholds)
i = 0
for thresh in thresholds:
  new_X_train = X_train
  selection = SelectFromModel(clf_GS.best_estimator_.named_steps["voting_clf"], threshold=thresh, prefit=True)
  select_X_train = selection.transform(new_X_train)

  selection_model = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3, n_iter= 1)
  selection_model.fit(select_X_train, y_train)
  #dump(clf_GS, '%dselected_feature_model.joblib' % i)
  #files.download('%dselected_feature_model.joblib' % i) 
  pred_train = selection_model.predict(select_X_train)
  print(X_validation.shape, X_train.shape)
  select_X_val = selection.transform(X_validation)
  pred_val = selection_model.predict(select_X_val)
  
  i = i + 1

  train_error = 1. - accuracy_score(y_train, pred_train)
  train_cmat = confusion_matrix(y_train, pred_train)
  val_error = 1. - accuracy_score(y_validation, pred_val)
  val_cmat = confusion_matrix(y_validation, pred_val)

  print("Threshold: %f \n" % thresh)
  print('train error: %f ' % train_error)
  print('train confusion matrix:')
  print(train_cmat)
  print('test error: %f ' % val_error)
  print('test confusion matrix:')
  print(val_cmat)

[0.0051622  0.00552674 0.00815703 0.01028607 0.01257942 0.0125976
 0.03616058 0.04108327 0.0518401  0.05857509 0.07333045 0.09147334
 0.13227946 0.46094868]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   24.0s finished


(7381, 14) (29523, 14)
Threshold: 0.005162 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.003658 
test confusion matrix:
[[5340    4]
 [  23 2014]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   23.5s finished


(7381, 14) (29523, 14)
Threshold: 0.005527 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.003794 
test confusion matrix:
[[5339    5]
 [  23 2014]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   21.4s finished


(7381, 14) (29523, 14)
Threshold: 0.008157 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.003387 
test confusion matrix:
[[5339    5]
 [  20 2017]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   21.5s finished


(7381, 14) (29523, 14)
Threshold: 0.010286 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.003387 
test confusion matrix:
[[5339    5]
 [  20 2017]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   20.3s finished


(7381, 14) (29523, 14)
Threshold: 0.012579 

train error: 0.000034 
train confusion matrix:
[[21376     1]
 [    0  8146]]
test error: 0.002845 
test confusion matrix:
[[5340    4]
 [  17 2020]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   20.7s finished


(7381, 14) (29523, 14)
Threshold: 0.012598 

train error: 0.000034 
train confusion matrix:
[[21376     1]
 [    0  8146]]
test error: 0.002845 
test confusion matrix:
[[5340    4]
 [  17 2020]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   20.5s finished


(7381, 14) (29523, 14)
Threshold: 0.036161 

train error: 0.000034 
train confusion matrix:
[[21376     1]
 [    0  8146]]
test error: 0.002981 
test confusion matrix:
[[5338    6]
 [  16 2021]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   21.9s finished


(7381, 14) (29523, 14)
Threshold: 0.041083 

train error: 0.000068 
train confusion matrix:
[[21375     2]
 [    0  8146]]
test error: 0.003387 
test confusion matrix:
[[5339    5]
 [  20 2017]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   24.0s finished


(7381, 14) (29523, 14)
Threshold: 0.051840 

train error: 0.000034 
train confusion matrix:
[[21376     1]
 [    0  8146]]
test error: 0.003252 
test confusion matrix:
[[5338    6]
 [  18 2019]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   23.1s finished


(7381, 14) (29523, 14)
Threshold: 0.058575 

train error: 0.000102 
train confusion matrix:
[[21374     3]
 [    0  8146]]
test error: 0.002574 
test confusion matrix:
[[5338    6]
 [  13 2024]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   22.2s finished


(7381, 14) (29523, 14)
Threshold: 0.073330 

train error: 0.000068 
train confusion matrix:
[[21375     2]
 [    0  8146]]
test error: 0.002439 
test confusion matrix:
[[5340    4]
 [  14 2023]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   21.2s finished


(7381, 14) (29523, 14)
Threshold: 0.091473 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.002439 
test confusion matrix:
[[5340    4]
 [  14 2023]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   17.8s finished


(7381, 14) (29523, 14)
Threshold: 0.132279 

train error: 0.012228 
train confusion matrix:
[[21365    12]
 [  349  7797]]
test error: 0.015987 
test confusion matrix:
[[5336    8]
 [ 110 1927]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   21.9s finished


(7381, 14) (29523, 14)
Threshold: 0.460949 

train error: 0.017647 
train confusion matrix:
[[21201   176]
 [  345  7801]]
test error: 0.020051 
test confusion matrix:
[[5303   41]
 [ 107 1930]]


In [202]:
from sklearn.feature_selection import SelectFromModel
#print(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
#print(clf_GS.best_estimator_.named_steps["voting_clf"].get_booster().get_fscore())

thresholds = np.sort(clf_GS.best_estimator_.named_steps["voting_clf"].feature_importances_)
print(thresholds)
i = 0
new_X_train = X_train
selection = SelectFromModel(clf_GS.best_estimator_.named_steps["voting_clf"], threshold=0.091473 , prefit=True)
select_X_train = selection.transform(new_X_train)

selection_model = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3, n_iter= 1)
selection_model.fit(select_X_train, y_train)
dump(clf_GS, '%dselected_feature_model.joblib' % i)
files.download('%dselected_feature_model.joblib' % i) 
pred_train = selection_model.predict(select_X_train)
select_X_val = selection.transform(X_validation)
pred_val = selection_model.predict(select_X_val)

i = i + 1

train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print("Threshold: %f \n" % thresh)
print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

[0.0051622  0.00552674 0.00815703 0.01028607 0.01257942 0.0125976
 0.03616058 0.04108327 0.0518401  0.05857509 0.07333045 0.09147334
 0.13227946 0.46094868]
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:   24.2s finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Threshold: 0.460949 

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.002439 
test confusion matrix:
[[5340    4]
 [  14 2023]]


In [203]:
#pred_test = clf_GS.predict(X_test)
select_X_test = selection.transform(X_test)
pred_test = selection_model.predict(select_X_test)
#files.download('anomaly-4G-detection/predictions.csv')

# Submission Formatting

In [204]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [205]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [206]:
#!rm anomaly-4G-detection/predictions.csv

In [207]:
clf_GS.best_params_
#print(clf_GS.grid_scores_)

{}