# Load Datasets

In [21]:
import numpy as np
import pandas as pd
import os
import math

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [22]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection 

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [23]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [24]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test
#X_train = X
#y_train = y
zeros = []
ones = []
for a in y_train:
  if a == 0:
    zeros.append(a)
  else:
    ones.append(1)
print(len(zeros), len(ones), len(ones)+ len(zeros), y_train.shape)

26721 10183 36904 (36904,)


In [25]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  #data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  data["TimeCos"] = data["Time"].apply(lambda x: math.cos(getTimeInMinutes(x)*math.pi/(12*60)))
  data["TimeSin"] = data["Time"].apply(lambda x: math.sin(getTimeInMinutes(x)*math.pi/(12*60)))
  del data["Time"]

  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  print(data.head())
  return data


In [26]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()


X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

   CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
0        20     12.3848      1.4019  ...            8 -0.946930  0.321439
1         1     22.0438      2.0016  ...           11 -0.831470  0.555570
2         0      0.5105      0.4258  ...            3 -0.442289  0.896873
3         4      1.9963      1.1513  ...            5  0.751840  0.659346
4         5      0.3030      0.4040  ...            3  0.608761  0.793353

[5 rows x 14 columns]
       CellName  PRBUsageUL  PRBUsageDL  ...  maxUE_UL+DL   TimeCos   TimeSin
11936         8      0.2020      0.5050  ...            5 -0.555570  0.831470
23788        26      4.0413      1.2222  ...            7  0.500000  0.866025
29595        18      0.2020      0.8080  ...            3 -0.130526  0.991445
29759        21      3.3674      0.5049  ...            6  0.831470  0.555570
33580        29     12.4290      1.1120  ...            7  0.321439 -0.946930

[5 rows x 14 columns]
   CellName  PRBUsageUL  PRBUsageDL  ...  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Solving the Classification Problem

In [27]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from joblib import dump, load
from google.colab import files
from scipy.stats import uniform, randint

#if performVoting == 'y':
  
try:
  clf_GS = load('anomaly-4G-detection/voting_model.joblib') 

except:
  voting_clf = VotingClassifier( estimators=[ 
      ('xgb', XGBClassifier(random_state=1)),
      ('dt', DecisionTreeClassifier(random_state=1)),
      ('knn', KNeighborsClassifier())
      ], voting='soft')

  def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
    return 'f1',f1_score(t,y_bin)
  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                              ('voting_clf', XGBClassifier(random_state=1, scale_pos_weight=7, feval=xgb_f1, maximize=true))])
  
  parameters = {
  'voting_clf__eta'    : uniform(0.2, 0.35),
  "voting_clf__colsample_bytree": uniform(0.05, 0.2),
  "voting_clf__min_child_weight": randint(1, 5),
  "voting_clf__gamma": uniform(0.35, 0.6),
  "voting_clf__learning_rate": uniform(0.1, 0.3), # default 0.1 
  "voting_clf__max_depth": randint(10, 30), # default 3
  "voting_clf__n_estimators": randint(500, 1000), # default 100
  "voting_clf__subsample": uniform(0.6, 0.99)
  }

  grid_params = {
  "voting_clf__colsample_bytree": [0.1, 0.2],
  "voting_clf__min_child_weight": [1],
  "voting_clf__gamma": [0.35, 0.4, 0.5],
  "voting_clf__learning_rate": [0.15, 0.2, 0.3], # default 0.1 
  "voting_clf__max_depth": [15, 19, 21], # default 3
  "voting_clf__n_estimators": [600, 700, 800], # default 100
  "voting_clf__subsample": [0.932]
  }

  #clf_GS = GridSearchCV(estimator=pipe, param_grid=grid_params, n_jobs=10, verbose=1, cv=3 )
  clf_GS = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3, n_iter= 250 )
  clf_GS.fit(X_train,y_train)

  #Save the model in a file and download locally.
  dump(clf_GS, 'voting_model.joblib')
  files.download('voting_model.joblib') 

Fitting 3 folds for each of 250 candidates, totalling 750 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 tasks      | elapsed:  3.7min
[Parallel(n_jobs=10)]: Done 210 tasks      | elapsed: 17.5min
[Parallel(n_jobs=10)]: Done 460 tasks      | elapsed: 42.3min
[Parallel(n_jobs=10)]: Done 750 out of 750 | elapsed: 70.6min finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [29]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26721
           1       1.00      1.00      1.00     10183

    accuracy                           1.00     36904
   macro avg       1.00      1.00      1.00     36904
weighted avg       1.00      1.00      1.00     36904


TESTING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5344
           1       1.00      1.00      1.00      2037

    accuracy                           1.00      7381
   macro avg       1.00      1.00      1.00      7381
weighted avg       1.00      1.00      1.00      7381



In [30]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.000000 
train confusion matrix:
[[26721     0]
 [    0 10183]]
test error: 0.000000 
test confusion matrix:
[[5344    0]
 [   0 2037]]


In [36]:
pred_test = clf_GS.predict(X_test)

# Submission Formatting

In [37]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [38]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [39]:
#!rm anomaly-4G-detection/predictions.csv

In [40]:
clf_GS.best_params_
#print(clf_GS.grid_scores_)

{'voting_clf__colsample_bytree': 0.058782352141150795,
 'voting_clf__eta': 0.43613527903402605,
 'voting_clf__gamma': 0.8563604616539272,
 'voting_clf__learning_rate': 0.2825668240878133,
 'voting_clf__max_depth': 29,
 'voting_clf__min_child_weight': 1,
 'voting_clf__n_estimators': 812,
 'voting_clf__subsample': 0.7500194088550587}