# Load Datasets

In [1]:
import numpy as np
import pandas as pd
import os

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [2]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection 

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [3]:
!ls -la

total 8104
drwxr-xr-x 1 root root    4096 Dec 31 11:52 .
drwxr-xr-x 1 root root    4096 Dec 31 10:07 ..
drwxr-xr-x 4 root root    4096 Dec 31 11:52 anomaly-4G-detection
drwxr-xr-x 1 root root    4096 Dec 21 17:29 .config
drwxr-xr-x 1 root root    4096 Dec 21 17:29 sample_data
-rw-r--r-- 1 root root 8276307 Dec 31 11:20 voting_model.joblib


In [4]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [5]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test

In [6]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  return data


In [7]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()

X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Solving the Classification Problem

In [8]:
clf = sklearn.tree.DecisionTreeClassifier(random_state=1)
pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                           ('dec_tree', clf)])

n_components = list(range(1,X_train.shape[1]+1,2))
criterion = ['gini', 'entropy']
max_depth = [None,2,8,12]
min_samples_split = [2,4,8,10]
min_samples_leaf = [1,2,5]

parameters = dict(dec_tree__criterion=criterion,
                      dec_tree__min_samples_split=min_samples_split,
                      dec_tree__min_samples_leaf=min_samples_leaf,
                      dec_tree__max_depth=max_depth)
#clf_GS = GridSearchCV(pipe, parameters)
#clf_GS.fit(X_train, y_train)

#clf.fit(X_train, y_train)

In [9]:
#pred_train = clf_GS.predict(X_train)
#pred_val = clf_GS.predict(X_validation)

# Voting Classifier with GridSearch

In [55]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from joblib import dump, load
from google.colab import files
from scipy.stats import uniform, randint

#if performVoting == 'y':
  
try:
  clf_GS = load('anomaly-4G-detection/voting_model.joblib') 
  

except:
  voting_clf = VotingClassifier( estimators=[ 
      ('xgb', XGBClassifier(random_state=1, eta=0.10, max_depth=15, min_child_weight=1, n_estimators=400)),
      ('dt', DecisionTreeClassifier(random_state=1, criterion='entropy', min_samples_leaf=2)),
      ('knn', KNeighborsClassifier(metric="euclidean", weights="uniform", p = 1, n_neighbors=19))
      ], voting='soft')
  
  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                              ('voting_clf', voting_clf)])

  #Params ADA
  n_estimators_ada = [600, 800]
  learning_rate = [0.1,1,10]
  #Params Decision Tree
  min_samples_split_dt = [2,10,14]
  min_samples_leaf_dt = [1,2,5]
  #Params GradientBoost
  parameters = {
  #  'voting_clf__xgb__learning_rate' : [ 0.01, 0.1, 0.5],
  # 'voting_clf__xgb__gamma' : [ 0.1, 0.5],
    'voting_clf__xgb__eta'    : uniform(0.1, 0.3) ,
    "voting_clf__xgb__colsample_bytree": uniform(0.7, 0.3),
    "voting_clf__xgb__gamma": uniform(0, 0.5),
    "voting_clf__xgb__learning_rate": uniform(0.03, 0.3), # default 0.1 
    "voting_clf__xgb__max_depth": randint(2, 20), # default 3
    "voting_clf__xgb__n_estimators": randint(100, 1000), # default 100
    "voting_clf__xgb__subsample": uniform(0.6, 0.4)
    }
  

  '''
  parameters = dict(voting_clf__ada__n_estimators=n_estimators_ada,
                    voting_clf__decision_tree__min_samples_split = min_samples_split_dt,
                      )
  '''
  #clf_GS = GridSearchCV(estimator=pipe, param_grid=parameters, n_jobs=10, verbose=1, )
  clf_GS = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=3 )
  clf_GS.fit(X_train,y_train)

  #Save the model in a file and download locally.
  dump(clf_GS, 'voting_model.joblib')
  files.download('voting_model.joblib') 

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 out of  30 | elapsed: 10.0min finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [56]:
a = range(100, 900, 200)
print(a)

range(100, 900, 200)


In [57]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [58]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21377
           1       1.00      1.00      1.00      8146

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      5344
           1       0.99      0.93      0.96      2037

    accuracy                           0.98      7381
   macro avg       0.98      0.96      0.97      7381
weighted avg       0.98      0.98      0.98      7381



In [59]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.000203 
train confusion matrix:
[[21377     0]
 [    6  8140]]
test error: 0.020187 
test confusion matrix:
[[5334   10]
 [ 139 1898]]


In [60]:
pred_test = clf_GS.predict(X_test)

# Submission Formatting

In [61]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [62]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [63]:
#!rm anomaly-4G-detection/predictions.csv

In [64]:
clf_GS.best_params_

{'voting_clf__xgb__colsample_bytree': 0.7724263008699148,
 'voting_clf__xgb__eta': 0.213906480983464,
 'voting_clf__xgb__gamma': 0.17010585845471338,
 'voting_clf__xgb__learning_rate': 0.2589374262950261,
 'voting_clf__xgb__max_depth': 7,
 'voting_clf__xgb__n_estimators': 260,
 'voting_clf__xgb__subsample': 0.858426551076145}