# Load Datasets

In [19]:
import numpy as np
import pandas as pd
import os

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

In [20]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection 

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [21]:
!ls -la

total 24
drwxr-xr-x 1 root root 4096 Jan  1 12:19 .
drwxr-xr-x 1 root root 4096 Jan  1 11:59 ..
drwxr-xr-x 4 root root 4096 Jan  1 12:20 anomaly-4G-detection
drwxr-xr-x 1 root root 4096 Dec 21 17:29 .config
drwxr-xr-x 2 root root 4096 Jan  1 12:19 .ipynb_checkpoints
drwxr-xr-x 1 root root 4096 Dec 21 17:29 sample_data


In [22]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [23]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test

In [24]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  return data


In [25]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()

X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Solving the Classification Problem

In [26]:
clf = sklearn.tree.DecisionTreeClassifier(random_state=1)
pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                           ('dec_tree', clf)])

n_components = list(range(1,X_train.shape[1]+1,2))
criterion = ['gini', 'entropy']
max_depth = [None,2,8,12]
min_samples_split = [2,4,8,10]
min_samples_leaf = [1,2,5]

parameters = dict(dec_tree__criterion=criterion,
                      dec_tree__min_samples_split=min_samples_split,
                      dec_tree__min_samples_leaf=min_samples_leaf,
                      dec_tree__max_depth=max_depth)
#clf_GS = GridSearchCV(pipe, parameters)
#clf_GS.fit(X_train, y_train)

#clf.fit(X_train, y_train)

In [27]:
#pred_train = clf_GS.predict(X_train)
#pred_val = clf_GS.predict(X_validation)

# Voting Classifier with GridSearch

In [43]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from joblib import dump, load
from google.colab import files
from scipy.stats import uniform, randint

#if performVoting == 'y':
  
try:
  clf_GS = load('anomaly-4G-detection/voting_model.joblib') 
  

except:
  voting_clf = VotingClassifier( estimators=[ 
      ('xgb', XGBClassifier(random_state=1)),
      ('dt', DecisionTreeClassifier(random_state=1)),
      ('knn', KNeighborsClassifier())
      ], voting='soft')
  
  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                              ('voting_clf', XGBClassifier(random_state=1))])
  
  parameters = {
  #'voting_clf__dt__criterion'    : ['entropy', 'gini'],
  #'voting_clf__dt__min_samples_leaf'    : randint(2, 30),
  #'voting_clf__dt__min_samples_split'    : randint(2, 30),
  #'voting_clf__dt__max_depth'    : randint(2, 30),
  #'voting_clf__knn__n_neighbors'    : randint(5, 30),
  #'voting_clf__knn__metric'    : ['euclidean', 'manhattan'],
  #'voting_clf__knn__weights'    : ['uniform', 'distance'],
  #'voting_clf__knn__p'    : randint(1, 5),
  'voting_clf__eta'    : uniform(0.1, 0.3),
  "voting_clf__colsample_bytree": uniform(0.0, 0.3),
  "voting_clf__min_child_weight": randint(1, 5),
  "voting_clf__gamma": uniform(0, 0.5),
  "voting_clf__learning_rate": uniform(0.03, 0.3), # default 0.1 
  "voting_clf__max_depth": randint(2, 20), # default 3
  "voting_clf__n_estimators": randint(100, 1000), # default 100
  "voting_clf__subsample": uniform(0.9, 0.4)
  }

  #clf_GS = GridSearchCV(estimator=pipe, param_grid=parameters, n_jobs=10, verbose=1, )
  clf_GS = RandomizedSearchCV(estimator=pipe, param_distributions=parameters, n_jobs=10, verbose=1, cv=5, n_iter= 1000 )
  clf_GS.fit(X_train,y_train)

  #Save the model in a file and download locally.
  dump(clf_GS, 'voting_model.joblib')
  files.download('voting_model.joblib') 

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  2.0min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  6.6min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 11.8min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 24.5min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 35.2min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed: 58.8min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed: 74.9min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed: 93.7min
[Parallel(n_jobs=10)]: Done 4030 tasks      | elapsed: 111.7min
[Parallel(n_jobs=10)]: Done 4980 tasks      | elapsed: 133.6min
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed: 134.0min finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [45]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21377
           1       1.00      1.00      1.00      8146

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5344
           1       1.00      0.98      0.99      2037

    accuracy                           0.99      7381
   macro avg       1.00      0.99      0.99      7381
weighted avg       0.99      0.99      0.99      7381



In [46]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.000000 
train confusion matrix:
[[21377     0]
 [    0  8146]]
test error: 0.006503 
test confusion matrix:
[[5343    1]
 [  47 1990]]


In [52]:
pred_test = clf_GS.predict(X_test)

# Submission Formatting

In [53]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [54]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [55]:
#!rm anomaly-4G-detection/predictions.csv

In [56]:
clf_GS.best_params_
#print(clf_GS.grid_scores_)

{'voting_clf__colsample_bytree': 0.12152600839706042,
 'voting_clf__eta': 0.2873424356895357,
 'voting_clf__gamma': 0.45599470961992583,
 'voting_clf__learning_rate': 0.20572871889866548,
 'voting_clf__max_depth': 19,
 'voting_clf__min_child_weight': 1,
 'voting_clf__n_estimators': 696,
 'voting_clf__subsample': 0.9321481608544492}