# Load Datasets

In [30]:
import numpy as np
import pandas as pd
import os

import sklearn.tree
import sklearn.ensemble

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [45]:
# Clone repository in order to get access locally to the datasets
!rm -rf .git README.md
!git clone -b playing-emsembled-methods https://github.com/sergio-gimenez/anomaly-4G-detection . 

fatal: destination path '.' already exists and is not an empty directory.


In [46]:
!ls -la

total 36
drwxr-xr-x 1 root root 4096 Dec 30 15:25 .
drwxr-xr-x 1 root root 4096 Dec 30 14:43 ..
drwxr-xr-x 3 root root 4096 Dec 30 15:24 a
drwxr-xr-x 4 root root 4096 Dec 30 15:21 anomaly-4G-detectiona
drwxr-xr-x 3 root root 4096 Dec 30 15:22 anomaly-4G-detectionsd
drwxr-xr-x 1 root root 4096 Dec 21 17:29 .config
drwxr-xr-x 3 root root 4096 Dec 30 15:22 ds
drwxr-xr-x 2 root root 4096 Dec 30 15:20 .ipynb_checkpoints
drwxr-xr-x 1 root root 4096 Dec 21 17:29 sample_data


In [32]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [33]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1, stratify = y)
X_test = test

In [34]:
#Refactor time feature to minuts and cellName to unique identifier 1:1
def getTimeInMinutes(x):
  hh, mm  = x.split(":")
  return int(hh)* 60 + int(mm)

def createCellNameDictionary(data):
  cellList = []
  for i in data["CellName"]:
    cellList.append(i)
  cellList = set(cellList)
  cellDict = {}
  for idx, value in enumerate(cellList):
    cellDict[value]=idx
  return cellDict

def refactorFeaturesDataframe(data):
  data["Time"] = data["Time"].apply(lambda x: getTimeInMinutes(x))
  cellNameDict = createCellNameDictionary(data);
  data["CellName"] = data["CellName"].apply(lambda x: cellNameDict[x])
  return data


In [35]:
#Refactoring data from features to useful values
X_train = refactorFeaturesDataframe(X_train).to_numpy()
y_train = y_train.to_numpy()

X_validation = refactorFeaturesDataframe(X_validation).to_numpy()
y_validation = y_validation.to_numpy()

X_test = refactorFeaturesDataframe(test).to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Solving the Classification Problem

In [36]:
clf = sklearn.tree.DecisionTreeClassifier(random_state=1)
pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                           ('dec_tree', clf)])

n_components = list(range(1,X_train.shape[1]+1,2))
criterion = ['gini', 'entropy']
max_depth = [None,2,8,12]
min_samples_split = [2,4,8,10]
min_samples_leaf = [1,2,5]

parameters = dict(dec_tree__criterion=criterion,
                      dec_tree__min_samples_split=min_samples_split,
                      dec_tree__min_samples_leaf=min_samples_leaf,
                      dec_tree__max_depth=max_depth)
#clf_GS = GridSearchCV(pipe, parameters)
#clf_GS.fit(X_train, y_train)

clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [37]:
#pred_train = clf_GS.predict(X_train)
#pred_val = clf_GS.predict(X_validation)

# AdaBoost with GridSearch

In [48]:
#ADABOOST
from sklearn.ensemble import AdaBoostClassifier
from joblib import dump, load

adaboost_mode_file = 'anomaly-4G-detection/adaboost_model.joblib'
try:
  clf_GS = load(adaboost_mode_file) 
  print(clf_GS)
  input("Press Enter to continue...")

except:
  clf = AdaBoostClassifier(random_state=1)
  pipe = Pipeline(steps=[('std_slc', StandardScaler()),
                            ('ada', clf)])
  n_components = list(range(1,X_train.shape[1]+1,2))
  n_estimators = [100, 150, 200, 250]
  learning_rate = [0.01,0.1,1,10]
  parameters = dict(ada__n_estimators=n_estimators,
                        ada__learning_rate=learning_rate
                    )
  clf_GS = GridSearchCV(pipe, parameters)
  clf_GS.fit(X_train, y_train)
  dump(clf_GS, adaboost_mode_file) 

pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)


# Voting Classifier with GridSearch

In [38]:
'''
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier( estimators=[ 
    ('svm', SVC(probability=True)),
    ('lr', LogisticRegression()),
    ], voting='soft')

#Use the key for the classifier followed by __ and the attribute
params = {'lr__C': [1.0, 100.0],
      'svm__C': [2,3,4],}

grid = GridSearchCV( estimator=eclf, param_grid=params, cv=2)

grid.fit(X,y)
'''

NameError: ignored

In [None]:
pred_train = clf_GS.predict(X_train)
pred_val = clf_GS.predict(X_validation)

In [49]:
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nTESTING\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       0.87      0.99      0.93     21377
           1       0.95      0.62      0.75      8146

    accuracy                           0.89     29523
   macro avg       0.91      0.80      0.84     29523
weighted avg       0.89      0.89      0.88     29523


TESTING
              precision    recall  f1-score   support

           0       0.88      0.99      0.93      5344
           1       0.96      0.63      0.76      2037

    accuracy                           0.89      7381
   macro avg       0.92      0.81      0.85      7381
weighted avg       0.90      0.89      0.88      7381



In [50]:
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
print('test error: %f ' % val_error)
print('test confusion matrix:')
print(val_cmat)

train error: 0.113945 
train confusion matrix:
[[21130   247]
 [ 3117  5029]]
test error: 0.109064 
test confusion matrix:
[[5294   50]
 [ 755 1282]]


In [None]:
pred_test = clf_GS.predict(X_test)

# Submission Formatting

In [None]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi

In [None]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

In [None]:
#!rm anomaly-4G-detection/predictions.csv

In [None]:
clf_GS.best_params_