# Load Datasets

In [823]:
import numpy as np
import pandas as pd
import os

import sklearn.tree
import sklearn.ensemble

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [824]:
# Clone repository in order to get access locally to the datasets
!git clone https://github.com/sergio-gimenez/anomaly-4G-detection

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [825]:
# Both files are parsed as semicolon-separated text; note that the test file
# is read with read_csv despite its .xls extension.
train_path = 'anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv'
test_path = 'anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls'

train = pd.read_csv(train_path, sep=';')
test = pd.read_csv(test_path, sep=';')

In [826]:
# Pull the 'Unusual' target column out of the labelled frame; everything else
# stays in the feature table.
y = train['Unusual']
X = train.drop(columns='Unusual')

# Hold out 20% of the labelled rows for validation. The fixed random_state
# keeps the partition identical between runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=1
)

# The competition test frame is not split; it is simply aliased here.
X_test = test

In [827]:
# Keep only column positions 2..12 of every split and convert to NumPy arrays.
# NOTE(review): the two leading columns are presumably non-numeric identifiers
# (e.g. time / cell name) -- confirm against the CSV header.
feature_cols = slice(2, 13)

X_train = X_train.iloc[:, feature_cols].to_numpy()
X_validation = X_validation.iloc[:, feature_cols].to_numpy()
X_test = test.iloc[:, feature_cols].to_numpy()

# Targets become plain arrays as well.
y_train = y_train.to_numpy()
y_validation = y_validation.to_numpy()

# Solving the Classification Problem

In [828]:
# Decision tree using the entropy split criterion; random_state is pinned so
# the fitted tree is repeatable.
tree_options = dict(criterion='entropy', random_state=1)
clf = sklearn.tree.DecisionTreeClassifier(**tree_options)
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [829]:
# Predict labels for the split the tree was fitted on and for the held-out
# validation split, in that order.
pred_train, pred_val = (
    clf.predict(features) for features in (X_train, X_validation)
)

In [830]:
# Per-class precision / recall / F1 for the training split and for the
# held-out validation split. The second report scores y_validation against
# pred_val, so it is labelled VALIDATION rather than TESTING.
print("TRAINING\n" + classification_report(y_train, pred_train))
print("\nVALIDATION\n" + classification_report(y_validation, pred_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21378
           1       1.00      1.00      1.00      8145

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5343
           1       0.95      0.94      0.94      2038

    accuracy                           0.97      7381
   macro avg       0.96      0.96      0.96      7381
weighted avg       0.97      0.97      0.97      7381



In [831]:
# Error rate (1 - accuracy) and confusion matrix for each split.
train_error = 1. - accuracy_score(y_train, pred_train)
train_cmat = confusion_matrix(y_train, pred_train)
val_error = 1. - accuracy_score(y_validation, pred_val)
val_cmat = confusion_matrix(y_validation, pred_val)

print('train error: %f ' % train_error)
print('train confusion matrix:')
print(train_cmat)
# These figures are computed on the held-out validation split, so they are
# reported as "validation" rather than "test".
print('validation error: %f ' % val_error)
print('validation confusion matrix:')
print(val_cmat)

train error: 0.000000 
train confusion matrix:
[[21378     0]
 [    0  8145]]
test error: 0.030890 
test confusion matrix:
[[5242  101]
 [ 127 1911]]


In [832]:
# Predict labels for the test-set feature array with the fitted classifier;
# the result is consumed by the submission cell below.
pred_test = clf.predict(X_test)

# Submission Formatting

In [833]:
%%shell
# Create the submission file inside the cloned repository if it does not exist.
# The existence check and the touch now refer to the same path.
file=anomaly-4G-detection/predictions.csv
if [ ! -e "$file" ] ; then
    touch "$file"
fi



In [834]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,1
...,...,...
9153,9154,0
9154,9155,1
9155,9156,0
9156,9157,0


In [835]:
!rm anomaly-4G-detection/predictions.csv