# Load Datasets

In [15]:
import numpy as np
import pandas as pd
import os

# Decision tree classifier
import sklearn.tree
import sklearn.ensemble

# SVM Classifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# MLP Classifier
import sklearn.neural_network

# Voting Classifier
from sklearn.ensemble import VotingClassifier

# Reporting tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Clone repository in order to get access locally to the datasets
!git clone https://github.com/sergio-gimenez/anomaly-4G-detection

fatal: destination path 'anomaly-4G-detection' already exists and is not an empty directory.


In [3]:
train = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_train.csv', sep=';')
test = pd.read_csv('anomaly-4G-detection/ML-MATT-CompetitionQT2021_test.xls', sep=';' )

In [4]:
# Separate labels from data 
X = train.drop('Unusual', axis='columns')#.to_numpy()
y = train['Unusual']#.to_numpy()

# We split the data into training and validation subsets (80% and 20%) in
# order to validate our training
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.8,
                                                                random_state=1)
X_test = test

In [5]:
# Using only features that are actually numbers

X_train = X_train.iloc[:,2:13].to_numpy()
y_train = y_train.to_numpy()

X_validation = X_validation.iloc[:,2:13].to_numpy()
y_validation = y_validation.to_numpy()

X_test = test.iloc[:,2:13].to_numpy()

X = X.iloc[:,2:13].to_numpy()
y = y.to_numpy()

# Solving the Classification Problem

In [6]:
# List containing the classifiers for the voting system
classifiers = []

## Decission Tree Classifier

In [7]:
tree_clf = sklearn.tree.DecisionTreeClassifier(criterion='entropy', random_state=1)
classifiers.append(tree_clf)

tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [8]:
pred_tree_train = tree_clf.predict(X_train)
pred_tree_val = tree_clf.predict(X_validation)

In [9]:
print("TRAINING\n" + classification_report(y_train, pred_tree_train))
print("\nTESTING\n" + classification_report(y_validation, pred_tree_val))

TRAINING
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21378
           1       1.00      1.00      1.00      8145

    accuracy                           1.00     29523
   macro avg       1.00      1.00      1.00     29523
weighted avg       1.00      1.00      1.00     29523


TESTING
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5343
           1       0.95      0.94      0.94      2038

    accuracy                           0.97      7381
   macro avg       0.96      0.96      0.96      7381
weighted avg       0.97      0.97      0.97      7381



In [25]:
tree_train_error = 1. - accuracy_score(y_train, pred_tree_train)
tree_train_cmat = confusion_matrix(y_train, pred_tree_train)
tree_val_error = 1. - accuracy_score(y_validation, pred_tree_val)
tree_val_cmat = confusion_matrix(y_validation, pred_tree_val)

print('train error: %f ' % tree_train_error)
print('train confusion matrix:')
print(tree_train_cmat)
print('test error: %f ' % tree_val_error)
print('test confusion matrix:')
print(tree_val_cmat)

train error: 0.000000 
train confusion matrix:
[[21378     0]
 [    0  8145]]
test error: 0.030890 
test confusion matrix:
[[5242  101]
 [ 127 1911]]


## Non-linear SVM Classifier

In [11]:
svm_clf = SVC(0.1, kernel="rbf", gamma=1)
classifiers.append(svm_clf)

# svm_model = Pipeline([
#     ('center', StandardScaler()),
#     ('clf', svm_clf)
# ])

# svm_model.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [12]:
# svm_model_pred_train = svm_model.predict(X_train)
# svm_model_pred_val = svm_model.predict(X_validation)

# print("TRAINING\n" + classification_report(y_train, svm_model_pred_train))
# print("\nTESTING\n" + classification_report(y_validation, svm_model_pred_val))

TRAINING
              precision    recall  f1-score   support

           0       0.72      1.00      0.84     21378
           1       0.00      0.00      0.00      8145

    accuracy                           0.72     29523
   macro avg       0.36      0.50      0.42     29523
weighted avg       0.52      0.72      0.61     29523


TESTING
              precision    recall  f1-score   support

           0       0.72      1.00      0.84      5343
           1       0.00      0.00      0.00      2038

    accuracy                           0.72      7381
   macro avg       0.36      0.50      0.42      7381
weighted avg       0.52      0.72      0.61      7381



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
svm_clf_pred_train = svm_clf.predict(X_train)
svm_clf_pred_val = svm_clf.predict(X_validation)

print("TRAINING\n" + classification_report(y_train, svm_clf_pred_train))
print("\nTESTING\n" + classification_report(y_validation, svm_clf_pred_val))

TRAINING
              precision    recall  f1-score   support

           0       0.74      0.99      0.85     21378
           1       0.83      0.07      0.13      8145

    accuracy                           0.74     29523
   macro avg       0.79      0.53      0.49     29523
weighted avg       0.76      0.74      0.65     29523


TESTING
              precision    recall  f1-score   support

           0       0.74      0.99      0.85      5343
           1       0.79      0.07      0.12      2038

    accuracy                           0.74      7381
   macro avg       0.76      0.53      0.48      7381
weighted avg       0.75      0.74      0.65      7381



In [27]:
svm_train_error = 1. - accuracy_score(y_train, svm_clf_pred_train)
svm_train_cmat = confusion_matrix(y_train, svm_clf_pred_train)
svm_val_error = 1. - accuracy_score(y_validation, svm_clf_pred_val)
svm_val_cmat = confusion_matrix(y_validation, svm_clf_pred_val)

print('train error: %f ' % svm_train_error)
print('train confusion matrix:')
print(svm_train_cmat)
print('test error: %f ' % svm_val_error)
print('test confusion matrix:')
print(svm_val_cmat)

train error: 0.260272 
train confusion matrix:
[[21264   114]
 [ 7570   575]]
test error: 0.262702 
test confusion matrix:
[[5308   35]
 [1904  134]]


## MLP Classifier

In [17]:
mlp_clf = sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(20,),solver='sgd',momentum=0)
classifiers.append(mlp_clf)
mlp_clf.fit(X_train, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200, momentum=0,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='sgd', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [18]:
mlp_clf_pred_train = mlp_clf.predict(X_train)
mlp_clf_pred_val = mlp_clf.predict(X_validation)

print("TRAINING\n" + classification_report(y_train, mlp_clf_pred_train))
print("\nTESTING\n" + classification_report(y_validation, mlp_clf_pred_val))

TRAINING
              precision    recall  f1-score   support

           0       0.73      0.99      0.84     21378
           1       0.66      0.05      0.09      8145

    accuracy                           0.73     29523
   macro avg       0.70      0.52      0.47     29523
weighted avg       0.71      0.73      0.64     29523


TESTING
              precision    recall  f1-score   support

           0       0.73      0.99      0.84      5343
           1       0.66      0.05      0.10      2038

    accuracy                           0.73      7381
   macro avg       0.70      0.52      0.47      7381
weighted avg       0.71      0.73      0.64      7381



In [28]:
mlp_train_error = 1. - accuracy_score(y_train, mlp_clf_pred_train)
mlp_train_cmat = confusion_matrix(y_train, mlp_clf_pred_train)
mlp_val_error = 1. - accuracy_score(y_validation, mlp_clf_pred_val)
mlp_val_cmat = confusion_matrix(y_validation, mlp_clf_pred_val)

print('train error: %f ' % mlp_train_error)
print('train confusion matrix:')
print(mlp_train_cmat)
print('test error: %f ' % mlp_val_error)
print('test confusion matrix:')
print(mlp_val_cmat)

train error: 0.269044 
train confusion matrix:
[[21168   210]
 [ 7733   412]]
test error: 0.268934 
test confusion matrix:
[[5288   55]
 [1930  108]]


## Voting Classifier

In [19]:
voting_clf = VotingClassifier(
    estimators=[('tree', tree_clf), ('svm', svm_clf), ('mlp', mlp_clf)],
    voting='hard')

classifiers.append(voting_clf)
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('tree',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='entropy',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     presort='deprecated',
                                                     random_state=1,
 

In [20]:
for clf in classifiers:
    #clf.fit(X_train, y_train)
    pred_val = clf.predict(X_validation)
    print(clf.__class__.__name__, accuracy_score(y_validation, pred_val))

DecisionTreeClassifier 0.9691098767104729
SVC 0.7372984690421353
MLPClassifier 0.7310662511854762
VotingClassifier 0.7456984148489365


In [21]:
voting_clf_pred_train = mlp_clf.predict(X_train)
voting_clf_pred_val = mlp_clf.predict(X_validation)

print("TRAINING\n" + classification_report(y_train, voting_clf_pred_train))
print("\nTESTING\n" + classification_report(y_validation, voting_clf_pred_val))

TRAINING
              precision    recall  f1-score   support

           0       0.73      0.99      0.84     21378
           1       0.66      0.05      0.09      8145

    accuracy                           0.73     29523
   macro avg       0.70      0.52      0.47     29523
weighted avg       0.71      0.73      0.64     29523


TESTING
              precision    recall  f1-score   support

           0       0.73      0.99      0.84      5343
           1       0.66      0.05      0.10      2038

    accuracy                           0.73      7381
   macro avg       0.70      0.52      0.47      7381
weighted avg       0.71      0.73      0.64      7381



In [29]:
voting_train_error = 1. - accuracy_score(y_train, voting_clf_pred_train)
voting_train_cmat = confusion_matrix(y_train, voting_clf_pred_train)
voting_val_error = 1. - accuracy_score(y_validation, voting_clf_pred_val)
voting_val_cmat = confusion_matrix(y_validation, voting_clf_pred_val)

print('train error: %f ' % voting_train_error)
print('train confusion matrix:')
print(voting_train_cmat)
print('test error: %f ' % voting_val_error)
print('test confusion matrix:')
print(voting_val_cmat)

train error: 0.269044 
train confusion matrix:
[[21168   210]
 [ 7733   412]]
test error: 0.268934 
test confusion matrix:
[[5288   55]
 [1930  108]]


## Test Prediction

In [22]:
pred_test = voting_clf.predict(X_test)

# Submission Formatting

In [23]:
%%shell
# Create submission file if it does not exists
file=predictions.csv
if [ ! -e "$file" ] ; then
    touch anomaly-4G-detection/"$file"
fi



In [24]:
# Create index column in data frame object
submission_dataframe = pd.DataFrame(np.arange(1, 9159), columns=['Id']) 

# Append predictions of test data as column
submission_dataframe['Label'] = pred_test

# Convert Data Frame object to CSV
submission_dataframe.to_csv('predictions.csv', index=False)

!mv predictions.csv anomaly-4G-detection/
predictions = pd.read_csv('anomaly-4G-detection/predictions.csv')
predictions

Unnamed: 0,Id,Label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
9153,9154,0
9154,9155,0
9155,9156,0
9156,9157,0


In [None]:
!rm anomaly-4G-detection/predictions.csv