In [None]:
#Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read by path.
# The recorded run reports the drive mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the labelled block list (anomaly_label.csv) on the mounted drive.
# This single file is loaded below and later split into train and test parts.
path_to_data = (
    "/content/drive/MyDrive/Colab Notebooks/"
    "Datasets/Assignment/anomaly_label.csv"
)

In [None]:
 # Read the label CSV at path_to_data into a DataFrame
dataset = pd.read_csv(path_to_data)
# Display the frame as the cell output; the recorded run shows the columns BlockId and Label
dataset

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal
1,blk_7503483334202473044,Normal
2,blk_-3544583377289625738,Anomaly
3,blk_-9073992586687739851,Normal
4,blk_7854771516489510256,Normal
...,...,...
575056,blk_1019720114020043203,Normal
575057,blk_-2683116845478050414,Normal
575058,blk_5595059397348477632,Normal
575059,blk_1513937873877967730,Normal


In [None]:
# Count the rows per Label value to see the class balance
# (the recorded run shows far more "Normal" than "Anomaly" rows)
dataset["Label"].value_counts()

Normal     558223
Anomaly     16838
Name: Label, dtype: int64

In [None]:
# Summarise the dataset; in the recorded run both columns are non-numeric, so the
# summary lists count / unique / top / freq rather than numeric statistics
dataset.describe()

Unnamed: 0,BlockId,Label
count,575061,575061
unique,575061,2
top,blk_-1608999687919862906,Normal
freq,1,558223


In [None]:
 # Recode the text labels as numbers: "Anomaly" becomes 1 and "Normal" becomes 0
for label_text, label_code in (("Anomaly", 1), ("Normal", 0)):
    # Overwrite the Label cell of every row that still carries this text value
    dataset.loc[dataset["Label"] == label_text, "Label"] = label_code
# Confirm the recoding by counting the rows per value
dataset["Label"].value_counts()


0    558223
1     16838
Name: Label, dtype: int64

In [None]:
 # Turn BlockId into a numeric column.
# The "blk_" prefix is removed with a pattern anchored at the start of the string
# instead of slicing off the first four characters, so re-running this cell can
# never eat real digits. The cast to int64 then makes the column genuinely numeric
# rather than a string of digits. astype(str) keeps the cell re-runnable once the
# column is already an integer.
dataset['BlockId'] = (
    dataset['BlockId']
    .astype(str)
    .str.replace(r'^blk_', '', regex=True)
    .astype('int64')
)
# Check changes
dataset

Unnamed: 0,BlockId,Label
0,-1608999687919862906,0
1,7503483334202473044,0
2,-3544583377289625738,1
3,-9073992586687739851,0
4,7854771516489510256,0
...,...,...
575056,1019720114020043203,0
575057,-2683116845478050414,0
575058,5595059397348477632,0
575059,1513937873877967730,0


In [None]:
# Target vector: the Label column, cast to an integer dtype
Y = dataset["Label"].astype('int')
# Feature frame: every column except Label
X = dataset.drop(columns="Label")

(575061,)

In [None]:
# Show the (rows, columns) shape of the feature frame X
X.shape

(575061, 1)

In [None]:
# Show the shape of the target Series Y; its length should equal the row count of X
Y.shape

(575061,)

In [None]:
# Display the feature frame X (the recorded run shows a single BlockId column)
X

Unnamed: 0,BlockId
0,-1608999687919862906
1,7503483334202473044
2,-3544583377289625738
3,-9073992586687739851
4,7854771516489510256
...,...
575056,1019720114020043203
575057,-2683116845478050414
575058,5595059397348477632
575059,1513937873877967730


In [None]:
# Display the target Series Y (integer labels, 0 or 1 after the recoding above)
Y

0         0
1         0
2         1
3         0
4         0
         ..
575056    0
575057    0
575058    0
575059    0
575060    1
Name: Label, Length: 575061, dtype: int64

In [None]:
# Split X and Y into training and test sets (80% / 20%).
# The classes are heavily imbalanced (about 3% anomalies in the label counts shown
# earlier), so stratify on Y to keep the same class ratio in both parts.
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)


In [None]:
# Show the shape of the training feature frame
X_train.shape

(460048, 1)

In [None]:
# Show the shape of the training labels; the length should match the rows of X_train
Y_train.shape

(460048,)

In [None]:
# Show the shape of the held-out test feature frame
X_test.shape

(115013, 1)

In [None]:
# Show the shape of the held-out test labels; the length should match the rows of X_test
Y_test.shape

(115013,)

In [None]:
# Display the training features; the row index keeps the original (shuffled) positions
X_train

Unnamed: 0,BlockId
426975,-7034321469539528516
556117,2137659942019812932
106452,332547444036945178
231197,2834915389484063830
509632,7673612786835345682
...,...
110268,2122890464894056509
259178,-5100079166404607866
365838,-5706182803835190474
131932,1437813838110062153


In [None]:
# Display the test features; the row index keeps the original (shuffled) positions
X_test

Unnamed: 0,BlockId
169883,8994833829145112749
508216,4780650702265629827
548973,7189932238461412280
558812,4457861953679819986
384967,-7774453643692748332
...,...
154937,-2535255597435826591
494415,1868589195894796285
527656,-5943798092542649607
80968,2015205369552156251


In [None]:
# Display the training labels (their index lines up with X_train)
Y_train

426975    0
556117    0
106452    0
231197    0
509632    0
         ..
110268    0
259178    0
365838    0
131932    0
121958    0
Name: Label, Length: 460048, dtype: int64

In [None]:
# Display the test labels (their index lines up with X_test)
Y_test

169883    0
508216    0
548973    0
558812    0
384967    0
         ..
154937    0
494415    0
527656    0
80968     0
8228      0
Name: Label, Length: 115013, dtype: int64

In [None]:
 # Create the random forest classifier; no data is involved until fit() below.
# random_state makes the fitted forest, and therefore the reported metrics, repeatable
# between runs (same seed as the train/test split). n_jobs=-1 only spreads the tree
# building over all available cores and does not change the result.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
# Fit the model on the training split; as the last expression, the estimator is echoed
rfc.fit(X_train, Y_train)

RandomForestClassifier()

In [None]:
# Create a single decision tree classifier.
# random_state is fixed (same seed as the train/test split) so the fitted tree is
# repeatable between runs.
dtc = tree.DecisionTreeClassifier(random_state=42)
# Fit the model on the training split; as the last expression, the estimator is echoed
dtc.fit(X_train, Y_train)

DecisionTreeClassifier()

In [None]:
# Score the held-out features with both fitted models: the forest first, then the tree
y_pred_rfc, y_pred_dtc = (fitted.predict(X_test) for fitted in (rfc, dtc))

In [None]:
# Display the random forest predictions for the test set (an array of 0/1 labels)
y_pred_rfc


array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Display the decision tree predictions for the test set (an array of 0/1 labels)
y_pred_dtc


array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Evaluate the RandomForestClassifier on the held-out test split.
# The messages are plain string literals: none of them interpolates a value, so the
# f-prefix is not needed.
print("====================== RandomForestClassifier Model Evaluation======================")
print("Confusion matrix:")
# Rows are the true labels and columns the predicted labels, ordered 0 then 1
print(confusion_matrix(Y_test, y_pred_rfc))
print()
print("Classification report:")
# Accuracy is dominated by the majority class 0; read the class 1 row to judge how
# well anomalies are actually detected
print(classification_report(Y_test, y_pred_rfc))


Confusion matrix:
[[108484   3180]
 [  3237    112]]

Classification report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    111664
           1       0.03      0.03      0.03      3349

    accuracy                           0.94    115013
   macro avg       0.50      0.50      0.50    115013
weighted avg       0.94      0.94      0.94    115013



In [None]:
# Evaluate the DecisionTreeClassifier on the held-out test split.
# The messages are plain string literals: none of them interpolates a value, so the
# f-prefix is not needed.
print("====================== DecisionTree Model Evaluation======================")
print("Confusion matrix:")
# Rows are the true labels and columns the predicted labels, ordered 0 then 1
print(confusion_matrix(Y_test, y_pred_dtc))
print()
print("Classification report:")
# Accuracy is dominated by the majority class 0; read the class 1 row to judge how
# well anomalies are actually detected
print(classification_report(Y_test, y_pred_dtc))

Confusion matrix:
[[108478   3186]
 [  3236    113]]

Classification report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    111664
           1       0.03      0.03      0.03      3349

    accuracy                           0.94    115013
   macro avg       0.50      0.50      0.50    115013
weighted avg       0.94      0.94      0.94    115013

