In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import OneClassSVM

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

In [2]:
scaler = StandardScaler()

In [3]:
testing_data = pd.read_csv('t_data/processed_test_5.csv')

testing_data_features = testing_data.drop('class', axis=1)
testing_data_labels = testing_data['class']

testing_data_features = scaler.fit_transform(testing_data_features)

In [4]:
training = pd.read_csv('data/processed_network_5.csv')
training_features = training.drop('class', axis=1)
training_labels = training['class']

training_features = scaler.fit_transform(training_features)

In [5]:
testing_cols = set(testing_data.columns)
training_cols = set(training.columns)

if testing_cols == training_cols:
    print("same columns")
else:
    only_test = testing_cols - training_cols
    print("only in testing_data:", only_test)
    only_train = training_cols - testing_cols
    print("only in training:", only_train)

same columns


In [6]:
%%time

svm_model = SVC(C=10, gamma='auto', kernel='rbf')
svm_model.fit(training_features, training_labels)
svm_predicted_labels = svm_model.predict(testing_data_features)

accuracy = accuracy_score(testing_data_labels, svm_predicted_labels)
precision = precision_score(testing_data_labels, svm_predicted_labels)
f1 = f1_score(testing_data_labels, svm_predicted_labels)
auc = roc_auc_score(testing_data_labels, svm_predicted_labels)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")
print(f"AUC Score: {auc}")

Accuracy: 0.251
Precision: 0.2271186440677966
F1 Score: 0.24358715411028076
AUC Score: 0.2518774611879677
CPU times: total: 109 ms
Wall time: 104 ms


In [7]:
%%time

knn_model = KNeighborsClassifier(n_neighbors=3, weights='distance')

knn_model.fit(training_features, training_labels)

knn_predicted_labels = knn_model.predict(testing_data_features)

knn_accuracy = accuracy_score(testing_data_labels, knn_predicted_labels)
knn_precision = precision_score(testing_data_labels, knn_predicted_labels)
knn_f1 = f1_score(testing_data_labels, knn_predicted_labels)
knn_auc = roc_auc_score(testing_data_labels, knn_predicted_labels)

print(f"Accuracy: {knn_accuracy}")
print(f"Precision: {knn_precision}")
print(f"F1 Score: {knn_f1}")
print(f"AUC Score: {knn_auc}")

Accuracy: 0.163
Precision: 0.106294289287203
F1 Score: 0.10862619808306709
AUC Score: 0.15908165474300556
CPU times: total: 2.16 s
Wall time: 122 ms


In [8]:
%%time

dt_model = DecisionTreeClassifier(random_state=42,
                            criterion='entropy',
                            max_depth=9,
                            min_samples_split=2,
                            min_samples_leaf=2)

dt_model.fit(training_features, training_labels)

dt_predicted_labels = dt_model.predict(testing_data_features)

dt_accuracy = accuracy_score(testing_data_labels, dt_predicted_labels)
dt_precision = precision_score(testing_data_labels, dt_predicted_labels)
dt_f1 = f1_score(testing_data_labels, dt_predicted_labels)
dt_auc = roc_auc_score(testing_data_labels, dt_predicted_labels)

print(f"Accuracy: {dt_accuracy}")
print(f"Precision: {dt_precision}")
print(f"F1 Score: {dt_f1}")
print(f"AUC Score: {dt_auc}")

Accuracy: 0.1414
Precision: 0.14326545194712398
F1 Score: 0.15740922473012756
AUC Score: 0.14390862420881184
CPU times: total: 516 ms
Wall time: 20.2 ms


In [9]:
%%time

rf_model = RandomForestClassifier(random_state=42,
                                  n_estimators= 100,
                                  max_depth=15,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  max_features='sqrt')

rf_model.fit(training_features, training_labels)

rf_predicted_labels = rf_model.predict(testing_data_features)

rf_accuracy = accuracy_score(testing_data_labels, rf_predicted_labels)
rf_precision = precision_score(testing_data_labels, rf_predicted_labels)
rf_f1 = f1_score(testing_data_labels, rf_predicted_labels)
rf_auc = roc_auc_score(testing_data_labels, rf_predicted_labels)

print(f"Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"F1 Score: {rf_f1}")
print(f"AUC Score: {rf_auc}")

Accuracy: 0.1206
Precision: 0.13688212927756654
F1 Score: 0.1526305646560031
AUC Score: 0.12451356101684435
CPU times: total: 453 ms
Wall time: 454 ms
