In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import pandas as pd
import csv
from google.colab import drive


from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

from google.colab import drive
drive.mount('/content/gdrive')
path_folder ="/content/gdrive/My Drive/PhD/Prosper/"

df = pd.read_excel(path_folder+'features_data_with_label_v2.xlsx')
print(df['date'])
df['month']=df['date'].apply(lambda x: int(x.split('/')[1]))
print(df['month'])

df_training = df.loc[df['month'] <= 8]
print(df_training.shape[0])

df_testing = df.loc[df['month'] > 8]
print(df_testing.shape[0])

X = df_training[['number_of_create' , 'number_of_read' , 'number_of_update' , 'number_of_delete' , 'number_of_patient_record' , 'number_of_unique_patient_record' , 'number_of_modules' , 'number_of_report_module' , 'number_of_finance_module' , 'number_of_patient_module' , 'number_of_lab_module' , 'number_of_pharmacy_module' , 'number_of_access_warning' , 'number_of_outside_access' , 'number_of_browser' , 'number_of_chrome' , 'number_of_ie' , 'number_of_safari' , 'number_of_firefox' , 'number_of_otherbrowser' ]]
y = df_training['anomaly']

S = df_testing[['number_of_create' , 'number_of_read' , 'number_of_update' , 'number_of_delete' , 'number_of_patient_record' , 'number_of_unique_patient_record' , 'number_of_modules' , 'number_of_report_module' , 'number_of_finance_module' , 'number_of_patient_module' , 'number_of_lab_module' , 'number_of_pharmacy_module' , 'number_of_access_warning' , 'number_of_outside_access' , 'number_of_browser' , 'number_of_chrome' , 'number_of_ie' , 'number_of_safari' , 'number_of_firefox' , 'number_of_otherbrowser' ]]
t = df_testing['anomaly']


df_testing_normal = df_testing.loc[df_testing['anomaly'] == 0]
df_testing_anomaly = df_testing.loc[df_testing['anomaly'] == 1]

print(df_testing_normal['number_of_patient_record'])
print(df_testing_anomaly['number_of_patient_record'])

all_models = [
    ("mult_nb", MultinomialNB()),
    ("bern_nb", BernoulliNB()),
    ("knn", KNeighborsClassifier(5)),
    ("nn", MLPClassifier()),
    ("lr", LogisticRegression()),
    ("rf", RandomForestClassifier()),
    ("dt", DecisionTreeClassifier()),
    ("svm", SVC(kernel='linear', probability=True))
]

for name, model in all_models: 
  clf=model
  print(name)
  clf.fit(X, y)
  result = clf.predict(S)
  prec = precision_score(result, t)
  rec = recall_score(result, t)
  acc = accuracy_score(result, t)
  f1 = f1_score(result, t)
  print(name+' & '+ str(acc) + ' & ' +str(prec)+ ' & ' +str(rec)+ ' & ' +str(f1)+ '\\\\')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
0        01/01
1        03/02
2        05/02
3        13/02
4        03/03
         ...  
24643    22/12
24644    24/12
24645    26/12
24646    28/12
24647    30/12
Name: date, Length: 24648, dtype: object
0         1
1         2
2         2
3         2
4         3
         ..
24643    12
24644    12
24645    12
24646    12
24647    12
Name: month, Length: 24648, dtype: int64
16512
8136
13       1000
14          4
15       1000
16          4
17       1000
         ... 
24643       6
24644       2
24645      18
24646      12
24647       9
Name: number_of_patient_record, Length: 8083, dtype: int64
36        10
38         8
39        24
306      216
491       17
675       19
676        1
863       21
1049      19
1050       2
1238      15
1422      20
1609       5
1610       1
8553      13
8554      20
8555      28
8557      30
10190    205
16708     35
17185 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lr & 0.9992625368731564 & 0.9245283018867925 & 0.9607843137254902 & 0.9423076923076923\\
rf
rf & 0.9997541789577188 & 0.9811320754716981 & 0.9811320754716981 & 0.9811320754716981\\
dt
dt & 0.9996312684365781 & 0.9811320754716981 & 0.9629629629629629 & 0.9719626168224299\\
svm
svm & 0.9996312684365781 & 0.9622641509433962 & 0.9807692307692307 & 0.9714285714285713\\
