# Notebook of all models

## Import libraries

In [None]:
# import data management
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import time
import datetime
import copy

# import models
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# import evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

# import visualisation
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt
import seaborn as sns

## Import dataset

In [None]:
# Load the semicolon-delimited dataset CSV into a DataFrame.
df_clean = pd.read_csv("../NOTEBOOKS TO REVIEW/Job/dataset_cyb&non_v2.csv", sep=';')

### Backup

In [None]:
# Work on an independent deep copy so the loaded frame stays untouched.
df_copy = df_clean.copy(deep=True)

In [None]:
# Display the (rows, columns) dimensions of the working copy.
df_copy.shape

# Machine Learning Models functions

In [None]:
def now():
    """Return the current local date and time as a 'YYYY-MM-DD HH:MM:SS' string."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return datetime.datetime.now().strftime(timestamp_format)

## Support Vector Machine

In [None]:
def svm_model(train, test, train_labels, test_labels):
    """Train a linear-kernel SVM, evaluate it on the test split and export three CSV files.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature matrices used for fitting and for evaluation.
    train_labels, test_labels : array-like
        Target values aligned row-by-row with ``train`` and ``test``.

    Returns
    -------
    None
        Results are only written to disk (semicolon-separated), all tagged
        with the same timestamp:

        * ``svm_score_run_<ts>.csv``      - classification report; the first
          column (``label``) names the class / average each row belongs to.
        * ``svm_TP_run_<ts>.csv``         - test rows whose prediction equals
          the actual label.
        * ``svm_featureimp_run_<ts>.csv`` - features sorted by coefficient,
          largest first.
    """
    # Take the timestamp once so all three files of this run share the same tag.
    run_stamp = now()

    # train model
    clf = svm.SVC(kernel='linear')
    clf.fit(train, train_labels)

    # test model
    y_pred = clf.predict(test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the report.
    score = classification_report(test_labels, y_pred, output_dict=True)
    df_score = pd.DataFrame(score).transpose()
    # The index holds the class / average names, so it has to be written out.
    df_score.to_csv('svm_score_run_' + run_stamp + '.csv', sep=";", index_label='label')

    # saving correctly predicted rows
    # NOTE: the filter keeps every row where prediction == actual, i.e. correct
    # positives and correct negatives, despite the "TP" in the file name.
    correct = np.asarray(y_pred) == np.asarray(test_labels)
    # Select straight from `test` instead of a global frame via positional
    # indexing, which only worked while index labels equalled row positions.
    svm_predictions = test.loc[correct]
    svm_predictions.to_csv('svm_TP_run_' + run_stamp + '.csv', sep=";", index=False)

    # positive target feature importance
    # coef_[0] is the first row of the fitted linear coefficients.
    svm_feature_result = pd.DataFrame(
        {'feature': train.columns, 'importance': clf.coef_[0]}
    ).sort_values('importance', ascending=False)
    svm_feature_result.to_csv('svm_featureimp_run_' + run_stamp + '.csv', sep=";", index=False)

## Random Forest

In [None]:
def rf_model(train, test, train_labels, test_labels):
    """Train a random forest, evaluate it on the test split and export three CSV files.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature matrices used for fitting and for evaluation.
    train_labels, test_labels : array-like
        Target values aligned row-by-row with ``train`` and ``test``.

    Returns
    -------
    None
        Results are only written to disk (semicolon-separated), all tagged
        with the same timestamp:

        * ``rf_score_run_<ts>.csv``      - classification report; the first
          column (``label``) names the class / average each row belongs to.
        * ``rf_TP_run_<ts>.csv``         - test rows whose prediction equals
          the actual label.
        * ``rf_featureimp_run_<ts>.csv`` - features sorted by importance,
          largest first.
    """
    # Take the timestamp once so all three files of this run share the same tag.
    run_stamp = now()

    # train model (fixed hyper-parameters)
    model = RandomForestClassifier(
        n_estimators=29,
        max_features=0.7999999999999999,
        n_jobs=-1,
        verbose=1,
        max_depth=10,
        bootstrap=True,
    )
    model.fit(train, train_labels)

    # test model
    rf_predictions = model.predict(test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the report.
    score = classification_report(test_labels, rf_predictions, output_dict=True)
    df_score = pd.DataFrame(score).transpose()
    # The index holds the class / average names, so it has to be written out.
    df_score.to_csv('rf_score_run_' + run_stamp + '.csv', sep=";", index_label='label')

    # saving correctly predicted rows
    # NOTE: the filter keeps every row where prediction == actual, i.e. correct
    # positives and correct negatives, despite the "TP" in the file name.
    correct = np.asarray(rf_predictions) == np.asarray(test_labels)
    # Select straight from `test` instead of a global frame via positional
    # indexing, which only worked while index labels equalled row positions.
    rf_correct_rows = test.loc[correct]
    rf_correct_rows.to_csv('rf_TP_run_' + run_stamp + '.csv', sep=";", index=False)

    # positive target feature importance
    rf_feature_result = pd.DataFrame(
        {'feature': train.columns, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)
    rf_feature_result.to_csv('rf_featureimp_run_' + run_stamp + '.csv', sep=";", index=False)

## Decision tree with Gradient Boosting

In [None]:
def dtgb_model(train, test, train_labels, test_labels):
    """Train a gradient-boosting classifier, evaluate it and export three CSV files.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature matrices used for fitting and for evaluation.
    train_labels, test_labels : array-like
        Target values aligned row-by-row with ``train`` and ``test``.

    Returns
    -------
    None
        Results are only written to disk (semicolon-separated), all tagged
        with the same timestamp:

        * ``/Job/run/dtgb_score_run_<ts>.csv``  - classification report; the
          first column (``label``) names the class / average of each row.
        * ``/run/dtgb_final_TP_run_<ts>.csv``   - test rows whose prediction
          equals the actual label.
        * ``/run/dtgb_featureimp_run_<ts>.csv`` - features sorted by
          importance, largest first.
    """
    # Take the timestamp once so all three files of this run share the same tag.
    run_stamp = now()

    # train model (fixed hyper-parameters)
    model = GradientBoostingClassifier(n_estimators=100, max_depth=4, subsample=0.99, learning_rate=0.977)
    model.fit(train, train_labels)

    # test model
    dtgb_predictions = model.predict(test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the report.
    score = classification_report(test_labels, dtgb_predictions, output_dict=True)
    df_score = pd.DataFrame(score).transpose()
    # NOTE(review): the score goes to '/Job/run/' while the other two files go
    # to '/run/'; both are absolute paths. Confirm which directory is intended.
    # The index holds the class / average names, so it has to be written out.
    df_score.to_csv('/Job/run/dtgb_score_run_' + run_stamp + '.csv', sep=";", index_label='label')

    # saving correctly predicted rows
    # NOTE: the filter keeps every row where prediction == actual, i.e. correct
    # positives and correct negatives, despite the "TP" in the file name.
    correct = np.asarray(dtgb_predictions) == np.asarray(test_labels)
    # Select straight from `test` instead of a global frame via positional
    # indexing, which only worked while index labels equalled row positions.
    dtgb_correct_rows = test.loc[correct]
    dtgb_correct_rows.to_csv('/run/dtgb_final_TP_run_' + run_stamp + '.csv', sep=";", index=False)

    # positive target feature importance
    dtgb_feature_result = pd.DataFrame(
        {'feature': train.columns, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)
    dtgb_feature_result.to_csv('/run/dtgb_featureimp_run_' + run_stamp + '.csv', sep=";", index=False)

## Logistic Regression

In [None]:
def logreg_model(train, test, train_labels, test_labels):
    """Train a logistic regression, evaluate it on the test split and export three CSV files.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature matrices used for fitting and for evaluation.
    train_labels, test_labels : array-like
        Target values aligned row-by-row with ``train`` and ``test``.

    Returns
    -------
    None
        Results are only written to disk (semicolon-separated), all tagged
        with the same timestamp:

        * ``/run/logreg_score_run_<ts>.csv``      - classification report; the
          first column (``label``) names the class / average of each row.
        * ``/run/logreg_final_TP_run_<ts>.csv``   - test rows whose prediction
          equals the actual label.
        * ``/run/logreg_featureimp_run_<ts>.csv`` - features sorted by
          coefficient, largest first.
    """
    # Take the timestamp once so all three files of this run share the same tag.
    run_stamp = now()

    # train model
    logmodel = LogisticRegression(solver='lbfgs')
    logmodel.fit(train, train_labels)

    # test model
    predictions = logmodel.predict(test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision and recall in the report.
    score = classification_report(test_labels, predictions, output_dict=True)
    df_score = pd.DataFrame(score).transpose()
    # The index holds the class / average names, so it has to be written out.
    df_score.to_csv('/run/logreg_score_run_' + run_stamp + '.csv', sep=";", index_label='label')

    # saving correctly predicted rows
    # NOTE: the filter keeps every row where prediction == actual, i.e. correct
    # positives and correct negatives, despite the "TP" in the file name.
    correct = np.asarray(predictions) == np.asarray(test_labels)
    # Select straight from `test` instead of a global frame via positional
    # indexing, which only worked while index labels equalled row positions.
    lr_correct_rows = test.loc[correct]
    lr_correct_rows.to_csv('/run/logreg_final_TP_run_' + run_stamp + '.csv', sep=";", index=False)

    # positive target feature importance
    # coef_[0] is the first row of the fitted coefficient matrix.
    logreg_feature_result = pd.DataFrame(
        {'feature': train.columns, 'importance': logmodel.coef_[0]}
    ).sort_values('importance', ascending=False)
    logreg_feature_result.to_csv('/run/logreg_featureimp_run_' + run_stamp + '.csv', sep=";", index=False)

### Variable declaration

In [None]:
# Split the working copy by the value of the binary target column.
df_non = df_copy[df_copy['is_cyber_victim_1'] == 0]
df_cyber = df_copy[df_copy['is_cyber_victim_1'] == 1]
# Number of rows where is_cyber_victim_1 == 1.
sample_size = df_cyber.shape[0]
# How many times the model-running loop is executed.
loop_size = 1

### Running models multiple times

In [None]:
for i in range(loop_size):
    # Load the stored sample from disk; it is re-read on every pass because
    # the pop() below removes the target column from the frame in place.
    data = pd.read_csv("stratified_TP_data_2020-01-14 19:06:54.csv", sep=';')

    # Detach the target column; `data` keeps only the remaining columns.
    labels = np.asarray(data.pop('is_cyber_victim_1'))

    # 70/30 split, stratified on the target, with a fixed seed.
    train, test, train_labels, test_labels = train_test_split(
        data,
        labels,
        test_size=0.3,
        stratify=labels,
        random_state=21,
    )

    # Fit, evaluate and export every model on the same split.
    dtgb_model(train, test, train_labels, test_labels)
    logreg_model(train, test, train_labels, test_labels)
    rf_model(train, test, train_labels, test_labels)
    svm_model(train, test, train_labels, test_labels)