### In this notebook, I use a support vector machine to predict whether somebody is a victim of cybercrime

Importing all libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

Reading the sample datasets

In [None]:
# Load the three sample CSV files. They are semicolon-delimited, so the
# separator is passed explicitly instead of relying on the comma default.
csv_kwargs = dict(sep=';')

df_cyb = pd.read_csv(
    "../NOTEBOOKS TO REVIEW/Sample datasets for ML/cyber_victims.csv", **csv_kwargs
)
df_tra = pd.read_csv(
    "../NOTEBOOKS TO REVIEW/Sample datasets for ML/trad_victims.csv", **csv_kwargs
)
df_non = pd.read_csv(
    "../NOTEBOOKS TO REVIEW/Sample datasets for ML/non_victims.csv", **csv_kwargs
)

In [None]:
# Stack the cyber-victim and non-victim samples row-wise into one frame.
# (df_tra is not part of this merge.) sort=False leaves the column order
# untouched when the two frames do not share exactly the same columns.
frames = [df_cyb, df_non]
df_merged = pd.concat(objs=frames, axis=0, sort=False)

Creating age categories, since we only need categorical variables

In [None]:
# Bucket the numeric 'age' column into five right-closed intervals:
# (0, 20], (20, 39], (39, 59], (59, 79] and (79, 999].
# The interval labels end up in the dummy column names used further down,
# so the edges must stay exactly as they are.
age_bin_edges = [0, 20, 39, 59, 79, 999]
df_merged['Age'] = pd.cut(df_merged['age'], bins=age_bin_edges)

In [None]:
# Columns removed before encoding: the row identifier, the raw and derived
# age fields (replaced by the binned 'Age' column) and the household
# standing / income fields.
columns_to_remove = [
    'id',
    'age',
    'birth_date',
    'age_5_cath',
    'age1',
    'hh_std_4_years',
    'comp_hh_income',
    'comp_hh_income_4_years',
]
df_merged = df_merged.drop(columns=columns_to_remove)

Creating binary variables, since the entire dataset consists of categorical variables

# One-hot encode the categorical columns of the merged frame; each category
# becomes its own '<column>_<category>' indicator column.
df_clean = pd.get_dummies(data=df_merged)

Making the household-type labels more readable by grouping them into three broader categories

In [None]:
def _household_group(frame, household_types, prefix='hh_type_'):
    """Return a 0/1 Series flagging rows whose household type is in ``household_types``.

    Parameters
    ----------
    frame : pandas.DataFrame
        One-hot encoded frame containing ``<prefix><type>`` indicator columns.
    household_types : list of str
        Household-type category names (without the prefix) to combine.
    prefix : str
        Prefix that ``pd.get_dummies`` put in front of each category name.

    Categories that do not occur in the sample have no dummy column; they are
    treated as all-zero instead of raising, so the grouping works on any sample.
    """
    wanted = [prefix + name for name in household_types]
    present = [column for column in wanted if column in frame.columns]
    # The dummies come from a single categorical column, so at most one of
    # them is set per row and the row-wise sum is always 0 or 1.
    return frame[present].sum(axis=1).astype(int)


# Guard against re-running this cell: once the hh_type dummies have been
# dropped, the groups below would silently become all-zero.
hh_dummy_columns = list(df_clean.filter(regex='hh_type').columns)
if not hh_dummy_columns:
    raise KeyError("No 'hh_type' dummy columns found in df_clean; rebuild it with pd.get_dummies first.")

# Fixes relative to the previous version of this cell:
#   * 'Has Kids' no longer counts 'Gehuwd paar zonder kinderen'
#     (married couple WITHOUT children) and now includes unmarried couples
#     with children.
#   * 'Has No Kids' no longer counts 'Eenouderhuishouden' (single-parent
#     household) or the same column twice; it now covers both married and
#     unmarried couples without children.
# NOTE(review): 'Niet-gehuwd paar met kinderen' is assumed to be the category
# name for unmarried couples with children - confirm against the data.
# NOTE(review): single-person households are, as before, not assigned to any
# of the three groups - confirm whether they belong in 'Has No Kids'.
df_clean = df_clean.assign(**{
    'Has Kids': _household_group(df_clean, [
        'Gehuwd paar met kinderen',
        'Niet-gehuwd paar met kinderen',
        'Eenouderhuishouden',
    ]),
    'Has No Kids': _household_group(df_clean, [
        'Gehuwd paar zonder kinderen',
        'Niet-gehuwd paar zonder kinderen',
    ]),
    'Other Household': _household_group(df_clean, [
        'Institutioneel huishouden',
        'Onbekend',
        'Overig huishouden',
    ]),
}).drop(columns=hh_dummy_columns)


In [None]:
# Columns that are discarded up front (they must exist in df_clean).
excluded_columns = ['is_trad_victim', 'city_popu_density_Onbekend']

# Hand-picked model inputs (two age bands and the five urban-density
# indicators) followed by the target column 'is_cyber_victim'.
selected_columns = [
    'Age_(59, 79]',
    'Age_(79, 999]',
    'city_popu_density_Matig stedelijk (OAD 1000 tot 1500)',
    'city_popu_density_Weinig stedelijk (OAD 500 tot 1000)',
    'city_popu_density_Sterk stedelijk (OAD 1500 tot 2500)',
    'city_popu_density_Zeer sterk stedelijk (OAD 2500 of meer)',
    'city_popu_density_Niet stedelijk (OAD minder dan 500)',
    'is_cyber_victim',
]

data = df_clean.drop(columns=excluded_columns)[selected_columns]

In [None]:
# Show the column names that remain in the modelling frame.
data.columns.tolist()

Creating the train and test sets that I will use to train and evaluate the model

In [None]:
# pop() removes the target column from `data` in place, so afterwards `data`
# holds only the feature columns. Re-running this cell on its own therefore
# fails, because the column is already gone.
target_column = 'is_cyber_victim'
labels = np.array(data.pop(target_column))

# 70/30 split, stratified on the target, with a fixed seed for reproducibility.
train, test, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=21,
    stratify=labels,
)

Training the model

In [None]:
# Fit a support vector classifier with a linear kernel on the training split
# (fit() returns the estimator itself), then predict the held-out rows.
clf = svm.SVC(kernel='linear').fit(train, train_labels)
y_pred = clf.predict(test)

Evaluating how well the model performs on the test set

In [None]:
# Share of test rows whose predicted class equals the true class.
accuracy = metrics.accuracy_score(y_true=test_labels, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Share of actual positives in the test set that the model predicted as positive.
recall = metrics.recall_score(y_true=test_labels, y_pred=y_pred)
print("Recall:", recall)

In [None]:
# Share of predicted positives in the test set that are actual positives.
precision = metrics.precision_score(y_true=test_labels, y_pred=y_pred)
print("Precision:", precision)

Showing the features with the largest absolute coefficients, i.e. those the model deemed most useful for predicting a cybercrime victim

In [None]:
# Rank the features by the magnitude of their weight in the linear SVM.
# clf.coef_[0] holds one weight per feature column, in the same order as
# `data.columns` (the target column was popped out of `data` before fitting).
coef_magnitude = pd.Series(np.abs(clf.coef_[0]), index=data.columns)
top_features = coef_magnitude.nlargest(10)

# Explicit figure/axes so the chart carries a title and axis labels and can
# be read on its own; plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots()
top_features.plot(kind='barh', ax=ax)
ax.set(
    title='Largest absolute coefficients of the linear SVM',
    xlabel='Absolute coefficient',
    ylabel='Feature',
)
plt.show()