In [1]:
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import GridSearchCV
%config InlineBackend.figure_format = 'retina'
import pandas as pd

import networkx as nx

In [2]:
def draw_heatmap_RBF(acc, acc_desc, gamma_list, C_list):
    plt.figure(figsize = (5,4))
    ax = sns.heatmap(acc, annot=True, fmt='.3f', 
                     xticklabels=gamma_list, yticklabels=C_list)
    ax.collections[0].colorbar.set_label("accuracy")
    ax.set(xlabel = '$\gamma$', ylabel='$C$')
    plt.title(acc_desc + ' w.r.t $C$ and $\gamma$')
    sns.set_style("whitegrid", {'axes.grid' : False})
    plt.show()

## Dataset 1

In [59]:
df_2001=pd.read_csv('crime_2001.csv')
df_2002=pd.read_csv('crime_2002.csv')
df_2003=pd.read_csv('crime_2003.csv')
df_crime = pd.concat([df_2001,df_2002,df_2003])
df_crime.drop(labels=['ID','Case Number',
                      'IUCR','Year',
                      'Updated On','Block','Location',
                      'Date','Description','FBI Code',
                     'Latitude','Longitude'],axis=1,inplace=True)
df_crime.dropna(axis=0,how='any',inplace=True)
df_crime.shape

(517997, 10)

In [60]:
df_crime.columns

Index(['Primary Type', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'X Coordinate', 'Y Coordinate'],
      dtype='object')

In [61]:
df_crime.replace({True:1,False:-1},inplace=True)

In [62]:
df_crime=pd.get_dummies(df_crime)
# use one-hot-encoding to change categorical data into numerical data

In [63]:
df_crime=df_crime.sample(frac=1)[:3000]
Y=df_crime['Arrest']
X=df_crime.drop(labels='Arrest',axis=1)
Y.replace({True:1,False:-1},inplace=True)

In [64]:
df_crime.head(2)

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Primary Type_ARSON,Primary Type_ASSAULT,...,Location Description_TRUCKING TERMINAL,Location Description_VACANT LOT,Location Description_VACANT LOT/LAND,Location Description_VEHICLE NON-COMMERCIAL,Location Description_VEHICLE-COMMERCIAL,Location Description_VESTIBULE,Location Description_WAREHOUSE,Location Description_WOODED AREA,Location Description_YARD,Location Description_YMCA
305936,-1,-1,2111,2.0,25.0,34.0,1174606.0,1890151.0,0,0,...,0,0,0,0,0,0,0,0,0,0
15247,-1,-1,933,9.0,20.0,61.0,1167796.0,1873334.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
X.to_csv('data/1_X.csv',index=False,header=True)
Y.to_csv('data/1_Y.csv',index=False,header=True)

## Dataset 2

In [66]:
df_disease=pd.read_csv('heart-attack-charges.txt',sep='\t',header=None)

In [67]:
df_disease.rename(columns={0:'CHARGES',1:'LOS',2:'AGE',3:'SEX',4:'DRG',5:'DIED',6:'Sex#'},inplace=True)
df_disease.drop(['SEX','DRG'],axis=1,inplace=True)
df_disease.dropna(axis=0,how='any',inplace=True);

In [68]:
df_disease=df_disease[1:].sample(frac=1)
df_disease['CHARGES']=pd.to_numeric(df_disease['CHARGES'], downcast='integer')
df_disease.shape

(12145, 5)

In [69]:
df_disease.head(2)

Unnamed: 0,CHARGES,LOS,AGE,DIED,Sex#
2991,3572.0,4,51,0,0
6144,4975.0,4,75,0,0


In [70]:
Y_2=df_disease['DIED']
X_2=df_disease.drop(['DIED'],axis=1)

In [71]:
X_2.to_csv('data/2_X.csv',index=False,header=True)
Y_2.to_csv('data/2_Y.csv',index=False,header=True)

##  Dataset 3

In [72]:
df_4_Train=pd.read_csv('dota2Train.csv')
df_4_Train=df_4_Train.rename(columns={'-1':'Win'})
df_4_Train.dropna(axis=0,how='any',inplace=True)
df_4_Train['Win']=df_4_Train['Win'].astype(int).replace({-1:0})

In [73]:
df_4_Train.shape

(92649, 117)

In [74]:
df_4_Train=df_4_Train.sample(frac=1)[:7000]

In [75]:
df_4_Train.shape

(7000, 117)

In [76]:
df_4_Train.head(5)

Unnamed: 0,Win,223,2,2.1,0,0.1,0.2,0.3,0.4,0.5,...,0.93,0.94,0.95,0.96,0.97,0.98,0.99,0.100,0.101,0.102
60987,0,138,9,2,0,0,0,0,0,0,...,0,0,-1,0,0,0,0,0,0,0
60377,1,152,2,2,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
28149,1,223,2,2,0,0,0,1,0,0,...,-1,0,0,0,0,0,0,0,0,0
32078,1,224,8,2,0,0,0,1,0,0,...,0,0,0,0,0,0,-1,0,0,0
56018,0,224,2,2,0,0,0,0,0,1,...,0,0,0,0,0,1,-1,0,0,0


In [77]:
df_4_Train.columns
df_4_Train['Win'].value_counts()

1    3717
0    3283
Name: Win, dtype: int64

In [78]:
Y_4=df_4_Train['Win']
X_4=df_4_Train.drop(labels=['Win','223'],axis=1)
X_4.to_csv('data/4_X.csv',index=False,header=True)
Y_4.to_csv('data/4_Y.csv',index=False,header=True)

## Dataset 4

In [79]:
df_adult=pd.read_csv('adult.data',names=['Age','Work_status','null','Education','education-num','marital-status',
                                        'occupation','relationship','race','sex','capital-gain','capital-loss',
                                        'hours-per-week','native-country','Salary'])


In [80]:
df_adult.drop('null',axis=1,inplace=True)
df_adult.dropna(how='any',axis=0,inplace=True)

In [81]:
set(df_adult['Salary'])

{' <=50K', ' >50K'}

In [82]:
df_adult.columns

Index(['Age', 'Work_status', 'Education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'Salary'],
      dtype='object')

In [83]:
df_adult['Salary']=df_adult['Salary'].replace({' <=50K':0,' >50K':1})

In [84]:
df_adult=pd.get_dummies((df_adult.sample(frac=1)))

In [85]:
df_adult.shape

(32561, 108)

In [86]:
df_adult.head(2)

Unnamed: 0,Age,education-num,capital-gain,capital-loss,hours-per-week,Salary,Work_status_ ?,Work_status_ Federal-gov,Work_status_ Local-gov,Work_status_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
25401,48,10,3325,0,60,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1764,46,14,27828,0,50,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [87]:
Y_3=df_adult['Salary'][:7000]
X_3=df_adult.drop('Salary',axis=1)[:7000]
X_3.to_csv('data/3_X.csv',index=False,header=True)
Y_3.to_csv('data/3_Y.csv',index=False,header=True)

In [88]:
X_4.shape

(7000, 115)