Import required packages

In [3]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score

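Load the data (note: no loading cell is visible here; `df` must be defined, e.g. read from file into a DataFrame, before the cells below can run)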

Target Analysis

In [None]:
# Bar chart of how many rows fall into each target class, to check class balance.
# The plot call is not wrapped in print(): printing would only emit the Axes
# repr as text, while the figure itself is rendered by the notebook.
ax = df['target'].value_counts().plot(kind='bar', title='Target class distribution')
ax.set_xlabel('target')
ax.set_ylabel('count');

Data pre-processing

In [None]:
# Drop constant columns: a column with a single unique value carries no information.
invalid = [col for col in df if df[col].nunique() == 1]
df = df.drop(invalid, axis=1)

# A column is treated as textual when any of its non-missing values contains a letter.
# Missing values are dropped first: astype(str) would turn NaN into the string 'nan',
# which contains letters and would wrongly flag numeric columns with gaps as text.
characters = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
letter_pattern = '|'.join(characters)
strings_total = [
    col for col in df
    if df[col].dropna().astype(str).str.lower().str.contains(letter_pattern).any()
]

# Every remaining column is expected to be numeric; convert so that numbers
# stored as strings get a numeric dtype (raises if a value cannot be parsed).
cols = df.columns.drop(strings_total)
df[cols] = df[cols].apply(pd.to_numeric)

# Numeric-coded columns that should nevertheless be one-hot encoded go here.
categorical = []
dummies = strings_total + categorical

# One-hot encode; drop_first avoids one redundant dummy column per variable.
df_ready = pd.get_dummies(df, drop_first=True, columns=dummies)

Create test and training set

In [None]:
# Split the prepared frame into predictors (everything but 'target') and the
# label (kept as a one-column DataFrame), then hold out 25% of rows for testing.
is_target = df_ready.columns == 'target'
x = df_ready.loc[:, ~is_target]
y = df_ready.loc[:, is_target]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=34,
    test_size=0.25,
)

Feature Scaling

In [None]:
# Standardise the continuous predictors while leaving one-hot dummy columns untouched.
# NOTE: this cell rebinds x_train/x_test/y_train/y_test to numpy arrays, so it
# must be run exactly once after the train/test split cell.

# Step 1, identify the one-hot columns. get_dummies names them '<column>_<value>',
# so they are recognised by prefix; the original column names in `dummies` no longer
# exist in the encoded frame. List comprehensions keep the column order deterministic.
# NOTE(review): a numeric column whose name starts with '<dummy column>_' would be
# misclassified as a dummy -- confirm no such column exists.
dummy_prefixes = tuple(col + '_' for col in dummies)
unscale = [col for col in x_train.columns if col.startswith(dummy_prefixes)]

# Step 2, every other predictor needs scaling ('target' is not part of x_train).
scale = [col for col in x_train.columns if col not in unscale]

# Step 3, create separate dataframes that are resp. scaled and unscaled from the
# test and training set. The scaler is fitted on the training data only and then
# applied to the test data, so no test-set statistics leak into the model.
sc_x = StandardScaler()
scaled_vars_train = pd.DataFrame(sc_x.fit_transform(x_train[scale]),
                                 index=x_train.index, columns=scale)
scaled_vars_test = pd.DataFrame(sc_x.transform(x_test[scale]),
                                index=x_test.index, columns=scale)
# Dummies may be bool dtype; cast to float so the final arrays are homogeneous
# numeric arrays rather than object arrays.
unscaled_vars_train = x_train[unscale].astype(float)
unscaled_vars_test = x_test[unscale].astype(float)

# Step 4, merge the dataframes back together (row indices are identical per set).
x_train = scaled_vars_train.join(unscaled_vars_train, how='inner')
x_test = scaled_vars_test.join(unscaled_vars_test, how='inner')

# Step 5, make a copy of the x_train dataframe for indexing column names with output later.
x_df = x_train.copy()

# Step 6, keep only the numpy arrays for ML applications:
x_train = x_train.copy().values
y_train = y_train.copy().values
x_test = x_test.copy().values
y_test = y_test.copy().values

Fit linear and radial (RBF/Gaussian) kernel Support Vector Machines via grid search and assess linearity.

In [None]:
# Tune an SVM with 10-fold cross-validated grid search over two kernel families:
# a linear kernel (regularisation strength only) and an RBF kernel
# (regularisation strength and kernel width gamma).
model = SVC(kernel='linear', random_state=34)

# The regularisation parameter of SVC is the upper-case 'C'; a lower-case 'c'
# is rejected by GridSearchCV as an invalid parameter.
parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'],
     'gamma': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]

grid_search = GridSearchCV(estimator=model, param_grid=parameters,
                           scoring='accuracy', cv=10, n_jobs=-1)
# y_train is a column vector; ravel() flattens it to the 1-D shape sklearn expects.
grid_search = grid_search.fit(x_train, y_train.ravel())

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(best_accuracy)
print(best_parameters)

In [None]:
# Evaluate the model in a consistent manner: 10-fold cross-validated accuracy.
# NOTE(review): `model` is the untuned linear SVC, not the best estimator found by
# the grid search above -- confirm whether grid_search.best_estimator_ was intended.
# cross_val_score takes the labels as lower-case `y`; `Y` is not a valid keyword.
report_accuracy = cross_val_score(estimator=model, X=x_train, y=y_train.ravel(), cv=10)

# Express both statistics as percentages; the fold scores are fractions in [0, 1].
mean_pct = report_accuracy.mean() * 100
std_pct = report_accuracy.std() * 100

print(f"{mean_pct:.2f}%,This is the mean accuracy of 10 model evaluations.")
print(f"{std_pct:.2f}% ,This means that on average"
      ", the differences of the 10 model accuracy estimation of the model is "
      f"{std_pct:.2f}%")