1. split data into training and test sets
2. encode categorical features
3. hyperparameter grid search for SVM and kNN 
4. evaluate final model on test set

In [None]:
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [None]:
# Load the training CSV into a DataFrame.
train_csv_path = '/kaggle/input/customer-analytics/Train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Total number of missing cells across the whole frame
# (per-column counts summed again); 0 means no column has gaps.
data.isna().sum().sum()

In [None]:
# Print the frame summary (column names, dtypes, non-null counts).
# Author's tally of the columns -- NOTE(review): verify against the output:
#   7 continuous features
#   4 categorical features
#   1 target variable (arrived on time y/n)
data.info()

In [None]:
# Separate the predictors from the label, then hold out 20% of the rows
# as a test set, stratified on the label so both splits keep its balance.

X = data.iloc[:, 1:-1]  # independent variables: all columns except the first and the last
Y = data.iloc[:, -1]  # dependent variable (target): the last column

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=24,
)

print(f'number of training examples {len(x_train)}')
print(f'number of test examples {len(x_test)}')

In [None]:

# Partition the training columns by dtype so each group can get its own
# preprocessing. pandas parses decimal CSV columns as float64 (not float32),
# so float64 is listed here; with 'float32' a float column would match
# neither group and silently fall out of both feature lists.
num_feats = x_train.select_dtypes(include=['int64', 'float64']).columns
cat_feats = x_train.select_dtypes(include='object').columns
# Number of distinct levels in each categorical column.
print(x_train[cat_feats].nunique())

In [None]:
# Encode categorical features.
# First show the level counts of the columns whose categories are spelled
# out explicitly in the ordinal encoders below.
for column in ('Gender', 'Product_importance', 'Mode_of_Shipment'):
    print(x_train[column].value_counts())

# One (name, transformer, columns) triple per preprocessing step:
# numeric columns are min-max scaled, the warehouse block is one-hot
# encoded, and the remaining categoricals are mapped to integers in the
# listed category order.
# NOTE(review): Mode_of_Shipment is ordinal-encoded although its levels have
# no evident ordering -- confirm this is intended.
transforms = [
    ('num_t', MinMaxScaler(), list(num_feats)),
    (
        'warehouse',
        OneHotEncoder(categories='auto', sparse=False),
        ['Warehouse_block'],
    ),
    (
        'gender',
        OrdinalEncoder(categories=[['M', 'F']]),
        ['Gender'],
    ),
    (
        'importance',
        OrdinalEncoder(categories=[['low', 'medium', 'high']]),
        ['Product_importance'],
    ),
    (
        'shipment',
        OrdinalEncoder(categories=[['Ship', 'Flight', 'Road']]),
        ['Mode_of_Shipment'],
    ),
]

col_transforms = ColumnTransformer(transforms)

In [None]:
# Support vector classifier: preprocessing and model chained in one
# pipeline, tuned by an exhaustive grid search with 20-fold
# cross-validation and ROC AUC as the selection metric.
model_svc = SVC()
pipeline_svc = Pipeline(
    steps=[
        ('prep', col_transforms),
        ('mod', model_svc),
    ]
)
# The 'mod__' prefix routes each setting to the pipeline's 'mod' step.
params_svc = {
    'mod__C': [0.5, 1, 10],
    'mod__kernel': ['linear', 'rbf'],
    'mod__class_weight': [None, 'balanced'],
    'mod__random_state': [24],
}
search_svc = GridSearchCV(
    pipeline_svc,
    param_grid=params_svc,
    cv=20,
    n_jobs=-1,
    scoring='roc_auc',
)
search_svc.fit(x_train, y_train)
print(search_svc.best_params_)
print(search_svc.best_score_)

In [None]:
# k-nearest-neighbours classifier created with its default settings.
# NOTE(review): the outline at the top lists a kNN grid search, but none is
# visible in this section -- confirm whether it is still to be added.
model_knn = KNeighborsClassifier()

In [None]:
# Final model: an SVC with fixed hyperparameters, fitted on the training
# split and evaluated once on the held-out test split.
final_mod = SVC(C=0.5, class_weight='balanced', kernel='rbf', random_state=24)
# The pipeline must wrap final_mod: it is the configured final estimator,
# whereas model_knn is an untuned default classifier.
final_pipeline = Pipeline(steps=[('prep', col_transforms), ('mod', final_mod)])
final_pipeline.fit(x_train, y_train)
predictions = final_pipeline.predict(x_test)
# Test-set diagnostics: ROC curve and confusion matrix.
plot_roc_curve(final_pipeline, x_test, y_test)
plot_confusion_matrix(final_pipeline, x_test, y_test)


The visualisations (ROC curve and confusion matrix) show the model's false positive rate on the test set.