<h1>Imports</h1>

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [None]:
# Seed handed to train_test_split so the train/evaluation split is reproducible
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [None]:
# Folder holding train.csv, test.csv and sample_submission.csv, resolved
# relative to the directory the notebook is run from.
DATA_PATH = Path.cwd().joinpath("../../datasets")

In [None]:
# Read the training data, using the trip id as the row index
training_df = pd.read_csv(
    DATA_PATH / "train.csv",
    index_col="tripid",
)
# Preview the first rows
training_df.head()

<h1>Data Preprocessing</h1>

In [None]:
# Columns fed to the model.
features = training_df[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon','fare']]

# Encode the target as integers (1 = 'correct', 0 = 'incorrect').
# Series.map produces a numeric column directly; DataFrame.replace on string
# labels depends on object-dtype downcasting, which pandas 2.2 deprecates, and
# can leave the target as an object column.
mapping = {'correct': 1, 'incorrect': 0}
labels = training_df[['label']].assign(label=training_df['label'].map(mapping))

# map() turns any value absent from `mapping` into NaN; fail here with a clear
# message rather than passing a partially-encoded target to the classifier.
unmapped_labels = labels['label'].isna()
if unmapped_labels.any():
    raise ValueError(
        "Unexpected values in 'label' column: %r"
        % training_df.loc[unmapped_labels, 'label'].unique().tolist()
    )

In [None]:
# Split the feature columns by dtype family.
# Comparing dtypes against the literal "object" sends every non-object column
# (e.g. string, category or datetime dtypes) to the numeric pipeline, where the
# median imputer cannot handle it. select_dtypes picks real numeric columns;
# bool is listed explicitly so boolean columns stay on the numeric side as before.
numeric_kinds = ["number", "bool"]
numerical_features = features.select_dtypes(include=numeric_kinds).columns.values
categorical_features = features.select_dtypes(exclude=numeric_kinds).columns.values

<h1>Data Cleaning</h1>

<h1>Feature Engineering</h1>

<h1>Model Training</h1>

In [None]:

## numeric columns: fill gaps with the column median, then rescale with MinMaxScaler
numeric_preprocessing_steps = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="median")),
    ("minmax_scaler", MinMaxScaler()),
])

## non-numeric columns: fill gaps with the literal "missing", then one-hot encode;
## categories not seen while fitting are ignored at transform time
non_numeric_preprocessing_steps = Pipeline(steps=[
    ("simple_imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot_encoder", OneHotEncoder(handle_unknown="ignore")),
])

## preprocessor stage of the final pipeline: each column group goes through its
## own sub-pipeline, and columns belonging to neither group are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ("non_numeric", non_numeric_preprocessing_steps, categorical_features),
        ("numeric", numeric_preprocessing_steps, numerical_features),
    ],
    remainder="drop",
)

In [None]:
# One default-configured SVC per target column, wrapped so the model accepts
# the 2-D label frame.
estimators = MultiOutputClassifier(svm.SVC())

In [None]:
# End-to-end model: column preprocessing followed by the classifier
full_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

# Bare expression so the assembled pipeline is rendered as the cell output
full_pipeline

In [None]:

# Hold out 30% of the rows for evaluation; stratifying on the labels keeps the
# class proportions the same in both splits.
X_train, X_eval, y_train, y_eval = train_test_split(
    features,
    labels,
    test_size=0.3,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_SEED,
)

## Train the model
full_pipeline.fit(X_train, y_train)

# Predict the evaluation set once and derive the accuracy from those predictions
# (calling score() and then predict() would run the model over X_eval twice).
# A row counts as correct only when every output column matches, which is the
# same definition MultiOutputClassifier.score uses.
preds = full_pipeline.predict(X_eval)
eval_accuracy = np.mean(np.all(y_eval.to_numpy() == preds, axis=1))

# This is measured on the held-out split, so it is reported as evaluation accuracy
print("Evaluation Accuracy: %.2f" % (eval_accuracy * 100), "%")

<h1>Test-Set Prediction and Submission</h1>

In [None]:
# Read test.csv, using the trip id as the row index
test_set = pd.read_csv(
    DATA_PATH / "test.csv",
    index_col="tripid",
)
# Preview the first rows
test_set.head()

In [None]:
# NOTE: despite the name, this holds the output of predict() (class labels),
# not probabilities; the name is kept because a later cell reads it.
test_probs = full_pipeline.predict(test_set)

In [None]:
# Start from the sample submission so the output keeps its tripid index and layout
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")

# predict() returns one column per output, i.e. an (n, 1) array here; flatten it
# so a plain 1-D vector is stored in the single 'prediction' column.
# Values are assigned by position, so this relies on sample_submission.csv and
# test.csv listing trips in the same order.
submission_set['prediction'] = np.ravel(test_probs)

# Preview last, so the cell actually displays the filled-in frame
submission_set.head()

In [None]:
# Write the submission file. The output folder is created first because to_csv
# raises an OSError when the target directory does not exist.
submission_path = Path('../../submissions/model1/teamCluster_submission_02.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_set.to_csv(submission_path, index=True)
print("Completed!")

In [None]:
# Index label (tripid) of the first row holding the smallest prediction value
submission_set["prediction"].idxmin()

In [None]:
# How many test trips were assigned to each predicted class
submission_set["prediction"].value_counts()