<h1>Imports</h1>

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from imblearn.combine import SMOTETomek


In [None]:
# Show up to 100 columns whenever a DataFrame is displayed.
pd.set_option("display.max_columns", 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Seed value for reproducibility; pass it as `random_state` to any randomised step.
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [None]:
# Dataset directory, resolved relative to the current working directory.
DATA_PATH = Path.cwd().joinpath("../../datasets")

In [None]:
# Read the training trips, using the `tripid` column as the index.
train_csv_path = DATA_PATH / "train.csv"
training_df = pd.read_csv(train_csv_path, index_col="tripid")
training_df.head()

In [None]:
# Discard every row that has no recorded fare (the regression target).
training_df = training_df.dropna(axis="index", how="any", subset=["fare"])

<h1>Data Preprocessing</h1>

In [None]:
# Parse both trip timestamps (month/day/year hour:minute) into datetime columns.
for timestamp_column in ('pickup_time', 'drop_time'):
  training_df[timestamp_column] = pd.to_datetime(
      training_df[timestamp_column], format="%m/%d/%Y %H:%M"
  )

In [None]:
# Bucket the pickup hour into right-closed bins: 0-8 -> dawn, 9-20 -> day, 21-23 -> night.
pickup_hour = training_df['pickup_time'].dt.hour
time_of_day = pd.cut(pickup_hour, bins=[-1, 8, 20, 24], labels=['dawn', 'day', 'night'])
training_df = training_df.assign(timeOfDay=time_of_day)

In [None]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  """Return the great-circle (haversine) distance in kilometres.

  Parameters
  ----------
  lat1, lon1 : float or array-like
      Latitude / longitude of the first point, in decimal degrees.
  lat2, lon2 : float or array-like
      Latitude / longitude of the second point, in decimal degrees.

  Returns
  -------
  float or array-like
      Distance along the sphere of radius 6371 km. Array / Series inputs
      are evaluated element-wise.
  """
  R = 6371  # Earth radius in km

  # coordinate differences and latitudes, converted to radians
  d_lat = np.radians(lat2 - lat1)
  d_lon = np.radians(lon2 - lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  # haversine formula
  a = np.sin(d_lat / 2.) ** 2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon / 2.) ** 2

  # Floating-point rounding can push `a` marginally outside [0, 1]
  # (e.g. for antipodal points), which would turn arcsin(sqrt(a)) into NaN.
  a = np.clip(a, 0.0, 1.0)

  return 2 * R * np.arcsin(np.sqrt(a))

In [None]:
# Pickup -> drop-off great-circle distance (km) for every trip, computed in a
# single vectorised call instead of a Python-level row loop.
new_column = dist_from_coordinates(
    training_df['pick_lat'], training_df['pick_lon'],
    training_df['drop_lat'], training_df['drop_lon'],
).to_numpy()

# Insert at column position 4; plain values are passed so no index alignment is involved.
training_df.insert(4, "distance", new_column)

In [None]:
# Display training_df with the newly inserted distance column.
training_df

In [None]:
# Keep only the trips whose label is 'correct'.
is_correct_trip = training_df['label'].eq('correct')
correct_training_df = training_df.loc[is_correct_trip]

In [None]:
# Model inputs and the regression target, both taken from the correct-labelled trips.
predictor_columns = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'distance', 'timeOfDay',
]
features_predictor = correct_training_df[predictor_columns]
# Kept as a single-column DataFrame (not a Series).
fare_prediction = correct_training_df[['fare']]

In [None]:
# Check the column dtypes; they drive the numeric / categorical split below.
features_predictor.dtypes

In [None]:
# Split the predictor columns by dtype: float64 -> numeric branch, category -> one-hot branch.
numerical_features = features_predictor.select_dtypes(include=["float64"]).columns.values
categorical_features = features_predictor.select_dtypes(include=["category"]).columns.values

In [None]:
# Hold out 30% of the rows for evaluation. Seeding the shuffle keeps the split
# (and every score derived from it) reproducible across kernel restarts.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_predictor, fare_prediction,
    test_size=0.3, shuffle=True, random_state=RANDOM_SEED,
)

<h1>Data Cleaning</h1>

<h1>Feature Engineering</h1>

<h1>Model Training</h1>

In [None]:

## numeric branch: fill missing values with the column median
numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='median')),
])

## categorical branch: fill gaps with the literal 'missing', then one-hot encode
## (categories not seen during fit are encoded as all zeros)
non_numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

## preprocessor stage of the final pipeline: categorical output first,
## numeric output second, any column not listed is dropped
column_branches = [
    ('non_numeric', non_numeric_preprocessing_steps, categorical_features),
    ('numeric', numeric_preprocessing_steps, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='drop')

In [None]:
# Column labels for the preprocessor output: three one-hot time-of-day
# indicators followed by the numeric features.
one_hot_columns = ['isDawn', 'isDay', 'isNight']
numeric_columns = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'distance',
]
new_columns = one_hot_columns + numeric_columns

In [None]:
# Fit the preprocessor on the training split and label the resulting matrix.
preprocessed_train_features = preprocessor.fit_transform(X_train)
preprocessed_train_features_data_frame = pd.DataFrame(
    preprocessed_train_features,
    columns=new_columns,
)

In [None]:
# Apply the preprocessor that was fitted on the training split. Re-fitting here
# would learn medians and one-hot categories from the evaluation data, which
# leaks information and can change the column layout the model was trained on.
preprocessed_test_features = preprocessor.transform(X_eval)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

In [None]:
estimator = LinearRegression()
## Train the model on the labelled frame so the feature names seen at fit time
## match the DataFrames passed to score/predict later on.
estimator.fit(preprocessed_train_features_data_frame, y_train)

# `score` on a regressor returns R^2, and it is evaluated here on the held-out
# evaluation split (not on the training data).
eval_r2 = estimator.score(preprocessed_test_features_data_frame, y_eval)
print("Evaluation R^2: %.4f" % eval_r2)

# Predict for the evaluation set
y_pred = estimator.predict(preprocessed_test_features_data_frame)

In [None]:
# Same feature columns for every row of training_df.
required_features = training_df[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup','distance','timeOfDay']]
# Reuse the already-fitted preprocessor: re-fitting on the full data would
# change the imputation medians and one-hot layout underneath the trained model.
preprocessed_full_features = preprocessor.transform(required_features)
preprocessed_full_features_data_frame = pd.DataFrame(data=preprocessed_full_features, columns=new_columns)

In [None]:
# Predict a fare for every row of the preprocessed full feature frame.
predicted_prices = estimator.predict(preprocessed_full_features_data_frame)

In [None]:
# Inspect the preprocessed feature frame that was fed to the model.
preprocessed_full_features_data_frame

In [None]:
# Inspect the raw prediction array.
predicted_prices

In [None]:
# The model was fitted on a single-column DataFrame target, so `predict`
# returns shape (n, 1); flatten to 1-D before inserting it as a column.
training_df.insert(10, 'predicted_price', np.ravel(predicted_prices))

In [None]:
# Inspect training_df, which now carries the predicted_price column.
training_df

In [None]:
# Fare prediction is a regression task: a confusion matrix (a classification
# metric) cannot be computed from continuous y_eval / y_pred, and assigning the
# result to `confusion_matrix` overwrote the function imported from
# sklearn.metrics. Summarise the evaluation residuals instead.
eval_residuals = pd.Series(
    np.ravel(y_eval) - np.ravel(y_pred), index=y_eval.index, name="residual"
)
eval_residuals.describe()

In [None]:
# classification_report needs discrete class labels and fails on continuous
# fares; report standard regression error metrics for the evaluation split.
actual_fares = np.ravel(y_eval)
predicted_fares = np.ravel(y_pred)
fare_errors = actual_fares - predicted_fares
total_variation = np.sum((actual_fares - actual_fares.mean()) ** 2)
print("MAE : %.3f" % np.mean(np.abs(fare_errors)))
print("RMSE: %.3f" % np.sqrt(np.mean(fare_errors ** 2)))
print("R^2 : %.4f" % (1 - np.sum(fare_errors ** 2) / total_variation))

In [None]:
# LinearRegression exposes no `feature_importances_` (that attribute belongs to
# tree-based models); its fitted per-feature weights live in `coef_`.
pd.Series(np.ravel(estimator.coef_), index=new_columns, name="coefficient")

<h1>Test-Set Prediction and Submission</h1>

In [None]:
# Read the test trips, using the `tripid` column as the index.
test_csv_path = DATA_PATH / "test.csv"
test_set = pd.read_csv(test_csv_path, index_col="tripid")
test_set.head()

In [None]:
# The model was fitted on the preprocessed matrix (one-hot time-of-day plus the
# numeric columns incl. distance), so the test trips must go through the same
# feature engineering and the already-fitted preprocessor. Feeding raw columns
# (and the target `fare` itself) does not match the fitted feature layout.
# NOTE(review): assumes test.csv has the same coordinate / timestamp columns
# and timestamp format as train.csv - confirm.
test_pickup_time = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_features = test_set[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup']].assign(
    distance=dist_from_coordinates(test_set['pick_lat'], test_set['pick_lon'],
                                   test_set['drop_lat'], test_set['drop_lon']),
    timeOfDay=pd.cut(test_pickup_time.dt.hour, [-1, 8, 20, 24], labels=['dawn', 'day', 'night']),
)
preprocessed_test_set = pd.DataFrame(
    data=preprocessor.transform(test_features), columns=new_columns, index=test_set.index
)

# NOTE(review): these are predicted fares, not class labels; if the submission
# expects a correct/incorrect label, derive it from predicted vs. metered fare.
test_probs = np.ravel(estimator.predict(preprocessed_test_set))

In [None]:
# Load the submission template (indexed by tripid) and fill in the predictions.
# NOTE(review): assignment is positional - assumes the template rows are in the
# same order as test.csv; confirm.
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set['prediction'] = test_probs

# Preview last: only the final expression of a cell is displayed, so a
# `head()` in the middle of the cell showed nothing.
submission_set.head()

In [None]:
submission_path = Path('../../submissions/model6/teamCluster_submission_01.csv')
# Create the output folder first so a fresh checkout does not fail on to_csv.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_set.to_csv(submission_path, index=True)
print("Completed!")

In [None]:
# Index label (tripid) of the row with the smallest prediction.
submission_set['prediction'].idxmin()

In [None]:
# Frequency of each distinct prediction value.
submission_set['prediction'].value_counts()