<b>PLEASE NOTE THAT THIS NOTEBOOK WAS RUN IN GOOGLE COLAB. AS SUCH, CERTAIN IMPORT STATEMENTS MAY DIFFER.</b>

<h1>Initial Setup</h1>

In [None]:
!pip install catboost



In [None]:
!pip install pyproj

Collecting pyproj
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c3/071e080230ac4b6c64f1a2e2f9161c9737a2bc7b683d2c90b024825000c0/pyproj-2.6.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (10.9MB)
[K     |████████████████████████████████| 10.9MB 2.7MB/s 
[?25hInstalling collected packages: pyproj
Successfully installed pyproj-2.6.1.post1


<h2>Import Packages</h2>

In [None]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import math
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import os
import glob
import pyproj
from sklearn.cluster import DBSCAN, KMeans
import datetime

<h2>Loading the Dataset</h2>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# by the loading cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# The CSV path is absolute, so it is used directly: os.path.join() discards
# every component that precedes an absolute one, which made the former
# dirname("__file__") / '..' prefix dead code.
train_path = '/content/drive/My Drive/datasets/train.csv'
# One row per trip, indexed by its trip id.
training_df = pd.read_csv(train_path, index_col="tripid")

**Geospatial clustering method additions**

In [None]:
def convert_to_utm(df, src_epsg, dst_epsg, col_lat, col_lon, alias_lon=None,
                   alias_lat=None):
    """
    Re-project two coordinate columns of ``df`` from one EPSG system to
    another using pyproj, and print the definition of both projections.

    The frame is modified in place and also returned. For a UTM destination
    the resulting x/y values are expressed in meters.
    :param df: DataFrame input
    :param src_epsg: Coordinate system of the source points (e.g. 4326, WGS84);
    :param dst_epsg: Coordinate system to convert the input to;
    :param col_lat: Latitude column name;
    :param col_lon:  Longitude column name;
    :param alias_lon: Column receiving the converted x values
        (default: overwrite ``col_lon``);
    :param alias_lat: Column receiving the converted y values
        (default: overwrite ``col_lat``);
    :return: the same DataFrame object, with the converted columns set.
    """
    source_proj = pyproj.Proj(src_epsg, preserve_units=True)
    target_proj = pyproj.Proj(dst_epsg, preserve_units=True)
    print("Formal definition string for the old projection:",
          source_proj.definition_string())
    print("Formal definition string for the new projection:",
          target_proj.definition_string())

    # Source coordinates first pass through the source projection, then get
    # transformed into the destination system.
    src_x, src_y = source_proj(df[col_lon].values, df[col_lat].values)
    dst_x, dst_y = pyproj.transform(source_proj, target_proj, src_x, src_y)

    # Without an alias the converted values replace the input column.
    out_lon = col_lon if alias_lon is None else alias_lon
    out_lat = col_lat if alias_lat is None else alias_lat

    df[out_lon] = dst_x
    df[out_lat] = dst_y

    return df

In [None]:
from datetime import timedelta

class STDBSCAN(object):
    """
    ST-DBSCAN: density-based clustering in which two points are neighbours
    only when they are close both in space and in time.

    Usage: ``STDBSCAN(...).fit_transform(df, col_lat, col_lon, col_time)``.
    """

    def __init__(self, spatial_threshold=500.0, temporal_threshold=60.0,
                 min_neighbors=15):
        """
        Python ST-DBSCAN implementation.
        Because this algorithm needs to calculate multiple distances between
        points, it uses the plain Euclidean formula and therefore assumes the
        latitude and longitude columns are in UTM projection. If they are
        not, convert them first (see `convert_to_utm`).
        :param spatial_threshold: Maximum spatial distance between two
             neighbours, in the units of the coordinate columns (meters for
             UTM);
        :param temporal_threshold: Maximum time difference between two
             neighbours (seconds);
        :param min_neighbors: Minimum number of neighbours (the point itself
             not counted) a point needs to be a core point;
        """
        self.spatial_threshold = spatial_threshold
        self.temporal_threshold = temporal_threshold
        self.min_neighbors = min_neighbors

    def _retrieve_neighbors(self, index_center, matrix):
        """
        Return the row ids of every point within both thresholds of a point.

        :param index_center: row position of the centre point in ``matrix``;
        :param matrix: 2-D array with columns 0=x, 1=y, 2=time, 3=cluster,
             4=row id (as built by ``fit_transform``);
        :return: list of row ids of the neighbours, centre point excluded.
        """
        center_point = matrix[index_center, :]

        # filter by time
        window = timedelta(seconds=self.temporal_threshold)
        min_time = center_point[2] - window
        max_time = center_point[2] + window
        matrix = matrix[(matrix[:, 2] >= min_time) &
                        (matrix[:, 2] <= max_time), :]
        # filter by distance; squared distances are compared against the
        # squared threshold so no square root is needed
        dx = matrix[:, 0] - center_point[0]
        dy = matrix[:, 1] - center_point[1]
        squared_dist = dx * dx + dy * dy
        neighborhood = matrix[squared_dist <= (
            self.spatial_threshold * self.spatial_threshold), 4].tolist()
        # the centre always matches itself; it is not its own neighbour
        neighborhood.remove(index_center)

        return neighborhood

    def fit_transform(self, df, col_lat, col_lon, col_time,
                      col_cluster='cluster'):
        """
        Cluster the rows of ``df`` and return them with their cluster label.

        Clusters are numbered from 1; noise points get the label -1.
        The input frame is left untouched.
        :param df: DataFrame input
        :param col_lat: Latitude column name;
        :param col_lon:  Longitude column name;
        :param col_time: Date time column name;
        :param col_cluster: Alias for predicted cluster (default, 'cluster');
        :return: new DataFrame with the columns ``col_lon``, ``col_lat``,
             ``col_time`` and ``col_cluster``.
        """
        cluster_label = 0
        noise = -1
        unmarked = 777777  # sentinel for "not visited yet"
        stack = []

        # initial setup
        # Work on an explicit copy: adding columns to a bare column selection
        # is what triggers pandas' SettingWithCopyWarning.
        df = df[[col_lon, col_lat, col_time]].copy()
        df[col_cluster] = unmarked
        # Matrix layout: 0=x, 1=y, 2=time, 3=cluster, 4=row id. The row id
        # column exists only in the matrix, not in the returned frame.
        matrix = df.assign(index=range(df.shape[0])).values

        # for each point in database
        for index in range(matrix.shape[0]):
            if matrix[index, 3] != unmarked:
                continue

            neighborhood = self._retrieve_neighbors(index, matrix)

            if len(neighborhood) < self.min_neighbors:
                matrix[index, 3] = noise
                continue

            # found a core point: open a new cluster
            cluster_label += 1
            matrix[index, 3] = cluster_label

            # assign core's label to its neighborhood
            for neig_index in neighborhood:
                matrix[neig_index, 3] = cluster_label
                stack.append(neig_index)  # append neighbors to stack

            # find new neighbors from core point neighborhood
            while len(stack) > 0:
                current_point_index = stack.pop()
                new_neighborhood = \
                    self._retrieve_neighbors(current_point_index, matrix)

                # current_point is a new core: absorb every neighbour that
                # does not belong to a cluster yet
                if len(new_neighborhood) >= self.min_neighbors:
                    for neig_index in new_neighborhood:
                        if matrix[neig_index, 3] in (noise, unmarked):
                            matrix[neig_index, 3] = cluster_label
                            stack.append(neig_index)

        df[col_cluster] = matrix[:, 3]
        return df

In [None]:
def parse_dates(x):
    """
    Parse a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime.datetime``.

    :param x: timestamp text in the format ``%Y-%m-%d %H:%M:%S``;
    :return: the corresponding naive ``datetime.datetime``;
    :raises ValueError: if ``x`` does not match the format.
    """
    # The notebook binds the name ``datetime`` to the module
    # (``import datetime``), which has no ``strptime``; import the class
    # locally so the call works regardless of that binding.
    from datetime import datetime as _datetime
    return _datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

<h1>Feature Addition</h1>

In [None]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  """
  Great-circle (haversine) distance in kilometres between two points.

  Built only from NumPy functions, so scalars and equally-shaped arrays are
  both accepted.
  :param lat1: latitude of the first point, in degrees;
  :param lon1: longitude of the first point, in degrees;
  :param lat2: latitude of the second point, in degrees;
  :param lon2: longitude of the second point, in degrees;
  :return: distance in km, same shape as the inputs.
  """
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.)**2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2
  # rounding can push `a` marginally outside [0, 1] (near-antipodal points),
  # which would make sqrt/arcsin return NaN
  a = np.clip(a, 0.0, 1.0)

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [None]:
# Parse both trip timestamps from "month/day/year hour:minute" text.
for time_col in ('pickup_time', 'drop_time'):
    training_df[time_col] = pd.to_datetime(training_df[time_col], format="%m/%d/%Y %H:%M")

In [None]:
# Bucket the pickup hour into a part of the day. pd.cut bins are
# right-inclusive: (-1, 8] -> 'dawn', (8, 20] -> 'day', (20, 24] -> 'night'.
pickup_hour = training_df.pickup_time.dt.hour
training_df = training_df.assign(
    timeOfDay=pd.cut(pickup_hour, [-1, 8, 20, 24], labels=['dawn', 'day', 'night']))

In [None]:
# Flag trips picked up during the 'day' bucket with 1, every other trip with 0.
is_day_trip = training_df['timeOfDay'] == 'day'
training_df.loc[is_day_trip, 'isNormalCharge'] = 1
training_df.loc[~is_day_trip, 'isNormalCharge'] = 0

In [None]:
# 'time_dif' is the reported duration when it is a positive number; otherwise
# it falls back to the seconds between pickup and drop-off, with a zero
# fallback treated as missing. Computed column-wise instead of row by row.
provided_duration = training_df['duration']
# NOTE(review): `.dt.seconds` is the seconds component only (whole days are
# ignored), exactly like the per-row `.seconds` used before — confirm that no
# trip spans more than a day.
elapsed_seconds = (training_df['drop_time'] - training_df['pickup_time']).dt.seconds
elapsed_seconds = elapsed_seconds.where(elapsed_seconds != 0)
# `> 0` is False for NaN, zero and negative durations alike.
durations = provided_duration.where(provided_duration > 0, elapsed_seconds).values

training_df.insert(4,"time_dif",durations)

In [None]:
# Haversine distance between pickup and drop-off for every trip, in one
# vectorised call: dist_from_coordinates only uses NumPy functions, so it
# accepts whole columns as well as single values.
new_column = dist_from_coordinates(
    training_df['pick_lat'].values, training_df['pick_lon'].values,
    training_df['drop_lat'].values, training_df['drop_lon'].values)

training_df.insert(4,"distance",new_column)

In [None]:
# Time spent moving: reported duration minus metered waiting time.
# Uses the raw 'duration' column, not the repaired 'time_dif'.
training_df['time_driven'] = training_df['duration'].sub(training_df['meter_waiting'])

In [None]:
# Waiting fare scaled to an hourly rate (fare / waiting time * 3600);
# trips with zero waiting time get NaN instead of a division by zero.
# Computed column-wise instead of row by row.
waiting_time = training_df['meter_waiting']
chargeperhours = (training_df['meter_waiting_fare'] / waiting_time * 3600).where(waiting_time != 0).values

training_df.insert(4,'charge_per_hour',chargeperhours)


In [None]:
# Part of the fare attributed to driving: total minus waiting and additional fares.
training_df['driving_fare'] = (
    training_df['fare']
    .sub(training_df['meter_waiting_fare'])
    .sub(training_df['additional_fare'])
)

In [None]:
# Average speed as distance / driving time * 3600; trips with zero driving
# time get NaN instead of a division by zero.
# Computed column-wise instead of row by row.
driving_time = training_df['time_driven']
avgspeeds = (training_df['distance'] / driving_time * 3600).where(driving_time != 0).values

training_df.insert(4,"avg_speed",avgspeeds)



In [None]:
# Driving fare per unit of distance; trips with zero distance get NaN
# instead of a division by zero. Computed column-wise instead of row by row.
trip_distance = training_df['distance']
costsperkm = (training_df['driving_fare'] / trip_distance).where(trip_distance != 0).values

training_df.insert(4,"cost_per_km",costsperkm)

In [None]:
# Encode the textual target: 'incorrect' -> 0, 'correct' -> 1.
label_codes = {'incorrect': 0, 'correct' : 1}
training_df = training_df.replace({'label': label_codes})

<h2>Spatial Geolocation clustering</h2>

In [None]:
# The CSV path is absolute, so it is used directly: os.path.join() discards
# every component that precedes an absolute one, which made the former
# dirname("__file__") / '..' prefix dead code.
test_path = '/content/drive/My Drive/datasets/test.csv'
# One row per trip, indexed by its trip id.
test_set = pd.read_csv(test_path, index_col="tripid")

In [None]:
# Parse both trip timestamps from "month/day/year hour:minute" text.
for time_col in ('pickup_time', 'drop_time'):
    test_set[time_col] = pd.to_datetime(test_set[time_col], format="%m/%d/%Y %H:%M")

In [None]:
# Four independent, empty frames that will receive the pickup / drop-off
# locations of the training and test trips.
(pickup_locations,
 drop_locations,
 test_pickup_locations,
 test_drop_locations) = (pd.DataFrame() for _ in range(4))

In [None]:
# Copy the pickup / drop-off coordinates and timestamps of the training trips
# into frames that share the generic column names 'lat', 'lon' and 'time'.
pickup_locations[['lat','lon','time']] = training_df[['pick_lat','pick_lon','pickup_time']]
drop_locations[['lat','lon','time']] = training_df[['drop_lat','drop_lon','drop_time']]

In [None]:
# Same as for the training trips: test pickup / drop-off coordinates and
# timestamps under the generic column names 'lat', 'lon' and 'time'.
test_pickup_locations[['lat','lon','time']] = test_set[['pick_lat','pick_lon','pickup_time']]
test_drop_locations[['lat','lon','time']] = test_set[['drop_lat','drop_lon','drop_time']]

In [None]:
# Move each frame's index into a regular column and renumber the rows from 0.
pickup_locations, drop_locations = (
    frame.reset_index() for frame in (pickup_locations, drop_locations))
test_pickup_locations, test_drop_locations = (
    frame.reset_index() for frame in (test_pickup_locations, test_drop_locations))

In [None]:
# Stack the training pickups first and the test pickups second, renumbering
# the rows. Later cells split the cluster labels back apart by row position,
# so this order matters.
df = pd.concat([pickup_locations,test_pickup_locations],ignore_index=True )

In [None]:
# Make sure 'time' is a datetime column.
# NOTE(review): the source columns were already parsed with pd.to_datetime in
# earlier cells, so this is presumably a no-op — confirm.
df['time'] = pd.to_datetime(df['time'], format="%m/%d/%Y %H:%M")

In [None]:
# Keep only the time-of-day part of each timestamp (the date is dropped).
# Not re-runnable: after this cell 'time' no longer has a .dt accessor.
df['time'] = df['time'].dt.time

In [None]:
# Give every row the same fixed calendar date.
df['day'] = datetime.date(2011, 1, 1)

In [None]:
# Rebuild a full timestamp from the fixed date and the time of day, so all
# rows fall on the same day and differ only by their time of day.
df['daytime'] = pd.to_datetime(df['day'].astype(str)+' '+df['time'].astype(str))

In [None]:
# Cluster pickups that lie within 1000 m and 600 s (time of day) of each
# other; a point needs at least 5 such neighbours to seed a cluster.
st_dbscan = STDBSCAN(spatial_threshold=1000, temporal_threshold=600,
                         min_neighbors=5)

# The trips are located around longitude 79.8-80.0 E, which is UTM zone 44N
# (EPSG:32644). The previously used EPSG:32633 is zone 33N, centred on 15 E;
# that far outside its zone the projected distances are heavily distorted and
# the metre-based spatial threshold loses its meaning.
# convert_to_utm overwrites 'lat'/'lon' with the projected coordinates.
df = convert_to_utm(df, src_epsg=4326, dst_epsg=32644,
                        col_lat='lat', col_lon='lon')

result_t601 = st_dbscan.fit_transform(df, col_lat='lat',
                                          col_lon='lon',
                                          col_time='daytime')

Formal definition string for the old projection: proj=longlat datum=WGS84 no_defs ellps=WGS84 towgs84=0,0,0
Formal definition string for the new projection: proj=utm zone=33 datum=WGS84 units=m no_defs ellps=WGS84 towgs84=0,0,0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

In [None]:
# Number of points per cluster label; -1 is the label given to noise points.
result_t601['cluster'].value_counts()

 2     18556
-1      3238
 3      2520
 1       189
 4       110
       ...  
 54        6
 89        6
 44        6
 85        6
 45        5
Name: cluster, Length: 90, dtype: int64

In [None]:
# (rows, columns) of the training frame at this point.
training_df.shape

(17176, 22)

In [None]:
# `df` was built as [training pickups, test pickups], so the first
# len(pickup_locations) cluster labels belong to the training trips.
# Deriving the count replaces the hard-coded 17176; a length mismatch with
# training_df still raises instead of silently misaligning labels.
n_train_rows = len(pickup_locations)
training_df['pickup_cluster_label'] = result_t601['cluster'].iloc[:n_train_rows].values.tolist()

In [None]:
# Shift the labels by one (the noise label -1 becomes 0) and store them as
# text so the column can be used as a categorical feature.
training_df['pickup_cluster_label'] = training_df['pickup_cluster_label'].add(1).astype(str)

In [None]:
#training_df['drop_cluster_label'] = result_t601['cluster'][17176:17176*2].values.tolist()

<h2>Final Feature Selection</h2>

In [None]:
# List the columns available for feature selection.
training_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'cost_per_km', 'avg_speed', 'charge_per_hour', 'distance', 'time_dif',
       'meter_waiting_till_pickup', 'pickup_time', 'drop_time', 'pick_lat',
       'pick_lon', 'drop_lat', 'drop_lon', 'fare', 'label', 'timeOfDay',
       'isNormalCharge', 'time_driven', 'driving_fare',
       'pickup_cluster_label'],
      dtype='object')

In [None]:
# Model inputs, in the order they are fed to the classifier.
training_columns = [
    # raw meter readings and fares
    'meter_waiting', 'meter_waiting_fare', 'fare', 'additional_fare',
    # engineered features
    'distance', 'cost_per_km', 'avg_speed', 'time_dif', 'time_driven',
    'charge_per_hour', 'driving_fare', 'isNormalCharge',
    # raw coordinates
    'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
    # ST-DBSCAN pickup cluster (categorical)
    'pickup_cluster_label',
]

In [None]:
# Target to predict. Kept as a one-element list, so indexing the frame with
# it yields a single-column DataFrame rather than a Series.
target_column = ['label']

In [None]:
# Remove one trip by its trip id.
# NOTE(review): the reason for excluding this trip is not recorded here —
# presumably an outlier; confirm.
training_df = training_df.drop(index=190167541)

In [None]:
# Confirm the row count after removing the trip.
training_df.shape

(17175, 23)

In [None]:
# Feature matrix and (single-column) target frame.
x, y = training_df[training_columns], training_df[target_column]

In [None]:
# Hold out 20% of the trips, keeping the label ratio equal in both parts;
# the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Class balance of the target.
training_df['label'].value_counts()

1    15494
0     1681
Name: label, dtype: int64

<h1>Model Training</h1>

<h2>Eval Metric Definition</h2>

In [None]:
def focal_loss_lgb_eval_error(y_true, y_pred, alpha=.25, gamma=2.):
    """
    Focal loss as an evaluation metric in the (name, value, is_higher_better)
    form.

    :param y_true: binary labels (0 / 1);
    :param y_pred: raw scores (logits); the sigmoid is applied here;
    :param alpha: weight of the positive class (negatives get 1 - alpha);
    :param gamma: focusing exponent that down-weights well-classified points;
    :return: tuple ``('focal_loss', mean loss, False)`` — lower is better.
    """
    a,g = alpha, gamma
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)
    # exp() overflows for very negative scores; the result saturates to
    # p == 0, which the clipping below handles.
    with np.errstate(over='ignore'):
        p = 1/(1+np.exp(-y_pred))
    # Saturated scores give p == 0 or p == 1 exactly, and 0 * log(0) is NaN;
    # keep p strictly inside (0, 1) so the loss stays finite.
    eps = 1e-15
    p = np.clip(p, eps, 1 - eps)
    class_weight = a*y_true + (1-a)*(1-y_true)
    prob_of_truth = y_true*p + (1-y_true)*(1-p)
    log_likelihood = y_true*np.log(p)+(1-y_true)*np.log(1-p)
    loss = -class_weight * ((1 - prob_of_truth)**g) * log_likelihood
    return 'focal_loss', np.mean(loss), False

In [None]:
from sklearn.metrics import f1_score
def evaluate_macroF1_lgb(y_true, y_pred):
    """
    Macro-averaged F1 as an evaluation metric in the
    (name, value, is_higher_better) form.

    :param y_true: true class labels;
    :param y_pred: predicted scores, thresholded at 0.5;
    :return: tuple ``('macroF1', score, True)`` — higher is better.
    """
    # Scores strictly below 0.5 become class 0, everything else class 1.
    below_threshold = y_pred < 0.5
    y_hat = np.where(below_threshold, 0, 1)
    macro_f1 = f1_score(y_true, y_hat, average='macro')
    return 'macroF1', macro_f1, True

<h2>Training the Model</h2>

In [None]:
# 5-fold stratified splitter, shuffled with a fixed seed for repeatability.
gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
from catboost import CatBoostClassifier

In [None]:
# (rows, features) of the training split.
x_train.shape

(13740, 17)

In [None]:
# The cluster label is the only categorical input. Its position is looked up
# from the feature list instead of being hard-coded as 16, so reordering or
# extending `training_columns` cannot silently mark the wrong column.
cluster_feature_index = training_columns.index('pickup_cluster_label')
# class_weights gives class 0 a weight of 1.5 against 1 for class 1.
model = CatBoostClassifier(class_weights=[1.5,1], learning_rate=0.1, cat_features=[cluster_feature_index])

In [None]:
# Stratified fold loop with early stopping on each fold's validation part.
# Each `fit` call trains `model` anew (CatBoost refits from scratch unless
# `init_model` is given), so after the loop `model` is the fit from the last
# fold. The macro-F1 of every fold is recorded so the other folds still
# contribute a cross-validation estimate instead of being discarded.
fold_scores = []
for training_index, testing_index in gkf.split(X=x_train, y=y_train):
    x_train_fold, y_train_fold = x_train.iloc[training_index], y_train.iloc[training_index]
    x_test_fold, y_test_fold = x_train.iloc[testing_index], y_train.iloc[testing_index]
    model.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold,y_test_fold), early_stopping_rounds=50)
    fold_scores.append(metrics.f1_score(y_test_fold, model.predict(x_test_fold), average='macro'))

print("Validation macro-F1 per fold:", np.round(fold_scores, 4), "mean:", np.mean(fold_scores))

0:	learn: 0.5923475	test: 0.5924855	best: 0.5924855 (0)	total: 72.6ms	remaining: 1m 12s
1:	learn: 0.5115463	test: 0.5122463	best: 0.5122463 (1)	total: 98.6ms	remaining: 49.2s
2:	learn: 0.4487203	test: 0.4501351	best: 0.4501351 (2)	total: 129ms	remaining: 42.9s
3:	learn: 0.4047804	test: 0.4070804	best: 0.4070804 (3)	total: 159ms	remaining: 39.7s
4:	learn: 0.3683624	test: 0.3723379	best: 0.3723379 (4)	total: 180ms	remaining: 35.8s
5:	learn: 0.3409077	test: 0.3457336	best: 0.3457336 (5)	total: 200ms	remaining: 33.1s
6:	learn: 0.3180592	test: 0.3245757	best: 0.3245757 (6)	total: 221ms	remaining: 31.3s
7:	learn: 0.2966076	test: 0.3036839	best: 0.3036839 (7)	total: 242ms	remaining: 30s
8:	learn: 0.2788393	test: 0.2850168	best: 0.2850168 (8)	total: 263ms	remaining: 29s
9:	learn: 0.2649947	test: 0.2707657	best: 0.2707657 (9)	total: 289ms	remaining: 28.6s
10:	learn: 0.2555058	test: 0.2615224	best: 0.2615224 (10)	total: 309ms	remaining: 27.8s
11:	learn: 0.2486142	test: 0.2536707	best: 0.2536707 

<h2>Model Evaluation</h2>

In [None]:
# Predict on the held-out split and on the complete labelled set.
# NOTE(review): `x` is the full feature frame (training rows plus the
# held-out rows), so `train_pred_y` is not a pure training-set prediction —
# confirm this is intended.
predicted_y = model.predict(x_test)
train_pred_y = model.predict(x)

In [None]:
# Classification report and confusion matrix, first for the complete
# labelled set, then for the held-out split.
for actual, predicted in ((y, train_pred_y), (y_test, predicted_y)):
    print()
    print(metrics.classification_report(actual, predicted))
    print()
    print(metrics.confusion_matrix(actual, predicted))


              precision    recall  f1-score   support

           0       0.92      0.74      0.82      1681
           1       0.97      0.99      0.98     15494

    accuracy                           0.97     17175
   macro avg       0.95      0.87      0.90     17175
weighted avg       0.97      0.97      0.97     17175


[[ 1244   437]
 [  111 15383]]

              precision    recall  f1-score   support

           0       0.86      0.69      0.76       336
           1       0.97      0.99      0.98      3099

    accuracy                           0.96      3435
   macro avg       0.91      0.84      0.87      3435
weighted avg       0.96      0.96      0.96      3435


[[ 231  105]
 [  39 3060]]


<h1>Testing</h1>

<h2>Loading the Test Dataset</h2>

In [None]:
# Reload the raw test trips. The path is absolute, so it is used directly:
# os.path.join() discards every component that precedes an absolute one,
# which made the former dirname("__file__") / '..' prefix dead code.
test_path = '/content/drive/My Drive/datasets/test.csv'
test_set = pd.read_csv(test_path, index_col="tripid")

<h2>Feature Addition on testing dataset</h2>

In [None]:
# Parse both trip timestamps from "month/day/year hour:minute" text.
for time_col in ('pickup_time', 'drop_time'):
    test_set[time_col] = pd.to_datetime(test_set[time_col], format="%m/%d/%Y %H:%M")

In [None]:
# Bucket the pickup hour into a part of the day. pd.cut bins are
# right-inclusive: (-1, 8] -> 'dawn', (8, 20] -> 'day', (20, 24] -> 'night'.
pickup_hour = test_set.pickup_time.dt.hour
test_set = test_set.assign(
    timeOfDay=pd.cut(pickup_hour, [-1, 8, 20, 24], labels=['dawn', 'day', 'night']))

In [None]:
# Flag trips picked up during the 'day' bucket with 1, every other trip with 0.
is_day_trip = test_set['timeOfDay'] == 'day'
test_set.loc[is_day_trip, 'isNormalCharge'] = 1
test_set.loc[~is_day_trip, 'isNormalCharge'] = 0

In [None]:
# 'time_dif' is the reported duration when it is a positive number; otherwise
# it falls back to the seconds between pickup and drop-off, with a zero
# fallback treated as missing. Computed column-wise instead of row by row.
provided_duration = test_set['duration']
# NOTE(review): `.dt.seconds` is the seconds component only (whole days are
# ignored), exactly like the per-row `.seconds` used before — confirm that no
# trip spans more than a day.
elapsed_seconds = (test_set['drop_time'] - test_set['pickup_time']).dt.seconds
elapsed_seconds = elapsed_seconds.where(elapsed_seconds != 0)
# `> 0` is False for NaN, zero and negative durations alike.
durations = provided_duration.where(provided_duration > 0, elapsed_seconds).values

test_set.insert(4,"time_dif",durations)

In [None]:
# Haversine distance between pickup and drop-off for every trip, in one
# vectorised call: dist_from_coordinates only uses NumPy functions, so it
# accepts whole columns as well as single values.
new_column = dist_from_coordinates(
    test_set['pick_lat'].values, test_set['pick_lon'].values,
    test_set['drop_lat'].values, test_set['drop_lon'].values)

test_set.insert(4,"distance",new_column)

In [None]:
# Time spent moving: reported duration minus metered waiting time.
# Uses the raw 'duration' column, not the repaired 'time_dif'.
test_set['time_driven'] = test_set['duration'].sub(test_set['meter_waiting'])

In [None]:
# Waiting fare scaled to an hourly rate (fare / waiting time * 3600);
# trips with zero waiting time get NaN instead of a division by zero.
# Computed column-wise instead of row by row.
waiting_time = test_set['meter_waiting']
chargeperhours = (test_set['meter_waiting_fare'] / waiting_time * 3600).where(waiting_time != 0).values

test_set.insert(4,'charge_per_hour',chargeperhours)


In [None]:
# Part of the fare attributed to driving: total minus waiting and additional fares.
test_set['driving_fare'] = (
    test_set['fare']
    .sub(test_set['meter_waiting_fare'])
    .sub(test_set['additional_fare'])
)

In [None]:
# Average speed as distance / driving time * 3600; trips with zero driving
# time get NaN instead of a division by zero.
# Computed column-wise instead of row by row.
driving_time = test_set['time_driven']
avgspeeds = (test_set['distance'] / driving_time * 3600).where(driving_time != 0).values

test_set.insert(4,"avg_speed",avgspeeds)



In [None]:
# Driving fare per unit of distance; trips with zero distance get NaN
# instead of a division by zero. Computed column-wise instead of row by row.
trip_distance = test_set['distance']
costsperkm = (test_set['driving_fare'] / trip_distance).where(trip_distance != 0).values

test_set.insert(4,"cost_per_km",costsperkm)

<h2>Adding cluster labels to test dataset</h2>

In [None]:
# `df` was built as [training pickups, test pickups], so the test trips'
# cluster labels start right after the training ones. Deriving both counts
# from the frames that were concatenated replaces the hard-coded 17176 / 8576.
n_train_rows = len(pickup_locations)
n_test_rows = len(test_pickup_locations)
test_set['pickup_cluster_label'] = result_t601['cluster'].iloc[n_train_rows:n_train_rows + n_test_rows].values.tolist()
#test_set['drop_cluster_label'] = result_t601['cluster'][17176*2 + 8576: 2*(17176+8576) ].values.tolist()

In [None]:
# Shift the labels by one (the noise label -1 becomes 0) and store them as
# text so the column can be used as a categorical feature.
test_set['pickup_cluster_label'] = test_set['pickup_cluster_label'].add(1).astype(str)

In [None]:
# Select the model's input columns, in the same order used for training.
test_features = test_set[training_columns]

In [None]:
# Preview the test feature matrix.
test_features

Unnamed: 0_level_0,meter_waiting,meter_waiting_fare,fare,additional_fare,distance,cost_per_km,avg_speed,time_dif,time_driven,charge_per_hour,driving_fare,isNormalCharge,pick_lat,pick_lon,drop_lat,drop_lon,pickup_cluster_label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
213284604,42,2.44860,289.27,10.5,6.705702,41.206933,27.370211,924,882,209.880000,276.32140,0.0,6.83454,79.8750,6.77490,79.8840,2
213286352,20,0.00000,1912.70,10.5,41.558513,45.771609,35.377311,4249,4229,0.000000,1902.20000,0.0,6.91168,79.8723,6.55091,79.9706,2
213293973,255,2.65880,394.00,10.5,5.916678,64.367402,16.422545,1552,1297,37.536000,380.84120,0.0,6.92145,79.8478,6.90539,79.8989,64
213294622,16,0.00000,154.32,10.5,3.301761,43.558571,26.650987,462,446,0.000000,143.82000,0.0,6.77433,79.9416,6.80401,79.9407,0
213298687,392,12.36920,147.47,10.5,2.588542,48.135517,22.082347,814,422,113.594694,124.60080,0.0,6.97968,79.9130,6.98875,79.8914,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222856243,429,24.83332,388.48,10.5,3.934272,89.761643,10.945423,1723,1294,208.391497,353.14668,0.0,6.85103,79.9567,6.85588,79.9214,3
222857785,80,0.00000,379.85,10.5,7.517433,49.132466,20.849582,1378,1298,0.000000,369.35000,0.0,6.91293,79.9656,6.92112,79.8980,0
222858416,56,3.28440,112.79,10.5,2.057225,48.125809,20.458588,418,362,211.140000,99.00560,0.0,6.85718,79.9081,6.83868,79.9083,3
222858691,548,31.67440,248.46,10.5,3.900888,52.881696,13.298484,1604,1056,208.080000,206.28560,0.0,6.91289,79.8846,6.93159,79.9145,3


<h2>Prediction using trained model</h2>

In [None]:
# Predict a label for every test trip with the fitted model.
predicted_labels = model.predict(test_features)

In [None]:
# Wrap the predictions in a DataFrame.
predicted_labels_df = pd.DataFrame(predicted_labels )

In [None]:
# Load the submission template. The path is absolute, so it is used directly:
# os.path.join() discards every component that precedes an absolute one,
# which made the former dirname("__file__") / '..' prefix dead code.
sub_path = '/content/drive/My Drive/datasets/sample_submission.csv'
submission_set = pd.read_csv(sub_path)

In [None]:
# Store the first column of the prediction frame as the 'prediction' column.
# Values are assigned by position: assumes the template lists the trips in
# the same order as the test set — TODO confirm.
submission_set['prediction']= predicted_labels_df.values[:,0]

<h2>Submission file generation</h2>

In [None]:
# Run name; used as the output folder name and as the submission file prefix.
theNotebook = "dbscan_only_pickup_cat"

In [None]:
# Write the submission to the first unused "<name>_<version>.csv" inside the
# run's output folder, so earlier submissions are never overwritten.
dirname = '/content/drive/My Drive/datasets/'+theNotebook
filename = dirname+'/'+theNotebook+'_{%i}.csv'   # '{%i}' is replaced by the version number
fileversion = 1

# exist_ok avoids the separate "exists?" check before creating the folder.
os.makedirs(dirname, exist_ok=True)
# A plain existence test is enough here; no glob pattern is involved.
while os.path.exists(filename.replace('{%i}',str(fileversion))):
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=False)
print("Completed!")

Completed!


In [None]:
# Distribution of the predicted labels in the submission.
submission_set['prediction'].value_counts()

1    8117
0     459
Name: prediction, dtype: int64