<h1>Imports</h1>

In [1]:
from pathlib import Path

import numpy as np
import sklearn as slearn
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight


from imblearn.combine import SMOTETomek
import math
import glob
import os

Using TensorFlow backend.


In [2]:
# Show up to 100 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.set_option("display.max_columns", 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Seed value for the randomised steps of the notebook.
# NOTE(review): confirm that every stochastic call (splits, forests) actually receives it
# through its random_state argument.
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [4]:
# Datasets live in the "datasets" folder two directories above the working directory.
DATA_PATH = Path.cwd().joinpath("..", "..", "datasets")

In [5]:
# Load the labelled trips, keyed by trip id, and preview the first rows.
train_csv = DATA_PATH / "train.csv"
training_df = pd.read_csv(train_csv, index_col="tripid")
training_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [6]:
# Rows without a recorded fare are dropped before any feature is derived.
training_df = training_df.dropna(axis=0, how="any", subset=["fare"])

<h1>Data Preprocessing</h1>

<h2>Data Conversions</h2>

In [7]:
# Parse the pickup/drop timestamps (read from the CSV as strings) into datetime64 columns.
for time_column in ("pickup_time", "drop_time"):
    training_df[time_column] = pd.to_datetime(training_df[time_column], format="%m/%d/%Y %H:%M")

<h2>Adding Features</h2>
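Add the features that the feature-engineering analysis found useful: the haversine trip distance, the time of day, a fallback trip duration, and the average speed.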

In [8]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  """Great-circle (haversine) distance in kilometres between two points.

  Parameters are latitudes/longitudes in decimal degrees. Only NumPy ufuncs are
  used, so scalars and array-likes that support arithmetic (NumPy arrays,
  pandas Series) are both accepted; array inputs give an element-wise result.

  Returns the distance in km on a sphere of radius 6371 km.
  """
  R = 6371  # Earth radius in km

  # Conversion to radians
  d_lat = np.radians(lat2 - lat1)
  d_lon = np.radians(lon2 - lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  # Haversine formula
  a = np.sin(d_lat / 2.) ** 2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon / 2.) ** 2

  # Floating-point rounding can push `a` marginally outside [0, 1] for
  # (near-)antipodal points, which would make arcsin return NaN; clamp it.
  a = np.clip(a, 0.0, 1.0)

  return 2 * R * np.arcsin(np.sqrt(a))

In [9]:
# Bucket the pickup hour into a coarse time-of-day category.
# Bins are right-inclusive: hours 0-8 -> 'dawn', 9-20 -> 'day', 21-23 -> 'night'.
training_df = training_df.assign(
    timeOfDay=lambda frame: pd.cut(
        frame.pickup_time.dt.hour,
        bins=[-1, 8, 20, 24],
        labels=['dawn', 'day', 'night'],
    )
)

In [10]:
# Straight-line (haversine) distance in km between pickup and drop-off.
# dist_from_coordinates is built from NumPy ufuncs, so it takes whole columns at once;
# there is no need to call it row by row through iterrows().
distance_km = dist_from_coordinates(
    training_df['pick_lat'], training_df['pick_lon'],
    training_df['drop_lat'], training_df['drop_lon'],
)

# insert() raises if the column already exists; the guard keeps the cell re-runnable.
if 'distance' not in training_df.columns:
    training_df.insert(4, "distance", distance_km.to_numpy())

In [11]:
# Trip duration in seconds used for the speed feature: the metered `duration` when it is
# present and positive, otherwise the gap between pickup and drop-off timestamps.
provided_duration = training_df['duration']

# total_seconds() keeps the day component. Timedelta.seconds silently drops it (and wraps
# negative gaps), which gives a wrong duration for gaps of 24 hours or more.
elapsed_seconds = (training_df['drop_time'] - training_df['pickup_time']).dt.total_seconds()

# The timestamps are parsed with minute resolution, so a gap can be exactly zero; zero and
# negative gaps fall back to one minute so the later speed division stays meaningful.
elapsed_seconds = elapsed_seconds.where(elapsed_seconds > 0, 60)

# NaN and non-positive metered durations fail the `> 0` test and take the fallback.
durations = provided_duration.where(provided_duration > 0, elapsed_seconds)

# insert() raises if the column already exists; the guard keeps the cell re-runnable.
if 'time_dif' not in training_df.columns:
    training_df.insert(4, "time_dif", durations.to_numpy())



In [12]:
# Average speed in km/h: distance (km) over duration (s), scaled by 3600 s per hour.
training_df['avg_speed'] = training_df['distance'].div(training_df['time_dif']).mul(3600)

In [13]:
# Remove one specific trip by its tripid.
# NOTE(review): the reason is not recorded here — presumably an outlier found during
# exploration; confirm and document it.
# The cell is not re-runnable: a second run raises KeyError because the id is already gone.
training_df = training_df.drop(index=190167541)

In [14]:
# Dataset after adding Columns
training_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,time_dif,distance,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,timeOfDay,avg_speed
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
189123628,10.5,834.0,56.0,0.000000,834.0,5.092770,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,correct,dawn,21.983180
189125358,10.5,791.0,47.0,0.000000,791.0,3.168058,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,correct,dawn,14.418467
189125719,10.5,1087.0,80.0,0.000000,1087.0,6.305395,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,correct,dawn,20.882633
189127273,10.5,598.0,271.0,15.663800,598.0,0.861946,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,correct,dawn,5.188973
189128020,,,,,1020.0,8.147782,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,correct,dawn,28.756878
189129552,10.5,3407.0,182.0,0.000000,3407.0,24.207039,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,79.8969,6.91865,79.8649,1065.02,correct,dawn,25.578321
189132829,10.5,1246.0,487.0,0.000000,1246.0,4.777624,133.0,2019-11-01 06:29:00,2019-11-01 06:49:00,6.84371,79.9051,6.85069,79.8624,266.62,correct,dawn,13.803728
189135103,10.5,1333.0,295.0,17.198500,1333.0,5.322544,212.0,2019-11-01 06:50:00,2019-11-01 07:12:00,6.90760,79.9524,6.90634,79.9042,318.05,correct,dawn,14.374462
189139296,10.5,360.0,80.0,4.664000,360.0,1.035302,3.0,2019-11-01 07:00:00,2019-11-01 07:06:00,7.26706,80.6064,7.27422,80.6124,100.32,correct,dawn,10.353018
189138671,10.5,1539.0,588.0,33.986400,1539.0,2.930715,43.0,2019-11-01 07:02:00,2019-11-01 07:28:00,6.85137,79.9537,6.84779,79.9274,257.89,correct,dawn,6.855474


<h1>Model Training</h1>

<h2> Part 1 : Fare Predictor </h2>
Predict the fare from the metered trip features, training only on the trips labelled correct.

In [15]:
# Keep only the trips whose fare is labelled 'correct'; these train the fare predictor.
is_correct_trip = training_df['label'].eq('correct')
correct_training_df = training_df.loc[is_correct_trip]

In [16]:
# Target of the fare predictor (the charged fare) and its inputs (the metered quantities).
fare_prediction = correct_training_df.loc[:, ['fare']]
features_predictor = correct_training_df.loc[
    :, ['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare']
]

In [17]:
features_predictor.dtypes

additional_fare       float64
duration              float64
meter_waiting         float64
meter_waiting_fare    float64
dtype: object

In [18]:
# Split the predictor columns by dtype so each group can get its own preprocessing.
numerical_features = features_predictor.select_dtypes(include=["float64"]).columns.values
categorical_features = features_predictor.select_dtypes(include=["category"]).columns.values

In [19]:
# Hold out 30% of the correctly-labelled trips to evaluate the fare predictor.
# random_state pins the shuffle so the split (and the reported error) is reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_predictor,
    fare_prediction,
    test_size=0.3,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [20]:

## Numeric columns: fill missing values with the column median.
numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='median')),
])

## Categorical columns: fill gaps with a 'missing' token, then one-hot encode.
## handle_unknown='ignore' keeps categories unseen during fit from raising at transform time.
non_numeric_preprocessing_steps = Pipeline(steps=[
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

## Preprocessor stage of the final pipeline: route each column group to its own steps.
## Columns named in neither group are dropped.
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('non_numeric', non_numeric_preprocessing_steps, categorical_features),
        ('numeric', numeric_preprocessing_steps, numerical_features),
    ],
)

In [21]:
# Column names given to the arrays returned by the preprocessor.
# NOTE(review): this list is written by hand and must match the preprocessor's output
# order; it repeats the four predictor columns selected earlier.
new_columns = ['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare']

In [22]:
# Fit the preprocessor on the training split and apply it in one step,
# then wrap the resulting array in a DataFrame with named columns.
preprocessed_train_features = preprocessor.fit_transform(X_train)
preprocessed_train_features_data_frame = pd.DataFrame(
    preprocessed_train_features,
    columns=new_columns,
)

In [23]:
# Transform the validation split with the preprocessor as fitted on the training split.
# transform (not fit_transform) keeps the imputation medians learned from the training
# data, so no statistics from the held-out rows leak into preprocessing.
preprocessed_test_features = preprocessor.transform(X_eval)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

In [24]:
preprocessed_train_features_data_frame

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare
0,10.5,1674.0,450.0,0.000000
1,10.5,565.0,72.0,0.000000
2,10.5,1238.0,377.0,0.000000
3,5.0,475.0,89.0,5.188700
4,10.5,1971.0,514.0,29.709200
5,10.5,865.0,232.0,13.525600
6,5.0,535.0,172.0,10.027600
7,5.0,857.0,500.0,29.150000
8,5.0,289.0,6.0,0.349800
9,10.5,977.0,180.0,10.494000


In [25]:
# Choosing the predictor; seeding the forest makes the fit repeatable
estimator = RandomForestRegressor(random_state=RANDOM_SEED)

# Train the model. The features are passed as the named DataFrame, matching what every
# later predict() call receives. The single-column target frame is flattened to 1-D,
# the shape the regressor expects (a column vector triggers a DataConversionWarning).
estimator.fit(preprocessed_train_features_data_frame, y_train.values.ravel())

# Predict for the evaluation set and report the mean squared error (y_true first, then y_pred)
y_pred = estimator.predict(preprocessed_test_features_data_frame)
slearn.metrics.mean_squared_error(y_eval, y_pred)

  """


7878.927144522414

In [26]:
# Preprocess every training trip (not only the 'correct' ones) so the fare predictor can
# price all of them; the predictions become a feature for the classifier in Part 2.
required_features = training_df[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup','distance','avg_speed']]

# transform, not fit_transform: the preprocessor keeps the state it was already fitted with
# instead of being silently re-fitted on this frame. Only the columns the preprocessor was
# configured with are used; the extra ones fall under remainder='drop'.
preprocessed_full_features = preprocessor.transform(required_features)
preprocessed_full_features_data_frame = pd.DataFrame(data=preprocessed_full_features, columns=new_columns)

In [27]:
# Fare the regressor expects for every row of the preprocessed full training frame.
predicted_prices = estimator.predict(preprocessed_full_features_data_frame)

In [28]:
# Store the predicted fare as a new column at position 13.
# insert() mutates the frame in place and raises ValueError if the column already exists,
# so this cell cannot be run twice on the same frame.
training_df.insert(13,'predicted_price',predicted_prices)

In [29]:
# Training dataset after new features inserted
training_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,time_dif,distance,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,predicted_price,fare,label,timeOfDay,avg_speed
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
189123628,10.5,834.0,56.0,0.000000,834.0,5.092770,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,178.310200,270.32,correct,dawn,21.983180
189125358,10.5,791.0,47.0,0.000000,791.0,3.168058,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,190.362300,197.85,correct,dawn,14.418467
189125719,10.5,1087.0,80.0,0.000000,1087.0,6.305395,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,277.195100,301.64,correct,dawn,20.882633
189127273,10.5,598.0,271.0,15.663800,598.0,0.861946,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,88.106700,82.30,correct,dawn,5.188973
189128020,,,,,1020.0,8.147782,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,220.341298,358.39,correct,dawn,28.756878
189129552,10.5,3407.0,182.0,0.000000,3407.0,24.207039,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,79.8969,6.91865,79.8649,1008.757400,1065.02,correct,dawn,25.578321
189132829,10.5,1246.0,487.0,0.000000,1246.0,4.777624,133.0,2019-11-01 06:29:00,2019-11-01 06:49:00,6.84371,79.9051,6.85069,79.8624,242.772000,266.62,correct,dawn,13.803728
189135103,10.5,1333.0,295.0,17.198500,1333.0,5.322544,212.0,2019-11-01 06:50:00,2019-11-01 07:12:00,6.90760,79.9524,6.90634,79.9042,295.229000,318.05,correct,dawn,14.374462
189139296,10.5,360.0,80.0,4.664000,360.0,1.035302,3.0,2019-11-01 07:00:00,2019-11-01 07:06:00,7.26706,80.6064,7.27422,80.6124,99.481400,100.32,correct,dawn,10.353018
189138671,10.5,1539.0,588.0,33.986400,1539.0,2.930715,43.0,2019-11-01 07:02:00,2019-11-01 07:28:00,6.85137,79.9537,6.84779,79.9274,237.263400,257.89,correct,dawn,6.855474


<h2>Part 2: Training the classifier</h2>

In [30]:
# The classifier compares the fare the regressor expects with the fare actually charged.
mapping = {'correct': 1, 'incorrect': 0}
classifier_features = training_df.loc[:, ['predicted_price', 'fare']]
# Encode the text label as 1 (correct) / 0 (incorrect); kept as a one-column DataFrame.
classifier_label = training_df.loc[:, ['label']].replace({'label': mapping})

In [31]:
np.unique(classifier_label)

array([0, 1])

In [32]:
# 'balanced' weights are inversely proportional to each class's frequency.
# `classes` and `y` are keyword-only in current scikit-learn; passing them positionally
# raises TypeError there, while the keyword form also works on older releases.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(classifier_label),
    y=classifier_label['label'],
)

In [33]:
# NOTE(review): this bare expression is not the last line of the cell, so it displays
# nothing, and the computed balanced weights are not used by the map below.
class_weights
# Hand-set weights for the classifier: label 1 ('correct') counts 10000x label 0
# ('incorrect'). Label 1 is the majority class in the evaluation report, so this is the
# opposite of balancing — TODO confirm this is intentional.
weightmap = {0:1, 1:10000}

In [34]:
# Stratified 70/30 split so both label classes keep their proportions in each part.
# random_state pins the shuffle so the reported metrics are reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(
    classifier_features,
    classifier_label,
    test_size=0.3,
    stratify=classifier_label,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [35]:
classifier = RandomForestClassifier(random_state=42, class_weight=weightmap)
## Train the model. The one-column label frame is flattened to 1-D, the shape the
## classifier expects (a column vector triggers a DataConversionWarning).
classifier.fit(X_train, y_train.values.ravel())

# Predict for the evaluation set. The score is computed on the held-out split,
# so it is a validation accuracy, not a training accuracy.
print("Validation Accuracy: %.2f" % (classifier.score(X_eval, y_eval)*100), "%")
y_pred = classifier.predict(X_eval)

  This is separate from the ipykernel package so we can avoid doing imports until


Training Accuracy: 94.56 %


In [36]:
# Rows are true labels, columns are predicted labels (class order 0, 1).
# The result gets its own name: rebinding `confusion_matrix` would shadow the imported
# sklearn function, so any later call (or a re-run of this cell) would fail.
eval_confusion_matrix = confusion_matrix(y_eval, y_pred)
eval_confusion_matrix

array([[ 257,  206],
       [  72, 4577]])

In [37]:
print(classification_report(y_eval, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.56      0.65       463
           1       0.96      0.98      0.97      4649

    accuracy                           0.95      5112
   macro avg       0.87      0.77      0.81      5112
weighted avg       0.94      0.95      0.94      5112



<h1>Model Validation</h1>

<h2>Loading the test data set</h2>

In [38]:
# Load the unlabelled test trips, keyed by trip id, and preview the first rows.
test_csv = DATA_PATH / "test.csv"
test_set = pd.read_csv(test_csv, index_col="tripid")
test_set.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [39]:
test_set.isna().sum()

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

<h2>Feature Addition for the Test Dataset</h2>

In [40]:
# Parse the pickup/drop timestamps of the test set into datetime64 columns.
for time_column in ("pickup_time", "drop_time"):
    test_set[time_column] = pd.to_datetime(test_set[time_column], format="%m/%d/%Y %H:%M")

In [41]:
# Bucket the pickup hour into a coarse time-of-day category.
# Bins are right-inclusive: hours 0-8 -> 'dawn', 9-20 -> 'day', 21-23 -> 'night'.
test_set = test_set.assign(
    timeOfDay=lambda frame: pd.cut(
        frame.pickup_time.dt.hour,
        bins=[-1, 8, 20, 24],
        labels=['dawn', 'day', 'night'],
    )
)

In [42]:
# Straight-line (haversine) distance in km between pickup and drop-off.
# dist_from_coordinates is built from NumPy ufuncs, so it takes whole columns at once;
# there is no need to call it row by row through iterrows().
distance_km = dist_from_coordinates(
    test_set['pick_lat'], test_set['pick_lon'],
    test_set['drop_lat'], test_set['drop_lon'],
)

# insert() raises if the column already exists; the guard keeps the cell re-runnable.
if 'distance' not in test_set.columns:
    test_set.insert(4, "distance", distance_km.to_numpy())

In [43]:
# Trip duration in seconds used for the speed feature: the metered `duration` when it is
# present and positive, otherwise the gap between pickup and drop-off timestamps.
provided_duration = test_set['duration']

# total_seconds() keeps the day component. Timedelta.seconds silently drops it (and wraps
# negative gaps), which gives a wrong duration for gaps of 24 hours or more.
elapsed_seconds = (test_set['drop_time'] - test_set['pickup_time']).dt.total_seconds()

# The timestamps are parsed with minute resolution, so a gap can be exactly zero; zero and
# negative gaps fall back to one minute so the later speed division stays meaningful.
elapsed_seconds = elapsed_seconds.where(elapsed_seconds > 0, 60)

# NaN and non-positive metered durations fail the `> 0` test and take the fallback.
durations = provided_duration.where(provided_duration > 0, elapsed_seconds)

# insert() raises if the column already exists; the guard keeps the cell re-runnable.
if 'time_dif' not in test_set.columns:
    test_set.insert(4, "time_dif", durations.to_numpy())



In [44]:
# Average speed in km/h: distance (km) over duration (s), scaled by 3600 s per hour.
test_set['avg_speed'] = test_set['distance'].div(test_set['time_dif']).mul(3600)

In [45]:
test_set

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,time_dif,distance,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,timeOfDay,avg_speed
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
213284604,10.5,924,42,2.448600,924,6.705702,148,2020-02-01 00:38:00,2020-02-01 00:53:00,6.83454,79.8750,6.77490,79.8840,289.27,dawn,26.126111
213286352,10.5,4249,20,0.000000,4249,41.558513,91,2020-02-01 01:02:00,2020-02-01 02:13:00,6.91168,79.8723,6.55091,79.9706,1912.70,dawn,35.210790
213293973,10.5,1552,255,2.658800,1552,5.916678,23,2020-02-01 05:02:00,2020-02-01 05:28:00,6.92145,79.8478,6.90539,79.8989,394.00,dawn,13.724253
213294622,10.5,462,16,0.000000,462,3.301761,198,2020-02-01 05:30:00,2020-02-01 05:38:00,6.77433,79.9416,6.80401,79.9407,154.32,dawn,25.728009
213298687,10.5,814,392,12.369200,814,2.588542,69,2020-02-01 07:00:00,2020-02-01 07:14:00,6.97968,79.9130,6.98875,79.8914,147.47,dawn,11.448097
213299545,10.5,2495,351,16.530800,2495,17.247478,9,2020-02-01 07:13:00,2020-02-01 07:55:00,6.99819,79.9378,7.13916,79.8726,1156.97,dawn,24.886140
213302332,10.5,1108,454,23.929200,1108,3.132721,43,2020-02-01 07:47:00,2020-02-01 08:05:00,6.79064,79.8878,6.81875,79.8859,196.81,dawn,10.178517
213302671,10.5,2737,320,18.496000,2737,11.556896,17,2020-02-01 07:48:00,2020-02-01 08:33:00,6.81545,79.9707,6.82144,79.8662,688.43,dawn,15.200886
213305594,10.5,1154,29,0.000000,1154,6.458780,130,2020-02-01 08:11:00,2020-02-01 08:30:00,6.82920,79.9798,6.79732,79.9309,288.77,dawn,20.148706
213305134,10.5,1372,277,16.046498,1372,4.218669,63,2020-02-01 08:12:00,2020-02-01 08:35:00,6.05588,80.2391,6.04033,80.2043,199.57,dawn,11.069394


In [46]:
# Same column selection as used for the full training frame.
test_features = test_set[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup','distance','avg_speed']]

# transform, not fit_transform: the test data must be imputed with the statistics the
# preprocessor already holds, never with statistics re-learned from the test set itself.
preprocessed_test_features = preprocessor.transform(test_features)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

<h2>Fare prediction and correctness prediction using Test Dataset </h2>

In [47]:
# Output of the fare regressor for every test trip.
# NOTE(review): despite the name, `test_probs` holds predicted fares, not probabilities.
test_probs = estimator.predict(preprocessed_test_features_data_frame)

In [48]:
# Store the predicted fare as a new column at position 10.
# insert() mutates the frame in place and raises ValueError if the column already exists,
# so this cell cannot be run twice on the same frame.
test_set.insert(10,'predicted_price',test_probs)

In [49]:
# Same two columns, in the same order, that the classifier was trained on.
classifier_test_features = test_set[['predicted_price','fare']]

In [50]:
# Predicted class per test trip, using the training encoding: 1 = correct, 0 = incorrect.
predicted_labels = classifier.predict(classifier_test_features)

<h2>Writing to the Submission File</h2>

In [51]:
# Start from the sample submission so the output file keeps the expected layout.
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")

# Attach the predictions by tripid rather than by row position: wrapping them in a Series
# indexed like test_set lets pandas align on the index, so a sample file listed in a
# different order cannot end up with labels on the wrong trips.
submission_set['prediction'] = pd.Series(predicted_labels, index=test_set.index)

In [52]:
%%javascript
// Reads the notebook title from the page element with id "notebook_name" and asks the
// kernel to run a Python assignment that stores it in the variable `theNotebook`.
// NOTE(review): `IPython.notebook` is presumably only available in the classic Notebook
// front end, and kernel.execute presumably returns before the assignment has run —
// verify that `theNotebook` exists before the next cell when using Run All.
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [53]:
# Submissions are grouped in one folder per notebook; each run writes the next free
# version number so that earlier submission files are never overwritten.
dirname = '../../submissions/' + theNotebook
filename = dirname + '/teamCluster_submission_{%i}.csv'

if not os.path.exists(dirname):
    os.makedirs(dirname)

# Probe version 1, 2, 3, ... until a path is found that matches no existing file.
fileversion = 1
target_path = filename.replace('{%i}', str(fileversion))
while glob.glob(target_path):
    fileversion += 1
    target_path = filename.replace('{%i}', str(fileversion))

submission_set.to_csv(target_path, index=True)
print("Completed!")

Completed!


In [54]:
submission_set['prediction'].idxmin()

213299545

In [55]:
submission_set['prediction'].value_counts()

1    8022
0     554
Name: prediction, dtype: int64