<h1>Imports</h1>

In [1]:
from pathlib import Path

import numpy as np
import sklearn as slearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.covariance import EllipticEnvelope
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from imblearn.combine import SMOTETomek
import math
import glob
import os

Using TensorFlow backend.


In [2]:
pd.set_option("display.max_columns", 100)
%matplotlib inline

In [3]:
RANDOM_SEED = 6

<h1>Dataset Importing</h1>

In [4]:
DATA_PATH = Path.cwd() / "../../datasets"

In [5]:
training_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
training_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [6]:
training_df = training_df.dropna(subset=['fare'])

<h1>Data Preprocessing</h1>

<h2>Data Conversions</h2>

In [7]:
# Converting date time objects from Objects to datetime64
training_df['pickup_time'] = pd.to_datetime(training_df['pickup_time'], format="%m/%d/%Y %H:%M")
training_df['drop_time'] = pd.to_datetime(training_df['drop_time'], format="%m/%d/%Y %H:%M")

<h2>Adding Features</h2>
Adding features deemed useful from Feature Engineering

In [8]:
def dist_from_coordinates(lat1, lon1, lat2, lon2):
  R = 6371  # Earth radius in km

  #conversion to radians
  d_lat = np.radians(lat2-lat1)
  d_lon = np.radians(lon2-lon1)

  r_lat1 = np.radians(lat1)
  r_lat2 = np.radians(lat2)

  #haversine formula
  a = np.sin(d_lat/2.) **2 + np.cos(r_lat1) * np.cos(r_lat2) * np.sin(d_lon/2.)**2

  haversine = 2 * R * np.arcsin(np.sqrt(a))

  return haversine

In [9]:
training_df = training_df.assign(timeOfDay=pd.cut(training_df.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [10]:
new_column = []                    #empty column for distance
for index,row in training_df.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

training_df.insert(4,"distance",new_column)

In [11]:
durations = []
for index,row in training_df.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = np.nan
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

training_df.insert(4,"time_dif",durations)



In [12]:
training_df['time_driven'] = training_df['duration']  - training_df['meter_waiting']

In [13]:
training_df['charge_per_hour'] = training_df['meter_waiting_fare'] / training_df['meter_waiting'] * 3600

In [14]:
training_df['driving_fare'] = training_df['fare']  - training_df['meter_waiting_fare'] - training_df['additional_fare']

In [15]:
costsperkm = []
for index,row in training_df.iterrows():
    if row['driving_fare'] == 0:
        costperkm = np.nan
    else:
        costperkm = (row['distance'] / row['driving_fare'])
    costsperkm.append(costperkm)

training_df.insert(4,"cost_per_km",costsperkm)


In [16]:
avgspeeds = []
for index,row in training_df.iterrows():
    if(row['time_driven'] == 0):
        avgspeed = np.nan
    else:
        avgspeed = (row['distance'] / row['time_driven'])
    avgspeeds.append(avgspeed)

training_df.insert(4,"avg_speed",avgspeeds)


In [17]:
training_df = training_df.drop(index=190167541)

In [18]:
# Dataset after adding Columns
training_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,avg_speed,cost_per_km,time_dif,distance,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,timeOfDay,time_driven,charge_per_hour,driving_fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
189123628,10.5,834.0,56.0,0.000000,0.006546,0.019601,834.0,5.092770,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,correct,dawn,778.0,0.000000,259.820000
189125358,10.5,791.0,47.0,0.000000,0.004258,0.016910,791.0,3.168058,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,correct,dawn,744.0,0.000000,187.350000
189125719,10.5,1087.0,80.0,0.000000,0.006262,0.021658,1087.0,6.305395,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,correct,dawn,1007.0,0.000000,291.140000
189127273,10.5,598.0,271.0,15.663800,0.002636,0.015355,598.0,0.861946,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,correct,dawn,327.0,208.080000,56.136200
189128020,,,,,,,1020.0,8.147782,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,correct,dawn,,,
189129552,10.5,3407.0,182.0,0.000000,0.007506,0.022956,3407.0,24.207039,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,79.8969,6.91865,79.8649,1065.02,correct,dawn,3225.0,0.000000,1054.520000
189132829,10.5,1246.0,487.0,0.000000,0.006295,0.018654,1246.0,4.777624,133.0,2019-11-01 06:29:00,2019-11-01 06:49:00,6.84371,79.9051,6.85069,79.8624,266.62,correct,dawn,759.0,0.000000,256.120000
189135103,10.5,1333.0,295.0,17.198500,0.005128,0.018331,1333.0,5.322544,212.0,2019-11-01 06:50:00,2019-11-01 07:12:00,6.90760,79.9524,6.90634,79.9042,318.05,correct,dawn,1038.0,209.880000,290.351500
189139296,10.5,360.0,80.0,4.664000,0.003698,0.012158,360.0,1.035302,3.0,2019-11-01 07:00:00,2019-11-01 07:06:00,7.26706,80.6064,7.27422,80.6124,100.32,correct,dawn,280.0,209.880000,85.156000
189138671,10.5,1539.0,588.0,33.986400,0.003082,0.013733,1539.0,2.930715,43.0,2019-11-01 07:02:00,2019-11-01 07:28:00,6.85137,79.9537,6.84779,79.9274,257.89,correct,dawn,951.0,208.080000,213.403600


<h1>Model Training</h1>

<h2> Part 1 : Fare Predictor </h2>
Predict fare using features

In [19]:
# Extract Correctly predicted fares
correct_training_df = training_df[training_df['label'] == 'correct']
incorrect_training_df = training_df[training_df['label'] == 'incorrect']

In [20]:
new_columns = ['time_driven','charge_per_hour','driving_fare','avg_speed','cost_per_km','fare']

In [21]:
# Extract the useful features deemed in feature engineering
inlier_features = correct_training_df[new_columns]
outlier_features = incorrect_training_df[new_columns]

In [22]:
inlier_features.dtypes

time_driven        float64
charge_per_hour    float64
driving_fare       float64
avg_speed          float64
cost_per_km        float64
fare               float64
dtype: object

In [23]:
numerical_features = inlier_features.columns[inlier_features.dtypes == "float64"].values
categorical_features = inlier_features.columns[inlier_features.dtypes == "category" ].values

In [24]:

## chain numerical preprocessing into a pipeline object
numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='median'))
])

non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore')),
#     ('label_encoder', LabelEncoder())
])

## create preprocessor stage of the final pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('non_numeric', non_numeric_preprocessing_steps, categorical_features),
        ('numeric', numeric_preprocessing_steps, numerical_features)
    ],
    remainder = 'drop'
)

In [25]:
X_train = inlier_features
y_test = outlier_features

In [26]:
# Transform training data columns 
preprocessed_train_features = preprocessor.fit_transform(X_train)
preprocessed_train_features_data_frame = pd.DataFrame(data=preprocessed_train_features, columns=new_columns)

In [27]:
# Choosing the Predictor
clf = LocalOutlierFactor(novelty=True,n_neighbors=45,n_jobs=100)
#clf = OneClassSVM()
#clf = IsolationForest(max_features=2,max_samples=100)
# Train the model
clf.fit(preprocessed_train_features)



LocalOutlierFactor(algorithm='auto', contamination='auto', leaf_size=30,
                   metric='minkowski', metric_params=None, n_jobs=100,
                   n_neighbors=45, novelty=True, p=2)

In [28]:
# Transform tvalidation data columns 
preprocessed_test_features = preprocessor.fit_transform(y_test)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

In [29]:
y_pred = clf.predict(preprocessed_test_features_data_frame)

In [30]:
list(y_pred).count(-1)/y_pred.shape[0]

0.6541450777202072

In [31]:
clf.score_samples(preprocessed_test_features_data_frame)

array([-3.40551153, -2.47874259, -2.99633624, ..., -1.06054316,
       -4.06264297, -6.15913435])

In [32]:
x_pred = clf.predict(preprocessed_train_features)

In [33]:
list(x_pred).count(1)/x_pred.shape[0]

0.9515941654834129

In [34]:
clf.score_samples(preprocessed_train_features)

array([-1.17940959, -1.00210815, -1.01579816, ..., -0.9794377 ,
       -1.14076858, -1.0905964 ])

In [35]:
# Choosing the Predictor
an_clf = IsolationForest(max_features=2,max_samples=100)
#an_clf = LocalOutlierFactor(novelty=True,n_neighbors=45,n_jobs=100)
#an_clf = OneClassSVM()

# Train the model
an_clf.fit(preprocessed_test_features)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination='auto',
                max_features=2, max_samples=100, n_estimators=100, n_jobs=None,
                random_state=None, verbose=0, warm_start=False)

In [36]:
y_pred = an_clf.predict(preprocessed_test_features_data_frame)

In [37]:
list(y_pred).count(1)/y_pred.shape[0]

0.8795336787564767

In [38]:
x_pred = an_clf.predict(preprocessed_train_features)

In [39]:
list(x_pred).count(-1)/x_pred.shape[0]

0.012972763650445333

In [40]:
# training using split data done prepare features for training with full dataset !! Optional !!

required_features = training_df[new_columns]
preprocessed_full_features = preprocessor.fit_transform(required_features)
preprocessed_full_features_data_frame = pd.DataFrame(data=preprocessed_full_features, columns=new_columns)

In [41]:
scores = clf.score_samples(preprocessed_full_features_data_frame)
an_scores = an_clf.score_samples(preprocessed_full_features_data_frame)

In [42]:
preds = clf.predict(preprocessed_full_features_data_frame)

In [43]:
training_df['anomaly_prediction']= preds

In [44]:
training_df = training_df.replace({'anomaly_prediction': {-1:2, 1:4}})


In [45]:
training_df = training_df.replace({'anomaly_prediction': {2:0, 4:1}})

In [46]:
training_df['class_label'] = training_df['label']

In [47]:
training_df = training_df.replace({'class_label': {'correct': 1, 'incorrect': 0}})

In [48]:
training_df['anomaly_scores'] = scores

In [49]:
training_df['anti_anomaly_scores'] = an_scores

In [50]:
confusion_matrix = confusion_matrix(training_df['class_label'].values, training_df['anomaly_prediction'].values)

In [51]:
confusion_matrix

array([[ 1003,   541],
       [  755, 14739]])

In [52]:
print(classification_report(training_df['class_label'].values, training_df['anomaly_prediction'].values))

              precision    recall  f1-score   support

           0       0.57      0.65      0.61      1544
           1       0.96      0.95      0.96     15494

    accuracy                           0.92     17038
   macro avg       0.77      0.80      0.78     17038
weighted avg       0.93      0.92      0.93     17038



In [53]:
training_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,avg_speed,cost_per_km,time_dif,distance,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,timeOfDay,time_driven,charge_per_hour,driving_fare,anomaly_prediction,class_label,anomaly_scores,anti_anomaly_scores
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
189123628,10.5,834.0,56.0,0.000000,0.006546,0.019601,834.0,5.092770,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.90330,79.8783,270.32,correct,dawn,778.0,0.000000,259.820000,1,1,-1.179410,-0.389926
189125358,10.5,791.0,47.0,0.000000,0.004258,0.016910,791.0,3.168058,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,correct,dawn,744.0,0.000000,187.350000,1,1,-1.002108,-0.379301
189125719,10.5,1087.0,80.0,0.000000,0.006262,0.021658,1087.0,6.305395,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,correct,dawn,1007.0,0.000000,291.140000,1,1,-1.015798,-0.388439
189127273,10.5,598.0,271.0,15.663800,0.002636,0.015355,598.0,0.861946,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.92570,79.8895,6.92748,79.8971,82.30,correct,dawn,327.0,208.080000,56.136200,1,1,-1.090038,-0.403683
189128020,,,,,,,1020.0,8.147782,,2019-11-01 03:34:00,2019-11-01 03:51:00,6.87441,79.8615,6.84478,79.9290,358.39,correct,dawn,,,,0,1,-1.570209,-0.454902
189129552,10.5,3407.0,182.0,0.000000,0.007506,0.022956,3407.0,24.207039,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,79.8969,6.91865,79.8649,1065.02,correct,dawn,3225.0,0.000000,1054.520000,1,1,-1.158229,-0.486824
189132829,10.5,1246.0,487.0,0.000000,0.006295,0.018654,1246.0,4.777624,133.0,2019-11-01 06:29:00,2019-11-01 06:49:00,6.84371,79.9051,6.85069,79.8624,266.62,correct,dawn,759.0,0.000000,256.120000,1,1,-1.268984,-0.384976
189135103,10.5,1333.0,295.0,17.198500,0.005128,0.018331,1333.0,5.322544,212.0,2019-11-01 06:50:00,2019-11-01 07:12:00,6.90760,79.9524,6.90634,79.9042,318.05,correct,dawn,1038.0,209.880000,290.351500,1,1,-1.032865,-0.413340
189139296,10.5,360.0,80.0,4.664000,0.003698,0.012158,360.0,1.035302,3.0,2019-11-01 07:00:00,2019-11-01 07:06:00,7.26706,80.6064,7.27422,80.6124,100.32,correct,dawn,280.0,209.880000,85.156000,1,1,-1.008201,-0.416835
189138671,10.5,1539.0,588.0,33.986400,0.003082,0.013733,1539.0,2.930715,43.0,2019-11-01 07:02:00,2019-11-01 07:28:00,6.85137,79.9537,6.84779,79.9274,257.89,correct,dawn,951.0,208.080000,213.403600,1,1,-1.004925,-0.390320


<h2>Part 2: Training the classifier</h2>

In [54]:
classifier_features = training_df[['anomaly_scores','anti_anomaly_scores']]
classifier_label = training_df[['label']]
mapping = {'correct': 1, 'incorrect': 0}
classifier_label = classifier_label.replace({'label':mapping})

In [55]:
np.unique(classifier_label)

array([0, 1])

In [56]:
class_weights = compute_class_weight('balanced', np.unique(classifier_label), classifier_label['label'])

In [57]:
X_train, X_eval, y_train, y_eval = train_test_split(classifier_features, classifier_label, test_size=0.3,stratify= classifier_label, shuffle=True )

In [58]:

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifier = QuadraticDiscriminantAnalysis()
## Train the model
classifier.fit(X_train, y_train)

# Predict for the evaluation set
print("Training Accuracy: %.2f" % (classifier.score(X_eval, y_eval)*100), "%")
y_pred2 = classifier.predict(X_eval)

Training Accuracy: 90.92 %


  y = column_or_1d(y, warn=True)


In [59]:
confusion_matrix2 = confusion_matrix(y_eval, y_pred2)
confusion_matrix2

TypeError: 'numpy.ndarray' object is not callable

In [60]:
print(classification_report(y_eval, y_pred2))

              precision    recall  f1-score   support

           0       0.50      0.16      0.24       463
           1       0.92      0.98      0.95      4649

    accuracy                           0.91      5112
   macro avg       0.71      0.57      0.60      5112
weighted avg       0.88      0.91      0.89      5112



<h1>Model Validation</h1>

<h2>Loading the test data set</h2>

In [None]:
test_set = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")
test_set.head()

In [None]:
test_set.isna().sum()

<h2>Feature Addition for the Test Dataset</h2>

In [None]:
test_set['pickup_time'] = pd.to_datetime(test_set['pickup_time'], format="%m/%d/%Y %H:%M")
test_set['drop_time'] = pd.to_datetime(test_set['drop_time'], format="%m/%d/%Y %H:%M")

In [None]:
test_set = test_set.assign(timeOfDay=pd.cut(test_set.pickup_time.dt.hour,[-1, 8, 20, 24],labels=['dawn','day', 'night']))

In [None]:
new_column = []                    #empty column for distance
for index,row in test_set.iterrows():
  lat1 = row['pick_lat'] #first row of location.lat column here
  lon1 = row['pick_lon'] #first row of location.long column here
  lat2 = row['drop_lat'] #second row of location.lat column here
  lon2 = row['drop_lon'] #second row of location.long column here
  value = dist_from_coordinates(lat1, lon1, lat2, lon2)  #get the distance
  new_column.append(value)   #append the empty list with distance values

test_set .insert(4,"distance",new_column)

In [None]:
durations = []
for index,row in test_set.iterrows():
  provided_duration = row['duration'] #first row of location.lat column here
  if math.isnan(provided_duration) or provided_duration <= 0 :
    time_dif = (row['drop_time'] - row['pickup_time']).seconds
    if(time_dif == 0):
        time_dif = 60
    durations.append(time_dif)
  else :  
    durations.append(provided_duration)

test_set.insert(4,"time_dif",durations)



In [None]:
test_set['avg_speed'] = (test_set['distance'] /  ( test_set['time_dif']) * 3600 )

In [None]:
test_set

In [None]:
test_features = test_set[['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup','distance','avg_speed']]
preprocessed_test_features = preprocessor.fit_transform(test_features)
preprocessed_test_features_data_frame = pd.DataFrame(data=preprocessed_test_features, columns=new_columns)

<h2>Fare prediction and correctness prediction using Test Dataset </h2>

In [None]:
test_probs = estimator.predict(preprocessed_test_features_data_frame)

In [None]:
test_set.insert(10,'predicted_price',test_probs)

In [None]:
classifier_test_features = test_set[['predicted_price','fare']]

In [None]:
predicted_labels = classifier.predict(classifier_test_features)

<h2>Writing to the Submission File</h2>

In [None]:
submission_set = pd.read_csv(DATA_PATH / "sample_submission.csv", index_col="tripid")
submission_set.head()

submission_set['prediction']= predicted_labels

In [None]:
%%javascript
var kernel = IPython.notebook.kernel;
var thename = window.document.getElementById("notebook_name").innerHTML;
var command = "theNotebook = " + "'"+thename+"'";
kernel.execute(command);

In [None]:
filename = '../../submissions/'+theNotebook+'/teamCluster_submission_{%i}.csv'
dirname = '../../submissions/'+theNotebook
fileversion = 1

if not os.path.exists(dirname):
    os.makedirs(dirname)
while glob.glob(filename.replace('{%i}',str(fileversion))) :
    fileversion+=1
submission_set.to_csv(filename.replace('{%i}',str(fileversion)), index=True)
print("Completed!")

In [None]:
submission_set['prediction'].idxmin()

In [None]:
submission_set['prediction'].value_counts()