In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [114]:
train_df[train_df.cancelled == 1].last_mile_distance.value_counts().head(1000)

0.00    27
1.42    19
1.22    19
1.76    18
2.86    18
        ..
6.79     1
7.66     1
8.46     1
7.19     1
7.30     1
Name: last_mile_distance, Length: 847, dtype: int64

In [126]:
train_df['order_date'].nunique()

12

In [137]:
lb = preprocessing.LabelEncoder()
df['order_date'] = lb.fit_transform(df['order_date'])

In [188]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

order_id = test_df['order_id']

train_df = fix_columns(train_df)
test_df = fix_columns_2(test_df)


train_df['order_date'] = lb.fit_transform(train_df['order_date'])
test_df['order_date'] = lb.fit_transform(test_df['order_date'])
train_df

Unnamed: 0,order_date,last_mile_distance,cancelled,undelivered_orders,lifetime_order_count,session_time,accept_duration
0,0,2.65,0,0.0,621.0,,9.0
1,0,2.76,0,0.0,105.0,3.266667,48.0
2,0,4.80,0,0.0,66.0,9.816667,16.0
3,0,6.38,0,0.0,127.0,17.533333,41.0
4,0,4.01,0,0.0,84.0,1.350000,36.0
...,...,...,...,...,...,...,...
449995,11,0.19,0,0.0,127.0,369.516667,30.0
449996,11,1.19,0,0.0,105.0,239.133333,76.0
449997,11,1.61,0,0.0,1488.0,204.150000,33.0
449998,11,4.68,0,0.0,105.0,65.583333,142.0


In [187]:
train_df.undelivered_orders.value_counts()

0.0    232686
1.0    118103
2.0     50582
3.0     20011
4.0      7733
5.0      1851
6.0       932
7.0       600
8.0       105
9.0        56
Name: undelivered_orders, dtype: int64

In [177]:
def main():
    #importing data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    #preprocessing data
    train_df = fix_columns(train_df)
    test_df = fix_columns_2(test_df)
    
    
    y = train_df['cancelled']
    X = train_df.drop(['cancelled'], axis = 1)
    
    train_df = fix_na(train_df)
    test_df = fix_na(test_df)
        
    X = scale_numeric(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
    
    trees = DecisionTreeClassifier(random_state=0)
    forest = RandomForestClassifier(n_estimators=20, random_state=2)

    preds_trees = trees.fit(X_train,y_train).predict(X_test)
    preds_forest = forest.fit(X_train,y_train).predict(X_test)

    print(f"Trees Test accuracy { accuracy_score(preds_trees, y_test)}")
    print(f"Forest Test accuracy { accuracy_score(preds_forest, y_test)}")


In [128]:
def xgb():
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    #preprocessing data
    train_df = fix_columns(train_df)
    test_df = fix_columns_2(test_df)
    
    train_df = fix_na(train_df)
    test_df = fix_na(test_df)
    
    y = train_df['cancelled']
    X = train_df.drop(['cancelled'], axis = 1)
        
    X = scale_numeric(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)
    model = XGBClassifier()
    preds = model.fit(X_train, y_train).predict(X_test)
    print(f"Accuracy of model {accuracy_score(preds, y_test)}")
    print(confusion_matrix(y_test, preds)) 

In [197]:
def fix_columns(df):
    df = df.drop(['order_id', 'reassigned_order', 'reassignment_method', 'reassignment_reason','order_time','rider_id','first_mile_distance'], axis = 1)
    df['allot_time'] = pd.to_datetime(df['allot_time'])
    df['accept_time'] = pd.to_datetime(df['accept_time'])
    t1 = df['accept_time'] - df['allot_time']
    df['accept_duration'] = t1
    df = df.drop(['allot_time','accept_time','pickup_time','cancelled_time','delivered_time','delivered_orders','alloted_orders'], axis = 1)
    df['accept_duration'] = pd.to_timedelta(df['accept_duration']).dt.total_seconds()
    lb = preprocessing.LabelEncoder()
    df['order_date'] = lb.fit_transform(df['order_date'])
    return df
    
def fix_columns_2(df):
    df = df.drop(['order_id', 'reassigned_order', 'reassignment_method', 'reassignment_reason','order_time','rider_id','first_mile_distance'], axis = 1)
    df['allot_time'] = pd.to_datetime(df['allot_time'])
    df['accept_time'] = pd.to_datetime(df['accept_time'])
    t1 = df['accept_time'] - df['allot_time']
    df['accept_duration'] = t1
    df = df.drop(['allot_time','accept_time','delivered_orders','alloted_orders'], axis = 1)
    df['accept_duration'] = pd.to_timedelta(df['accept_duration']).dt.total_seconds()
    lb = preprocessing.LabelEncoder()
    df['order_date'] = lb.fit_transform(df['order_date'])
    return df

def fix_na(df):
    lr = LinearRegression()
    imp = IterativeImputer(estimator = lr, tol = 1e-10, max_iter = 50, verbose = 2, imputation_order = 'roman')
    imp.fit_transform(df)
    df = imp.transform(df)
    df = pd.DataFrame(df)
    return df

def scale_numeric(df):
    x = df.values 
    scaler = preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)
    return df

In [191]:
test_df

Unnamed: 0,order_date,last_mile_distance,undelivered_orders,lifetime_order_count,accept_duration
0,0,4.54,1.0,747.0,26.0
1,0,5.84,0.0,75.0,9.0
2,0,0.99,0.0,2214.0,80.0
3,0,2.59,3.0,1020.0,23.0
4,0,0.94,2.0,7284.0,60.0
...,...,...,...,...,...
144839,3,3.96,0.0,413.0,9.0
144840,3,1.61,2.0,284.0,34.0
144841,3,1.26,0.0,119.0,67.0
144842,3,5.50,3.0,1759.0,24.0


In [147]:
xgb()



Accuracy of model 0.9890838177498169
[[132360     14]
 [  1447     17]]


In [194]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

order_id = test_df['order_id']

train_df = fix_columns(train_df)
test_df = fix_columns_2(test_df)


train_df['order_date'] = lb.fit_transform(train_df['order_date'])
test_df['order_date'] = lb.fit_transform(test_df['order_date'])

def fix_na_2(df):
    df['session_time']=df['session_time'].fillna(method='bfill')
    df['undelivered_orders']=df['undelivered_orders'].fillna(1)
    df['lifetime_order_count'] = df['lifetime_order_count'].fillna(method = 'bfill')
    df['accept_duration'] = df['accept_duration'].fillna(method = 'bfill')
    return df

train_df = fix_na_2(train_df)
test_df = fix_na_2(test_df)

test_df = scale_numeric(test_df)

y = train_df['cancelled']
X = train_df.drop(['cancelled'], axis = 1)



In [185]:
def fix_na_2(df):
    df['undelivered_orders']=df['undelivered_orders'].fillna(1)
    df['lifetime_order_count'] = df['lifetime_order_count'].fillna(method = 'bfill')
    df['accept_duration'] = df['accept_duration'].fillna(method = 'bfill')
    return df

In [200]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

order_id = test_df['order_id']

train_df = fix_columns(train_df)
test_df = fix_columns_2(test_df)

train_df['order_date'] = lb.fit_transform(train_df['order_date'])
test_df['order_date'] = lb.fit_transform(test_df['order_date'])

train_df = scale_numeric(train_df)
test_df = scale_numeric(test_df)


train_df

Unnamed: 0,0,1,2,3,4,5,6
0,-1.593360,-0.169242,-0.108312,-0.716536,-0.154787,,-0.483978
1,-1.593360,-0.110859,-0.108312,-0.716536,-0.498106,-1.229153,-0.121034
2,-1.593360,0.971873,-0.108312,-0.716536,-0.524055,-1.192087,-0.418834
3,-1.593360,1.810460,-0.108312,-0.716536,-0.483468,-1.148420,-0.186178
4,-1.593360,0.552580,-0.108312,-0.716536,-0.512078,-1.239999,-0.232709
...,...,...,...,...,...,...,...
449995,1.767453,-1.474890,-0.108312,-0.716536,-0.483468,0.843409,-0.288547
449996,1.767453,-0.944139,-0.108312,-0.716536,-0.498106,0.105586,0.139541
449997,1.767453,-0.721223,-0.108312,-0.716536,0.422069,-0.092380,-0.260628
449998,1.767453,0.908183,-0.108312,-0.716536,-0.498106,-0.876511,0.753754


In [203]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

order_id = test_df['order_id']

train_df = fix_columns(train_df)
test_df = fix_columns_2(test_df)

train_df['order_date'] = lb.fit_transform(train_df['order_date'])
test_df['order_date'] = lb.fit_transform(test_df['order_date'])



train_df = fix_na_2(train_df)
test_df = fix_na_2(test_df)

y = train_df['cancelled']
X = train_df.drop(['cancelled'], axis = 1)

test_df = scale_numeric(test_df)
X = scale_numeric(X)


model = RandomForestClassifier(n_estimators=20, random_state=2)
preds = model.fit(X, y).predict(test_df)


In [205]:
y.value_counts()

0    444782
1      5218
Name: cancelled, dtype: int64

In [206]:
type(preds)

numpy.ndarray

In [207]:
unique, counts = np.unique(preds, return_counts=True)
dict(zip(unique, counts))

{0: 144826, 1: 18}

In [208]:
type(order_id)

pandas.core.series.Series

In [209]:
preds = pd.Series(preds)

In [210]:
preds.value_counts()

0    144826
1        18
dtype: int64

In [211]:
ans = pd.concat([order_id,preds], axis=1)

In [93]:
ans

Unnamed: 0,order_id,0
0,130231,0
1,130232,0
2,130233,0
3,130234,0
4,130235,0
...,...,...
144839,41184,0
144840,41185,0
144841,41186,0
144842,41187,0


In [212]:
ans.rename(columns={0:'cancelled'}, inplace=True)

In [95]:
ans

Unnamed: 0,order_id,cancelled
0,130231,0
1,130232,0
2,130233,0
3,130234,0
4,130235,0
...,...,...
144839,41184,0
144840,41185,0
144841,41186,0
144842,41187,0


In [213]:
ans.to_csv('ans3.csv', index=False)

In [214]:
ans.columns

Index(['order_id', 'cancelled'], dtype='object')

In [100]:
train_df

Unnamed: 0,first_mile_distance,last_mile_distance,cancelled,undelivered_orders,lifetime_order_count,accept_duration
0,1.5666,2.65,0,0.0,621.0,9.0
1,2.5207,2.76,0,0.0,105.0,48.0
2,2.2074,4.80,0,0.0,66.0,16.0
3,2.1894,6.38,0,0.0,127.0,41.0
4,2.7870,4.01,0,0.0,84.0,36.0
...,...,...,...,...,...,...
449995,0.5789,0.19,0,0.0,127.0,30.0
449996,1.9863,1.19,0,0.0,105.0,76.0
449997,1.5944,1.61,0,0.0,1488.0,33.0
449998,2.8939,4.68,0,0.0,105.0,142.0
