In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

%matplotlib inline

In [2]:
from sklearn.utils import resample

In [3]:
import time

In [4]:
raw_df = pd.read_csv('train_data.csv')

In [5]:
raw_df[raw_df.dissatisfaction==0].dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322949 entries, 0 to 339599
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              322949 non-null  int64  
 1   gender                322949 non-null  object 
 2   order_time            322949 non-null  object 
 3   allot_time            322949 non-null  object 
 4   pickup_time           322949 non-null  object 
 5   delivered_time        322949 non-null  object 
 6   transport_id          322949 non-null  int64  
 7   first_mile_distance   322949 non-null  float64
 8   last_mile_distance    322949 non-null  float64
 9   alloted_orders        322949 non-null  float64
 10  delivered_orders      322949 non-null  float64
 11  customer_care_calls   322949 non-null  float64
 12  membership            322949 non-null  object 
 13  product_importan      322949 non-null  object 
 14  lifetime_order_count  322949 non-null  float64
 15  

## Resampling the data to make it balanced

In [6]:
no_diss = raw_df[raw_df.dissatisfaction==0].dropna()
diss = raw_df[raw_df.dissatisfaction==1]

In [7]:
down_no_diss = resample(no_diss,n_samples=2*len(diss),replace=False,random_state=42)

In [8]:
norm_df = pd.concat([diss,down_no_diss])
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12015 entries, 20 to 53441
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              12015 non-null  int64  
 1   gender                12015 non-null  object 
 2   order_time            12015 non-null  object 
 3   allot_time            12015 non-null  object 
 4   pickup_time           10131 non-null  object 
 5   delivered_time        8011 non-null   object 
 6   transport_id          12015 non-null  int64  
 7   first_mile_distance   12015 non-null  float64
 8   last_mile_distance    12015 non-null  float64
 9   alloted_orders        11517 non-null  float64
 10  delivered_orders      11353 non-null  float64
 11  customer_care_calls   12015 non-null  float64
 12  membership            12015 non-null  object 
 13  product_importan      12015 non-null  object 
 14  lifetime_order_count  11977 non-null  float64
 15  undelivered_orders

In [9]:
norm_df_copy = norm_df.copy()

In [10]:
norm_df['delivered_time'].fillna('31-12-9999 0:0',inplace=True)
norm_df['pickup_time'].fillna('31-12-8999 0:0',inplace=True)
#norm_df['lifetime_order_count'].fillna(norm_df['delivered_orders'].apply(lambda x: 8.32*x),inplace=True)
#norm_df['delivered_orders'].fillna(norm_df['lifetime_order_count'].apply(lambda x: x/8.32),inplace=True)

In [11]:
norm_df.dropna()['dissatisfaction'].value_counts()

0    8010
1    3343
Name: dissatisfaction, dtype: int64

In [12]:
3343*100/(3343+8010)

29.445961419889016

In [13]:
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12015 entries, 20 to 53441
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              12015 non-null  int64  
 1   gender                12015 non-null  object 
 2   order_time            12015 non-null  object 
 3   allot_time            12015 non-null  object 
 4   pickup_time           12015 non-null  object 
 5   delivered_time        12015 non-null  object 
 6   transport_id          12015 non-null  int64  
 7   first_mile_distance   12015 non-null  float64
 8   last_mile_distance    12015 non-null  float64
 9   alloted_orders        11517 non-null  float64
 10  delivered_orders      11353 non-null  float64
 11  customer_care_calls   12015 non-null  float64
 12  membership            12015 non-null  object 
 13  product_importan      12015 non-null  object 
 14  lifetime_order_count  11977 non-null  float64
 15  undelivered_orders

In [14]:
norm_df_wna = norm_df.copy()

In [15]:
norm_df.dropna(inplace=True)

In [16]:
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 20 to 53441
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              11353 non-null  int64  
 1   gender                11353 non-null  object 
 2   order_time            11353 non-null  object 
 3   allot_time            11353 non-null  object 
 4   pickup_time           11353 non-null  object 
 5   delivered_time        11353 non-null  object 
 6   transport_id          11353 non-null  int64  
 7   first_mile_distance   11353 non-null  float64
 8   last_mile_distance    11353 non-null  float64
 9   alloted_orders        11353 non-null  float64
 10  delivered_orders      11353 non-null  float64
 11  customer_care_calls   11353 non-null  float64
 12  membership            11353 non-null  object 
 13  product_importan      11353 non-null  object 
 14  lifetime_order_count  11353 non-null  float64
 15  undelivered_orders

## 1) Converting string to time, encoding necessary columns
## 2) Calculating time for delivery, pick-up & respective speeds 

In [17]:
norm_df['order_time'] = norm_df['order_time'].apply(lambda x : time.strptime(x,'%d-%m-%Y %H:%M'))
norm_df['allot_time'] = norm_df['allot_time'].apply(lambda x : time.strptime(x,'%d-%m-%Y %H:%M'))
norm_df['pickup_time'] = norm_df['pickup_time'].apply(lambda x : time.strptime(x,'%d-%m-%Y %H:%M'))
norm_df['delivered_time'] = norm_df['delivered_time'].apply(lambda x : time.strptime(x,'%d-%m-%Y %H:%M'))

norm_df['time_to_pickup'] = norm_df.apply(lambda x : (time.mktime(x['pickup_time'])-time.mktime(x['allot_time']))/60,axis=1)
norm_df['time_to_delivery'] = norm_df.apply(lambda x : (time.mktime(x['delivered_time'])-time.mktime(x['pickup_time']))/60,axis=1)

norm_df['pickup_speed'] = (norm_df['first_mile_distance']*60)/(0.5+norm_df['time_to_pickup'])
norm_df['delivery_speed'] = (norm_df['last_mile_distance']*60)/(0.5+norm_df['time_to_delivery'])

norm_df['total_time'] = norm_df.apply(lambda x : (time.mktime(x['delivered_time'])-time.mktime(x['order_time']))/60,axis=1)

norm_df['prime_member'] = norm_df['membership'].apply(lambda x : 1 if x=='Prime' else 0)
norm_df['product_importance'] = norm_df['product_importan'].apply(lambda x : 0 if x=='low' else (1 if x=='other' else 2))

norm_df['gender_F'] = norm_df['gender'].apply(lambda x : 1 if x=='F' else 0)

In [18]:
norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 20 to 53441
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   order_id              11353 non-null  int64  
 1   gender                11353 non-null  object 
 2   order_time            11353 non-null  object 
 3   allot_time            11353 non-null  object 
 4   pickup_time           11353 non-null  object 
 5   delivered_time        11353 non-null  object 
 6   transport_id          11353 non-null  int64  
 7   first_mile_distance   11353 non-null  float64
 8   last_mile_distance    11353 non-null  float64
 9   alloted_orders        11353 non-null  float64
 10  delivered_orders      11353 non-null  float64
 11  customer_care_calls   11353 non-null  float64
 12  membership            11353 non-null  object 
 13  product_importan      11353 non-null  object 
 14  lifetime_order_count  11353 non-null  float64
 15  undelivered_orders

In [19]:
norm_df.head(10)

Unnamed: 0,order_id,gender,order_time,allot_time,pickup_time,delivered_time,transport_id,first_mile_distance,last_mile_distance,alloted_orders,...,undelivered_orders,dissatisfaction,time_to_pickup,time_to_delivery,pickup_speed,delivery_speed,total_time,prime_member,product_importance,gender_F
20,1003470,M,"(2021, 1, 26, 3, 28, 0, 1, 26, -1)","(2021, 1, 26, 4, 9, 0, 1, 26, -1)","(2021, 1, 27, 3, 59, 0, 2, 27, -1)","(2021, 1, 27, 4, 15, 0, 2, 27, -1)",12885,28.438,102.15,29.0,...,0.0,1,1430.0,16.0,1.192786,371.4545,1487.0,0,0,0
68,1003518,F,"(2021, 1, 26, 3, 45, 0, 1, 26, -1)","(2021, 1, 26, 3, 45, 0, 1, 26, -1)","(8999, 12, 31, 0, 0, 0, 1, 365, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",17557,8.617,39.45,13.0,...,0.0,1,3670561000.0,525948500.0,1.408558e-07,4.500441e-06,4196510000.0,1,2,1
100,1003550,M,"(2021, 1, 26, 3, 54, 0, 1, 26, -1)","(2021, 1, 26, 5, 29, 0, 1, 26, -1)","(8999, 12, 31, 0, 0, 0, 1, 365, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",2391,29.217,78.0,118.0,...,4.0,1,3670561000.0,525948500.0,4.775891e-07,8.89821e-06,4196510000.0,0,2,0
163,1003613,F,"(2021, 1, 26, 4, 11, 0, 1, 26, -1)","(2021, 1, 26, 4, 12, 0, 1, 26, -1)","(8999, 12, 31, 0, 0, 0, 1, 365, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",15229,21.287,6.75,55.0,...,3.0,1,3670561000.0,525948500.0,3.479631e-07,7.700374e-07,4196510000.0,0,0,1
295,1003745,M,"(2021, 1, 26, 4, 39, 0, 1, 26, -1)","(2021, 1, 26, 4, 39, 0, 1, 26, -1)","(8999, 12, 31, 0, 0, 0, 1, 365, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",2373,16.053,72.6,104.0,...,0.0,1,3670561000.0,525948500.0,2.624067e-07,8.28218e-06,4196510000.0,1,2,0
317,1003767,M,"(2021, 1, 26, 4, 43, 0, 1, 26, -1)","(2021, 1, 26, 4, 44, 0, 1, 26, -1)","(2021, 1, 26, 4, 55, 0, 1, 26, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",2049,20.333,88.95,40.0,...,0.0,1,11.0,4196510000.0,106.0852,1.271771e-06,4196510000.0,1,0,0
345,1003795,M,"(2021, 1, 26, 4, 48, 0, 1, 26, -1)","(2021, 1, 26, 4, 49, 0, 1, 26, -1)","(2021, 1, 26, 5, 0, 0, 1, 26, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",118,7.327,5.1,81.0,...,0.0,1,11.0,4196510000.0,38.22783,7.291774e-08,4196510000.0,0,2,0
445,1003895,F,"(2021, 1, 26, 5, 3, 0, 1, 26, -1)","(2021, 1, 26, 5, 3, 0, 1, 26, -1)","(2021, 1, 26, 5, 24, 0, 1, 26, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",13577,30.004,43.2,12.0,...,0.0,1,21.0,4196510000.0,83.73209,6.176562e-07,4196510000.0,1,2,1
651,1004101,M,"(2021, 1, 26, 5, 29, 0, 1, 26, -1)","(2021, 1, 26, 5, 37, 0, 1, 26, -1)","(8999, 12, 31, 0, 0, 0, 1, 365, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",8458,24.691,15.75,8.0,...,0.0,1,3670561000.0,525948500.0,4.036059e-07,1.796754e-06,4196510000.0,0,0,0
774,1004224,F,"(2021, 1, 26, 5, 45, 0, 1, 26, -1)","(2021, 1, 26, 5, 45, 0, 1, 26, -1)","(2021, 1, 26, 5, 57, 0, 1, 26, -1)","(9999, 12, 31, 0, 0, 0, 4, 365, -1)",16856,13.706,80.4,53.0,...,1.0,1,12.0,4196510000.0,65.7888,1.149527e-06,4196510000.0,0,2,1


In [20]:
#norm_df.to_csv('balanced_data_full.csv',index=False)

In [21]:
final_norm_df = norm_df.drop(['order_id','gender','product_importan','membership','order_time','allot_time',
                             'pickup_time','delivered_time','transport_id'],axis=1)

In [22]:
final_norm_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11353 entries, 20 to 53441
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   first_mile_distance   11353 non-null  float64
 1   last_mile_distance    11353 non-null  float64
 2   alloted_orders        11353 non-null  float64
 3   delivered_orders      11353 non-null  float64
 4   customer_care_calls   11353 non-null  float64
 5   lifetime_order_count  11353 non-null  float64
 6   undelivered_orders    11353 non-null  float64
 7   dissatisfaction       11353 non-null  int64  
 8   time_to_pickup        11353 non-null  float64
 9   time_to_delivery      11353 non-null  float64
 10  pickup_speed          11353 non-null  float64
 11  delivery_speed        11353 non-null  float64
 12  total_time            11353 non-null  float64
 13  prime_member          11353 non-null  int64  
 14  product_importance    11353 non-null  int64  
 15  gender_F          

In [23]:
final_norm_df.head()

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,customer_care_calls,lifetime_order_count,undelivered_orders,dissatisfaction,time_to_pickup,time_to_delivery,pickup_speed,delivery_speed,total_time,prime_member,product_importance,gender_F
20,28.438,102.15,29.0,29.0,2.0,449.0,0.0,1,1430.0,16.0,1.192786,371.4545,1487.0,0,0,0
68,8.617,39.45,13.0,13.0,4.0,168.0,0.0,1,3670561000.0,525948480.0,1.408558e-07,4.500441e-06,4196510000.0,1,2,1
100,29.217,78.0,118.0,114.0,4.0,860.0,4.0,1,3670561000.0,525948480.0,4.775891e-07,8.89821e-06,4196510000.0,0,2,0
163,21.287,6.75,55.0,52.0,1.0,256.0,3.0,1,3670561000.0,525948480.0,3.479631e-07,7.700374e-07,4196510000.0,0,0,1
295,16.053,72.6,104.0,104.0,5.0,3282.0,0.0,1,3670561000.0,525948480.0,2.624067e-07,8.28218e-06,4196510000.0,1,2,0


In [24]:
final_norm_df['dissatisfaction'].value_counts()

0    8010
1    3343
Name: dissatisfaction, dtype: int64

In [25]:
final_norm_df[final_norm_df['time_to_pickup']>0]['dissatisfaction'].value_counts()

0    8009
1    3334
Name: dissatisfaction, dtype: int64

In [26]:
sc_final_df = final_norm_df[final_norm_df['time_to_pickup']>0]

In [27]:
big_cols = ['first_mile_distance','last_mile_distance','pickup_speed','delivery_speed','total_time',
            'time_to_pickup','time_to_delivery']

for col in big_cols:
    sc_final_df[col] = np.log(1+sc_final_df[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sc_final_df[col] = np.log(1+sc_final_df[col])


In [28]:
sc_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11343 entries, 20 to 53441
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   first_mile_distance   11343 non-null  float64
 1   last_mile_distance    11343 non-null  float64
 2   alloted_orders        11343 non-null  float64
 3   delivered_orders      11343 non-null  float64
 4   customer_care_calls   11343 non-null  float64
 5   lifetime_order_count  11343 non-null  float64
 6   undelivered_orders    11343 non-null  float64
 7   dissatisfaction       11343 non-null  int64  
 8   time_to_pickup        11343 non-null  float64
 9   time_to_delivery      11343 non-null  float64
 10  pickup_speed          11343 non-null  float64
 11  delivery_speed        11343 non-null  float64
 12  total_time            11343 non-null  float64
 13  prime_member          11343 non-null  int64  
 14  product_importance    11343 non-null  int64  
 15  gender_F          

In [29]:
sc_final_df.head(20)

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,customer_care_calls,lifetime_order_count,undelivered_orders,dissatisfaction,time_to_pickup,time_to_delivery,pickup_speed,delivery_speed,total_time,prime_member,product_importance,gender_F
20,3.382286,4.636184,29.0,29.0,2.0,449.0,0.0,1,7.266129,2.833213,0.7851728,5.920115,7.305188,0,0,0
68,2.263532,3.700067,13.0,13.0,4.0,168.0,0.0,1,22.02361,20.080714,1.408558e-07,4.500431e-06,22.157519,1,2,1
100,3.408405,4.369448,118.0,114.0,4.0,860.0,4.0,1,22.02361,20.080714,4.77589e-07,8.89817e-06,22.157519,0,2,0
163,3.104004,2.047693,55.0,52.0,1.0,256.0,3.0,1,22.02361,20.080714,3.479631e-07,7.700371e-07,22.157519,0,0,1
295,2.836326,4.298645,104.0,104.0,5.0,3282.0,0.0,1,22.02361,20.080714,2.624067e-07,8.282146e-06,22.157519,1,2,0
317,3.060255,4.499254,40.0,40.0,4.0,2635.0,0.0,1,2.484907,22.157519,4.673625,1.27177e-06,22.157519,1,0,0
345,2.119503,1.808289,81.0,81.0,2.0,2852.0,0.0,1,2.484907,22.157519,3.669386,7.291774e-08,22.157519,0,2,0
445,3.434116,3.788725,12.0,12.0,3.0,199.0,0.0,1,3.091042,22.157519,4.439494,6.17656e-07,22.157519,1,2,1
651,3.246141,2.818398,8.0,8.0,3.0,41.0,0.0,1,22.02361,20.080714,4.036058e-07,1.796752e-06,22.157519,0,0,0
774,2.688256,4.399375,53.0,52.0,5.0,1451.0,1.0,1,2.564949,22.157519,4.201535,1.149526e-06,22.157519,0,2,1


In [30]:
X = sc_final_df.drop('dissatisfaction',axis=1)
y = sc_final_df['dissatisfaction']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [31]:
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train,y_train)

In [32]:
print(classification_report(y_test,rf.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1990
           1       1.00      1.00      1.00       846

    accuracy                           1.00      2836
   macro avg       1.00      1.00      1.00      2836
weighted avg       1.00      1.00      1.00      2836



In [33]:
#i give up