In [66]:
import pandas as pd

# Read the raw training set from the CSV file next to the notebook.
df = pd.read_csv('training.csv')

# List the column names so the available fields are visible up front.
column_names = list(df.columns)
print(f'headers: {column_names}')

headers: ['market_id', 'created_at', 'actual_delivery_time', 'store_id', 'subtotal', 'total_onshift_dashers', 'total_busy_dashers', 'total_outstanding_orders', 'estimated_store_to_consumer_driving_duration']


`store_id` appears to be an arbitrary identifier assigned at the database level, so it is dropped.

In [67]:
# Remove the store_id column; every other column is kept.
df = df.drop(columns=['store_id'])

In [68]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = df.describe()
print(summary_stats)

           market_id       subtotal  total_onshift_dashers  \
count  196441.000000  197428.000000          181166.000000   
mean        2.978706    2682.331402              44.808093   
std         1.524867    1823.093688              34.526783   
min         1.000000       0.000000              -4.000000   
25%         2.000000    1400.000000              17.000000   
50%         3.000000    2200.000000              37.000000   
75%         4.000000    3395.000000              65.000000   
max         6.000000   27100.000000             171.000000   

       total_busy_dashers  total_outstanding_orders  \
count       181166.000000             181166.000000   
mean            41.739747                 58.050065   
std             32.145733                 52.661830   
min             -5.000000                 -6.000000   
25%             15.000000                 17.000000   
50%             34.000000                 41.000000   
75%             62.000000                 85.000000   
m

### Data Cleaning
1. Dropping irrelevant columns.
2. Dropping duplicates.
3. Dropping rows that contain null values.

In [69]:
# Row count before de-duplication.
observation_count = len(df)
print(f'Number of observations: {observation_count}')

# Remove rows that are exact duplicates of an earlier row, mutating df in place.
df.drop_duplicates(inplace=True)

# Row count after de-duplication, for comparison with the figure above.
observation_count = len(df)
print(f'Number of observations: {observation_count}')

Number of observations: 197428
Number of observations: 197428


In [70]:
# Show every row that has at least one missing value, before the drop.
null_row_mask = df.isna().any(axis=1)
print(df[null_row_mask])

# Discard rows containing any missing value (dropna defaults: axis=0, how='any').
df = df.dropna()

# Report how many rows remain.
observation_count = len(df)
print(f'Number of observations: {observation_count}')

print('------------------------------')

# Show rows with missing values again, after the drop.
print(df[df.isna().any(axis=1)])



        market_id           created_at actual_delivery_time  subtotal  \
45            NaN  2015-02-09 03:27:37  2015-02-09 04:22:18      2400   
92            3.0  2015-01-29 18:56:20  2015-01-29 19:26:39       975   
109           3.0  2015-02-10 21:51:54                  NaN      1125   
160           6.0  2015-02-06 01:11:56  2015-02-06 01:42:51       575   
161           6.0  2015-02-14 02:07:47  2015-02-14 03:17:37      1415   
...           ...                  ...                  ...       ...   
197199        3.0  2015-01-24 03:15:41  2015-01-24 04:04:19      2776   
197209        1.0  2015-01-29 20:15:51  2015-01-29 21:03:01      7200   
197259        NaN  2015-02-10 01:32:37  2015-02-10 02:02:09       849   
197416        1.0  2015-01-29 19:39:17  2015-01-29 20:34:21       639   
197421        1.0  2015-01-30 03:35:01  2015-01-30 04:42:19       979   

        total_onshift_dashers  total_busy_dashers  total_outstanding_orders  \
45                       94.0               