# Traffic Jam - Predicting People's Movement into Nairobi

We'll create a predictive model using traffic data provided by Uber Movement and historical bus ticket sales data from Mobiticket to predict the number of tickets that will be sold for buses into Nairobi from cities in "up country" Kenya.


In [211]:
import numpy as np
import pandas as pd

In [253]:
# Fix NumPy's global random seed so that any np.random-based step gives the
# same results on every run of the notebook.
seed = 7
np.random.seed(seed)


### 1. Data

In [254]:
# Load the raw training data (train_revised.csv) into df_raw.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on the author's machine; consider a configurable data directory.
df_raw = pd.read_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\train_revised.csv')

In [255]:
# Preview the first rows of the raw data as loaded.
df_raw.head()

Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49


The original train_revised.csv is a list of individual tickets. We need to aggregate them to find out how many tickets were sold for each ride_id.

In [256]:
# Count how many rows of df_raw (one row per ticket) share each ride_id.
# value_counts() does the counting in pandas rather than in a Python loop;
# to_dict() keeps ride_id_dict a plain {ride_id: count} dict, so lookups such
# as ride_id_dict[ride_id] keep working.
ride_id_dict = df_raw["ride_id"].value_counts().to_dict()

In [257]:
# Remove the columns that are not carried into the per-ride frame.
ticket_level_columns = ['seat_number', 'payment_method', 'payment_receipt', 'travel_to', 'car_type']
df_processed = df_raw.drop(columns=ticket_level_columns)

In [258]:
# Drop rows that are exact duplicates of an earlier row, then renumber the
# index from 0 so that positions and labels coincide.
df_processed = df_processed.drop_duplicates().reset_index(drop=True)

In [259]:
# Preview the frame after dropping columns and duplicate rows.
df_processed.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,max_capacity
0,1442,17-10-17,7:15,Migori,49
1,5437,19-11-17,7:12,Migori,49
2,5710,26-11-17,7:05,Keroka,49
3,5777,27-11-17,7:10,Homa Bay,49
4,5778,27-11-17,7:12,Migori,49


Let's add the number of tickets sold for each ride.

In [260]:
# Attach the ticket count for each ride by looking its ride_id up in
# ride_id_dict. Series.map does the lookup for the whole column at once and,
# unlike a positional loop over range(len(df_processed)), does not depend on
# the index being exactly 0..n-1.
# The column is kept as float64 so the displayed values and the saved CSV
# look the same as before (1.0, 5.0, 31.0, ...).
df_processed["number_of_tickets"] = (
    df_processed["ride_id"].map(ride_id_dict).astype("float64")
)

In [261]:
# Check that number_of_tickets has been filled in.
df_processed.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,49,1.0
1,5437,19-11-17,7:12,Migori,49,1.0
2,5710,26-11-17,7:05,Keroka,49,1.0
3,5777,27-11-17,7:10,Homa Bay,49,5.0
4,5778,27-11-17,7:12,Migori,49,31.0


In [262]:
# Write df_processed (now including number_of_tickets) to
# pre_train_aggregated.csv, without the index column.
# NOTE(review): absolute, machine-specific path.
df_processed.to_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\pre_train_aggregated.csv', index=False)

In [263]:
# Read pre_train_aggregated.csv back in as df_train_set, the frame that the
# following cells turn into features.
# NOTE(review): absolute, machine-specific path.
df_train_set = pd.read_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\pre_train_aggregated.csv', low_memory=False)

In [264]:
# Preview the reloaded frame.
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,49,1.0
1,5437,19-11-17,7:12,Migori,49,1.0
2,5710,26-11-17,7:05,Keroka,49,1.0
3,5777,27-11-17,7:10,Homa Bay,49,5.0
4,5778,27-11-17,7:12,Migori,49,31.0


Extract the day, month and year from the travel_date column.

In [265]:
# travel_date holds day-month-year strings with a two-digit year
# (e.g. "17-10-17" is 17 October 2017). Parse them once with an explicit
# format: without it, an ambiguous date such as "05-12-17" can be read
# month-first (12 May) while "17-10-17" is read day-first, silently mixing
# up day and month. An unexpected format now raises instead of being guessed.
travel_dates = pd.to_datetime(df_train_set['travel_date'], format='%d-%m-%y')
df_train_set['year'] = travel_dates.dt.year
df_train_set['month'] = travel_dates.dt.month
df_train_set['day'] = travel_dates.dt.day

In [266]:
# ride_id is not used as a feature in the training set, so drop it.
df_train_set = df_train_set.drop(columns=['ride_id'])

In [267]:
# Check the new year / month / day columns.
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,max_capacity,number_of_tickets,year,month,day
0,17-10-17,7:15,Migori,49,1.0,2017,10,17
1,19-11-17,7:12,Migori,49,1.0,2017,11,19
2,26-11-17,7:05,Keroka,49,1.0,2017,11,26
3,27-11-17,7:10,Homa Bay,49,5.0,2017,11,27
4,27-11-17,7:12,Migori,49,31.0,2017,11,27


In [268]:
# Parse the day-month-year strings (two-digit year, e.g. "17-10-17") with an
# explicit format. infer_datetime_format is deprecated in recent pandas, and
# guessing the format can read ambiguous dates month-first.
parsed_travel_date = pd.to_datetime(df_train_set["travel_date"], format="%d-%m-%y")

# Replace the full date with its day of week (Monday=0 ... Sunday=6).
# The column keeps the name travel_date.
df_train_set["travel_date"] = parsed_travel_date.dt.dayofweek

In [269]:
# travel_date now holds the day of week rather than a date.
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,max_capacity,number_of_tickets,year,month,day
0,1,7:15,Migori,49,1.0,2017,10,17
1,6,7:12,Migori,49,1.0,2017,11,19
2,6,7:05,Keroka,49,1.0,2017,11,26
3,0,7:10,Homa Bay,49,5.0,2017,11,27
4,0,7:12,Migori,49,31.0,2017,11,27


In [270]:
# Disabled: car_type is dropped when df_processed is built, so df_train_set
# has no such column and this encoding cannot run. Kept for reference only.
#df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
#car_type_categories = df_train_set.car_type.cat.categories
#df_train_set["car_type"] = df_train_set.car_type.cat.codes

In [271]:
# Encode the origin town as an integer code. travel_from_categories keeps the
# category labels, so that code k corresponds to travel_from_categories[k].
travel_from_categorical = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = travel_from_categorical.categories
df_train_set["travel_from"] = travel_from_categorical.codes

In [272]:
def _clock_parts_to_minutes(parts):
    """Turn the split pieces of an "H:MM" string into minutes since midnight."""
    hours = int(parts[0])
    minutes = int(parts[1])
    return hours * 60 + minutes


# Express travel time in minutes since midnight (e.g. "7:15" -> 435).
split_travel_time = df_train_set["travel_time"].str.split(':')
df_train_set["travel_time"] = split_travel_time.apply(_clock_parts_to_minutes)

In [273]:
# Check the encoded travel_time and travel_from columns.
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,max_capacity,number_of_tickets,year,month,day
0,1,435,9,49,1.0,2017,10,17
1,6,432,9,49,1.0,2017,11,19
2,6,425,4,49,1.0,2017,11,26
3,0,430,1,49,5.0,2017,11,27
4,0,432,9,49,31.0,2017,11,27


In [274]:
# Write the encoded training frame to train_aggregated.csv, without the
# index column.
# NOTE(review): absolute, machine-specific path.
df_train_set.to_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\train_aggregated.csv', index=False)