# Traffic Jam - Predicting People's Movement into Nairobi

We'll create a predictive model using traffic data provided from Uber Movement and historic bus ticket sales data from Mobiticket to predict the number of tickets that will be sold for buses into Nairobi from cities in "up country" Kenya.


In [347]:
import numpy as np
import pandas as pd

In [348]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)


### 1. Data

In [365]:
df_raw = pd.read_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\train_revised.csv')

In [366]:
df_raw.head()

Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49


Original train_revised.csv consists of a list of tickets. We need to aggregate them to know how many passengers are for a given ride_id

In [367]:
ride_id_dict = {} 
for ride_id in df_raw["ride_id"]:
    if not ride_id in ride_id_dict:
        ride_id_dict[ride_id] = 1
    else:
        ride_id_dict[ride_id] += 1

In [374]:
df_processed = df_raw.drop(['seat_number', 'payment_method','payment_receipt', 'travel_to'], axis=1)

In [375]:
df_processed.drop_duplicates(inplace=True)
df_processed.reset_index(drop= True, inplace=True)

In [376]:
df_processed["number_of_tickets"]= np.zeros(len(df_processed))

Let's update a number of tickets for a given ride.

In [377]:
for i in range(len(df_processed)):
    ride_id = df_processed.loc[i]["ride_id"]
    df_processed.at[i,"number_of_tickets"] = ride_id_dict[ride_id]

In [378]:
df_processed.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [379]:
df_processed.to_csv(r'C:\Users\Patrick Munene\Documents\Machine Learning\Zindi - Traffic Jam\data\train_aggregated.csv', index=False)

In [338]:
df_train_set = pd.read_csv('train_aggregated.csv', low_memory=False)

In [339]:
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [340]:
#ride_id is unnecessary in training set
df_train_set.drop(['ride_id'], axis=1, inplace=True) 

In [341]:
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)

#change the full date to day of week
df_train_set["travel_date"] = df_train_set["travel_date"].dt.dayofweek 

In [342]:
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories
df_train_set["car_type"] = df_train_set.car_type.cat.codes

In [343]:
df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

In [344]:
#express travel time in minutes
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [345]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1,435,9,0,49,1.0
1,6,432,9,0,49,1.0
2,6,425,4,0,49,1.0
3,0,430,1,0,49,5.0
4,0,432,9,0,49,31.0


### Random Forest Model

In [346]:
X = (df_train_set.iloc[:,0:5].values).astype('float32')
X

array([[  1., 435.,   9.,   0.,  49.],
       [  6., 432.,   9.,   0.,  49.],
       [  6., 425.,   4.,   0.,  49.],
       ...,
       [  4., 429.,  14.,   0.,  49.],
       [  4., 480.,   0.,   0.,  49.],
       [  1., 310.,   7.,   0.,  49.]], dtype=float32)

In [301]:
Y = (df_train_set.iloc[:,5].values).astype('float32')
Y

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [302]:
Y

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [312]:
model = RandomForestRegressor(
    n_estimators=100, 
    criterion="mae", 
    n_jobs=-1)

In [313]:
model.fit(X,Y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [314]:
preds_train_set = model.predict(X)

In [315]:
print (mean_absolute_error(preds_train_set,Y))

3.173438150104017
