## Data preprocessing

In [1]:
#Import dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
#upload data

train_data=pd.read_csv("train_revised.csv")
test_data=pd.read_csv("test_questions.csv")


In [3]:
train_data.head()


Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49


In [4]:
test_data.head()


Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11


In [5]:
train_data.shape


(51645, 10)

In [6]:
#number of tickets
def tickets_no(df):
    #no of ticket per ride
    train_data["tickets"] = train_data.groupby(["ride_id"])["ride_id"].transform("size")
    
    #remove repeated id
    train_data.drop_duplicates(["ride_id"],keep="first",inplace=True)
    return train_data


In [7]:
tickets_no(train_data).head()


Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,tickets
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49,1
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49,1
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49,1
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49,5
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49,31


In [8]:
import datetime

def split_travel_times(df):
    train_data['travel_date'] = pd.to_datetime(df['travel_date'], infer_datetime_format=True)
    train_data['travel_time'] = pd.to_datetime(df['travel_time'])
    train_data['year'] = df['travel_date'].dt.year
    train_data['month'] = df['travel_date'].dt.month
    train_data['day'] = df['travel_date'].dt.day
    train_data['day_of_week'] = df['travel_date'].dt.dayofweek
    train_data['hour'] = df['travel_time'].dt.hour
    train_data['minute'] = df['travel_time'].dt.minute
    
    return train_data

In [9]:
split_travel_times(train_data).head()


Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,tickets,year,month,day,day_of_week,hour,minute
0,1442,15A,Mpesa,UZUEHCBUSO,2017-10-17,2021-07-19 07:15:00,Migori,Nairobi,Bus,49,1,2017,10,17,1,7,15
1,5437,14A,Mpesa,TIHLBUSGTE,2017-11-19,2021-07-19 07:12:00,Migori,Nairobi,Bus,49,1,2017,11,19,6,7,12
2,5710,8B,Mpesa,EQX8Q5G19O,2017-11-26,2021-07-19 07:05:00,Keroka,Nairobi,Bus,49,1,2017,11,26,6,7,5
3,5777,19A,Mpesa,SGP18CL0ME,2017-11-27,2021-07-19 07:10:00,Homa Bay,Nairobi,Bus,49,5,2017,11,27,0,7,10
4,5778,11A,Mpesa,BM97HFRGL9,2017-11-27,2021-07-19 07:12:00,Migori,Nairobi,Bus,49,31,2017,11,27,0,7,12


In [10]:
train_data.drop(['seat_number', 'payment_method', 'payment_receipt', 'car_type', 'travel_to', 'travel_date', 'travel_time'], axis=1, inplace=True)
cols_interest=['ride_id', 'travel_from', 'max_capacity', 'year', 'month', 'day', 'day_of_week', 'hour', 'minute', 'tickets']
new_train_data=train_data[cols_interest]

In [11]:
train_data.head()


Unnamed: 0,ride_id,travel_from,max_capacity,tickets,year,month,day,day_of_week,hour,minute
0,1442,Migori,49,1,2017,10,17,1,7,15
1,5437,Migori,49,1,2017,11,19,6,7,12
2,5710,Keroka,49,1,2017,11,26,6,7,5
3,5777,Homa Bay,49,5,2017,11,27,0,7,10
4,5778,Migori,49,31,2017,11,27,0,7,12


In [12]:
train_data['travel_from'] = train_data['travel_from'].astype('category')
originCategories = train_data['travel_from'].cat.categories
train_data['origin'] = train_data['travel_from'].cat.codes

In [13]:
train_data.head()


Unnamed: 0,ride_id,travel_from,max_capacity,tickets,year,month,day,day_of_week,hour,minute,origin
0,1442,Migori,49,1,2017,10,17,1,7,15,9
1,5437,Migori,49,1,2017,11,19,6,7,12,9
2,5710,Keroka,49,1,2017,11,26,6,7,5,4
3,5777,Homa Bay,49,5,2017,11,27,0,7,10,1
4,5778,Migori,49,31,2017,11,27,0,7,12,9


In [14]:
train_data.drop(['travel_from', 'ride_id'], axis=1, inplace=True)


In [15]:
train_data.head()


Unnamed: 0,max_capacity,tickets,year,month,day,day_of_week,hour,minute,origin
0,49,1,2017,10,17,1,7,15,9
1,49,1,2017,11,19,6,7,12,9
2,49,1,2017,11,26,6,7,5,4
3,49,5,2017,11,27,0,7,10,1
4,49,31,2017,11,27,0,7,12,9


In [16]:
X = train_data.drop(['tickets'], axis=1)
y = train_data['tickets']

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
model = RandomForestRegressor(n_estimators=100, criterion="mae", n_jobs=-1)

model.fit(X,y)


RandomForestRegressor(criterion='mae', n_jobs=-1)

In [19]:
preds_train_set = model.predict(X)



In [20]:
print (mean_absolute_error(preds_train_set,y))


1.371974715954553


In [21]:
# predicting test data

test_data=pd.read_csv("test_questions.csv")
test_data.head()


Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11


In [23]:
import datetime
def split_travel_times(df):
    test_data['travel_date'] = pd.to_datetime(df['travel_date'], infer_datetime_format=True)
    test_data['travel_time'] = pd.to_datetime(df['travel_time'])
    test_data['year'] = df['travel_date'].dt.year
    test_data['month'] = df['travel_date'].dt.month
    test_data['day'] = df['travel_date'].dt.day
    test_data['day_of_week'] = df['travel_date'].dt.dayofweek
    test_data['hour'] = df['travel_time'].dt.hour
    test_data['minute'] = df['travel_time'].dt.minute
    
    return test_data


In [24]:
split_travel_times(test_data).head()


Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,year,month,day,day_of_week,hour,minute
0,247,2018-05-07,2021-07-19 07:06:00,Kisii,Nairobi,Bus,49,2018,5,7,0,7,6
1,256,2018-05-06,2021-07-19 11:08:00,Kisii,Nairobi,shuttle,11,2018,5,6,6,11,8
2,275,2018-05-04,2021-07-19 05:00:00,Kisii,Nairobi,shuttle,11,2018,5,4,4,5,0
3,285,2018-05-04,2021-07-19 09:10:00,Kisii,Nairobi,shuttle,11,2018,5,4,4,9,10
4,286,2018-05-04,2021-07-19 09:20:00,Kisii,Nairobi,shuttle,11,2018,5,4,4,9,20


In [25]:
test_data.drop(['car_type', 'travel_to', 'travel_date', 'travel_time'], axis=1, inplace=True)
col_of_interest=['ride_id', 'travel_from', 'max_capacity', 'year', 'month', 'day', 'day_of_week', 'hour', 'minute']
new_test_data=test_data[col_of_interest]

In [26]:
test_data.head()


Unnamed: 0,ride_id,travel_from,max_capacity,year,month,day,day_of_week,hour,minute
0,247,Kisii,49,2018,5,7,0,7,6
1,256,Kisii,11,2018,5,6,6,11,8
2,275,Kisii,11,2018,5,4,4,5,0
3,285,Kisii,11,2018,5,4,4,9,10
4,286,Kisii,11,2018,5,4,4,9,20


In [27]:
test_data['travel_from'] = test_data['travel_from'].astype('category')
originCategories = test_data['travel_from'].cat.categories
test_data['origin'] = test_data['travel_from'].cat.codes

In [28]:
test_data.drop(['travel_from', 'ride_id'], axis=1, inplace=True)


In [29]:
test_data.head()


Unnamed: 0,max_capacity,year,month,day,day_of_week,hour,minute,origin
0,49,2018,5,7,0,7,6,5
1,11,2018,5,6,6,11,8,5
2,11,2018,5,4,4,5,0,5
3,11,2018,5,4,4,9,10,5
4,11,2018,5,4,4,9,20,5


In [30]:
#predictions 

test_data_predictions = model.predict(test_data)


In [31]:
print(test_data_predictions)


[1.71 1.42 3.01 ... 1.72 8.23 6.37]
