In [None]:
# Colab-specific helper module: this cell only works inside a Google Colab runtime.
from google.colab import files

# Interactive step: asks the user to pick file(s) to upload into the runtime.
# The result is kept in `uploaded` but is not read again in the cells visible here;
# presumably this is how "belvedere-waybill.csv" reaches the working directory — TODO confirm.
uploaded = files.upload()

In [None]:
# LOAD CSV DATA
import pandas as pd

# Columns kept for the rest of the notebook; everything else in the CSV is discarded.
selected_columns = [
  "departure_time",
  "pickup_station",
  "dropoff_station",
  "total_count",
  "greetings",
  "day_time",
  "week_day",
]

# Read the CSV from the working directory and narrow it to the selected columns in one step.
waybill_data = pd.read_csv("belvedere-waybill.csv")[selected_columns]
print(waybill_data)

In [None]:
# CHECK IF IT WAS A WEEKEND
def check_weekend(row):
  """Flag whether a row falls on a weekend.

  Args:
    row: Any mapping-like object (e.g. a DataFrame row) with a "week_day" entry.

  Returns:
    1 when "week_day" is exactly "Saturday" or "Sunday", otherwise 0.
    The comparison is case-sensitive; any other value (including a missing
    one) yields 0.
  """
  is_weekend = row["week_day"] in ("Saturday", "Sunday")
  return 1 if is_weekend else 0

# Weekend indicator column: 1 for "Saturday"/"Sunday", 0 for every other value
# (missing values included) — the same rule check_weekend applies to a single row.
# Computed with a vectorised membership test instead of a Python-level call per row.
waybill_data["weekend"] = waybill_data["week_day"].isin(["Saturday", "Sunday"]).astype("int64")

In [None]:
# REMOVE EVERY ROW THAT HAS A MISSING VALUE (Null/NaN/NaT) IN ANY COLUMN
waybill_data = waybill_data.dropna(axis="index", how="any")

In [None]:
# KEEP ONLY THE ROWS WHOSE PICKUP AND DROP-OFF STATIONS DIFFER
# Element-wise "not equal" between the two station columns gives a boolean row mask.
stations_differ = waybill_data["pickup_station"].ne(waybill_data["dropoff_station"])
waybill_data = waybill_data.loc[stations_differ]
print(waybill_data)

In [None]:
# TRANSFORM DATE TO NUMERIC
import datetime as dt

# Parse the departure_time values into datetimes, then overwrite the column with
# each value's POSIX timestamp so the column becomes numeric.
departure_moments = pd.to_datetime(waybill_data["departure_time"])
waybill_data["departure_time"] = departure_moments.apply(lambda moment: moment.timestamp())

# waybill_data["arrival_time"] = pd.to_datetime(waybill_data["arrival_time"])
# waybill_data["arrival_time"] = waybill_data["arrival_time"].apply(lambda x: x.timestamp())

# Quick visual check: the column index, then the last 10 rows of a few columns.
preview_columns = ["departure_time", "day_time", "pickup_station", "dropoff_station"]
print("COLUMNS:", waybill_data.columns)
print("DATA FRAME:\n", waybill_data[preview_columns].tail(10))

In [None]:
# TRANSFORM TIME TO NUMERIC (MINUTES)

def convert_to_minutes(value):
  """Convert an "hours:minutes" string into a total number of minutes.

  Args:
    value: Text with exactly one ":" separating two integer fields, e.g. "08:30".

  Returns:
    hours * 60 + minutes as an int.

  Raises:
    ValueError: If the text does not split into exactly two parts on ":",
      or if either part is not a valid integer.
  """
  hour_text, minute_text = value.split(":")
  return 60 * int(hour_text) + int(minute_text)

# New numeric column: each "day_time" string converted to hours * 60 + minutes.
waybill_data["day_time_minutes"] = waybill_data["day_time"].map(convert_to_minutes)

# Quick visual check: the column index, then the last 10 rows of a few columns.
preview_columns = ["day_time_minutes", "pickup_station", "dropoff_station"]
print("COLUMNS:", waybill_data.columns)
print("DATA FRAME:\n", waybill_data[preview_columns].tail(10))

In [None]:
# TRANSFORM STRING TO NUMERIC - ONE HOT ENCODING
# Source column -> prefix of the indicator columns generated for it.
# Keeping both in one mapping means a column can never be paired with the wrong prefix.
dummy_prefixes = {
  "greetings": "greetings",
  "week_day": "day",
  "pickup_station": "pickup",
  "dropoff_station": "dropoff",
}
waybill_data = pd.get_dummies(
  waybill_data,
  columns=list(dummy_prefixes),
  prefix=dummy_prefixes,
)

print(waybill_data.columns)
print(waybill_data.head())

In [None]:
# DEFINE MODEL & GET ACCURACY [LINEAR REGRESSION]
# The reported score is R^2 (coefficient of determination) on the held-out split.
import sklearn
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score
# Import the splitter explicitly: a bare `import sklearn` does not by itself load
# the `sklearn.model_selection` submodule, so attribute access through it is fragile.
from sklearn.model_selection import train_test_split

# Target column, plus the two columns that are left out of the feature matrix.
predict = "total_count"
departure_time = "departure_time"
day_time = "day_time"

# Features: every remaining column. `columns=` already selects the axis, so no `axis`
# argument is needed. The explicit float dtype keeps the matrix numeric even when the
# one-hot columns are boolean (which they may be, depending on the pandas version).
x = np.array(waybill_data.drop(columns=[predict, day_time, departure_time]), dtype=float)
y = np.array(waybill_data[predict])

# 80/20 train/test split; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training part and score it on the held-out part.
waybill_model = linear_model.LinearRegression()
waybill_model.fit(x_train, y_train)
predictions = waybill_model.predict(x_test)

r_square = r2_score(y_test, predictions)

print("R SQUARE", r_square)

R SQUARE 0.844337713776504


In [None]:
# SAVING MODEL
import pickle

# Serialize the fitted model to a file in the working directory.
with open("waybillmodel.pickle", "wb") as f:
  pickle.dump(waybill_model, f)

# Reading saved model
# The context manager closes the read handle as soon as loading finishes (previously it
# was left open). NOTE: pickle.load can execute arbitrary code, so only load pickle
# files that come from a trusted source, such as the one written just above.
with open("waybillmodel.pickle", "rb") as pickle_in:
  waybill_model = pickle.load(pickle_in)