In [247]:
import pandas as pd
import numpy as np
import datetime as dt

In [254]:
# Load the training and test sets into dataframes
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Public holidays in Chicago, kept as plain 'YYYY-MM-DD' strings
# (sources: https://publicholidays.us/illinois/2018-dates/ and https://publicholidays.us/illinois/2019-dates/)
holidays = [
    # 2018
    '2018-10-08', '2018-11-06', '2018-11-11', '2018-11-12', '2018-11-22', '2018-11-23', '2018-12-25',
    # 2019
    '2019-01-01', '2019-01-21', '2019-02-12', '2019-02-18', '2019-05-27', '2019-07-04', '2019-09-02',
]

# Parse the trip start timestamps into datetime values in both frames
for frame in (df, df_test):
    frame['trip_start_timestamp'] = pd.to_datetime(frame['trip_start_timestamp'])

# Pre-clean training data: discard rows with any missing value, then keep only
# trips whose distance, duration and total lie strictly inside the accepted ranges
df = df.dropna()
plausible_trip = (
    df.trip_miles.gt(0) & df.trip_miles.lt(100)
    & df.trip_seconds.gt(0) & df.trip_seconds.lt(7200)
    & df.trip_total.gt(0) & df.trip_total.lt(100)
)
df = df[plausible_trip]

# Pre-clean test data: missing values are imputed instead of dropping rows.
# Numeric gaps take the mean of the corresponding training column; a missing
# payment_type becomes 0.
# The filled frame is assigned back to df_test on purpose: calling
# `df_test.<col>.fillna(..., inplace=True)` operates on a Series obtained by
# attribute access, which is deprecated and, under pandas Copy-on-Write, leaves
# df_test unchanged.
df_test = df_test.fillna({
    'fare': df.fare.mean(),
    'trip_seconds': df.trip_seconds.mean(),
    'trip_miles': df.trip_miles.mean(),
    'trip_total': df.trip_total.mean(),
    'payment_type': 0,
})

# Add weekday to new column and determine workdays and rush hours.
# `holidays` may hold date strings, while `.dt.date` yields datetime.date objects;
# a string never compares equal to a date, so the holidays are converted to
# date objects first. Without this no trip is ever recognised as a holiday trip.
holiday_dates = set(pd.to_datetime(holidays).date)

# A workday is Monday-Friday (dayofweek 0-4) that is not a public holiday
df['trip_start_dow'] = df.trip_start_timestamp.dt.dayofweek
df['is_workday'] = np.where(
    (df.trip_start_dow < 5) & ~df.trip_start_timestamp.dt.date.isin(holiday_dates), 1, 0)
df_test['trip_start_dow'] = df_test.trip_start_timestamp.dt.dayofweek
df_test['is_workday'] = np.where(
    (df_test.trip_start_dow < 5) & ~df_test.trip_start_timestamp.dt.date.isin(holiday_dates), 1, 0)

# Rush hour (training frame only): a workday trip starting within
# 07:00-09:00 or 16:00-18:00, both windows inclusive at either end
start_time = df.trip_start_timestamp.dt.time
morning_rush = (start_time >= dt.time(7, 0, 0)) & (start_time <= dt.time(9, 0, 0))
evening_rush = (start_time >= dt.time(16, 0, 0)) & (start_time <= dt.time(18, 0, 0))
df['is_rushhour'] = np.where((df.is_workday == 1) & (morning_rush | evening_rush), 1, 0)

# Reduce the training timestamp to its calendar date; this must come after the
# rush-hour flag above, which needs the time of day
df['trip_start_timestamp'] = df.trip_start_timestamp.dt.date

# Create additional columns

# Coefficients of the fare schedule used as the reference price
# (source: https://www.chicago.gov/city/en/depts/bacp/supp_info/2012_passenger_information.html)
BASE_FARE = 3.25
FARE_PER_MILE = 2.25
TIME_UNIT_SECONDS = 36
FARE_PER_TIME_UNIT = 0.20


def table_fare_for(frame):
    """Return the schedule-based reference fare for every trip in `frame`.

    Reads the `trip_miles` and `trip_seconds` columns and returns a Series
    aligned with `frame`'s index.
    """
    return (BASE_FARE + FARE_PER_MILE * frame.trip_miles
            + frame.trip_seconds / TIME_UNIT_SECONDS * FARE_PER_TIME_UNIT)


# Training frame: price per mile, restricted to values strictly between 0 and 10,
# then a flag marking fares above the reference fare
df['price_per_mile'] = df.fare / df.trip_miles
df = df[(df.price_per_mile > 0) & (df.price_per_mile < 10)]
df['overpriced'] = (df.fare > table_fare_for(df)).astype(int)

# Test frame: no rows are removed; trips with zero miles get a price per mile of 0
has_distance = df_test.trip_miles != 0
df_test['price_per_mile'] = np.where(has_distance, df_test.fare / df_test.trip_miles, 0)
df_test['overpriced'] = (df_test.fare > table_fare_for(df_test)).astype(int)

In [255]:
# Columns removed from both frames before modelling
columns = [
    'id', 'trip_start_timestamp', 'trip_end_timestamp', 'tolls', 'trip_start_dow',
    'fare', 'extras', 'tips', 'company', 'taxi_id',
    'pickup_community_area', 'dropoff_community_area',
    'pickup_centroid_longitude', 'pickup_centroid_latitude',
    'dropoff_centroid_longitude', 'dropoff_centroid_latitude',
]
df = df.drop(columns=columns)
df_test = df_test.drop(columns=columns)

# Encode payment type as a single binary flag: 1 for 'Cash', 0 for anything else
df['payment_type'] = (df.payment_type == 'Cash').astype(int)
df_test['payment_type'] = (df_test.payment_type == 'Cash').astype(int)

# Normalize data (min-max feature scaling).
# The minimum and range are taken from the training frame only and reused for
# the test frame, so a given raw value maps to the same scaled value in both.
# Scaling the test frame with its own min/max would put the two frames on
# different scales. Test values outside the training range may therefore fall
# outside [0, 1].
train_min = df.min()
# A constant column has a range of 0; divide by 1 instead so it scales to 0
# rather than to NaN (0 / 0)
train_range = (df.max() - train_min).replace(0, 1)
df = (df - train_min) / train_range
# Select the statistics by df_test's own columns so that columns present only
# in the training frame do not appear as all-NaN columns in df_test
df_test = (df_test - train_min[df_test.columns]) / train_range[df_test.columns]

In [259]:
# Determine training and test set: every column except the target is a feature
target = 'is_rushhour'
X_train = df.drop(columns=target).to_numpy()
y_train = df[[target]].to_numpy()  # stays 2-D with a single column
X_test = df_test.to_numpy()

# PCA: fit the projection on the training features, then apply the same
# projection to the test features
# NOTE(review): PCA() is created with default arguments, i.e. no component count
# is requested here - confirm whether an actual reduction was intended
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [260]:
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Train on the labels flattened to 1-D, then classify the test rows
classifier = GaussianNB().fit(X_train, y_train.reshape(-1))
y_pred = classifier.predict(X_test)

In [278]:
# Build the output table: an id column followed by the integer predictions
result = pd.DataFrame({'prediction': y_pred.astype(int)})
# NOTE(review): ids are generated as a consecutive run starting at 200001 in row
# order rather than taken from the test file - confirm this matches the test ids
result.insert(0, 'id', range(200001, 200001 + len(result)))
# index=False: rows are already identified by the explicit id column; the default
# would write the dataframe index as an extra unnamed first column
result.to_csv('prediction.csv', index=False)