In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy.stats import norm
import datetime
# if using a Jupyter notebook, include:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files the "../input" directory holds.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the CSV into a DataFrame, then preview its first rows.
filepath = "../input/renfe.csv"
data = pd.read_csv(filepath_or_buffer=filepath)
data.head(n=5)

In [None]:
# Report how many rows were loaded.
row_count = len(data)
print("There are", row_count, "rows in this DataFrame")

In [None]:
# Rows without a price cannot serve as a target, so count them and remove them.
empty_prices = data['price'].isna().sum()
print("There are", empty_prices, "rows without prices that we will drop")
data = data.dropna(subset=['price'])
print("After dropping empty prices, we have", len(data), "rows")

In [None]:
# Promote the first column ("Unnamed: 0") to the index and label that index "Id".
first_column = data.columns[0]
print("Set the index to the first column named", first_column, "then rename it to 'Id'")
data = data.set_index(first_column).rename_axis('Id')
data.head()

In [None]:
# insert_date is removed from the features for now; the message explains the reasoning.
# The text is built from adjacent string literals so that no source indentation
# leaks into the printed output (a backslash continuation inside the literal did).
print("The insert_date column does not appear to be useful to me, so I'll drop this column for now. "
      "I'll keep it in another kernel to see if it makes a difference with error.")
data.drop(['insert_date'], axis=1, inplace=True)
data.head()

In [None]:
# Separate the prediction target (price) from the remaining feature columns.
y = data['price']
X = data.drop(columns=['price'])
# Preview both pieces.
for preview in (y, X):
    display(preview.head())

In [None]:
#start_date and end_date have useful date and time information that I want to split into separate columns
#refer to the link below on how this is done
#https://stackoverflow.com/questions/35491274/pandas-split-column-of-lists-into-multiple-columns
#The string-splitting approach below is left commented out; the next cell uses pd.to_datetime instead
#print("Split start_date's values into a list of date and time")
#temp_start_date = X['start_date'].str.split()
#print(temp_start_date.head())
#print("Create a two column DataFrame with the above")
#start_date_dataframe = pd.DataFrame(temp_start_date.values.tolist(), index=temp_start_date.index, columns=['start_date','start_time'])
#start_date_dataframe.head()

In [None]:
#start_date and end_date have useful information that I want to split up and convert to day of the week and hour of the day
# Parse the start_date strings once and derive both features from the parsed
# values, instead of running pd.to_datetime over the whole column twice.
start_datetimes = pd.to_datetime(X['start_date'])
start_date_day = start_datetimes.dt.day_name()   # e.g. "Monday"
start_date_time = start_datetimes.dt.hour        # hour of day, 0-23

display(start_date_day.head())
display(start_date_time.head())
print(type(start_date_day), type(start_date_time))

In [None]:
#Give each derived Series its own name so the columns do not clash when joined onto X
start_date_day.name = 'start_date_day'
start_date_time.name = 'start_date_time'

In [None]:
#Attach the day-of-week and hour Series to X as new columns (aligned on the index)
derived_start_features = [start_date_day, start_date_time]
X = X.join(derived_start_features)
X.head()

In [None]:
#Ride duration is the end_date timestamp minus the start_date timestamp
departure = pd.to_datetime(X['start_date'])
arrival = pd.to_datetime(X['end_date'])
duration = arrival - departure
print(type(duration))
display(duration.head())

In [None]:
# Name the Series so it becomes the 'duration' column when joined to X.
duration = duration.rename('duration')

In [None]:
#Add the ride duration to X as a new column
duration_features = [duration]
X = X.join(duration_features)

In [None]:
# The raw timestamp strings have been turned into features above, so remove them.
X = X.drop(columns=['start_date', 'end_date'])

In [None]:
# Preview the engineered feature table.
X.head(n=5)

In [None]:
#Hold out 20% of the rows for validation; the split is done before One Hot Encoding
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
# Collect the names of the object-dtype (string) columns.
s = X_train.dtypes.eq('object')
object_cols = s.index[s.to_numpy()].tolist()

print("Categorical variables:")
print(object_cols)

In [None]:
#need to OneHotEncode the string columns
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name. Try the current keyword first and fall back to the old
# one so the cell runs on both old and new releases. Either way the encoder
# returns a dense array, which pd.DataFrame can wrap directly.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not a known keyword yet.
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Fit on the training rows only; validation rows are transformed with the same
# categories (unseen categories become all-zero rows via handle_unknown='ignore').
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

In [None]:
# Wrapping the encoder output in pd.DataFrame gave it a fresh 0..n-1 index;
# restore the original row labels so the concat below aligns row by row.
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers while the numeric columns
# keep string names. scikit-learn >= 1.2 refuses to fit on a frame whose column
# labels mix types, so make every label a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

In [None]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=30, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train : features and target used to fit the model.
    X_valid, y_valid : features and target used to score the model.
    n_estimators : number of trees in the forest (default 30, the value
        previously hard-coded here).
    random_state : seed passed to the forest so repeated runs give the same
        score (default 0, the value previously hard-coded here).

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [None]:
# Convert the validation durations from timedeltas to a plain number of seconds.
# total_seconds() covers the whole span; .dt.seconds returns only the
# seconds-within-a-day component and silently drops whole days.
# The dtype check (kind 'm' == timedelta64) makes the cell safe to re-run after
# the column has already been converted.
if OH_X_valid['duration'].dtype.kind == 'm':
    OH_X_valid['duration'] = OH_X_valid['duration'].dt.total_seconds()

In [None]:
# Convert the training durations from timedeltas to a plain number of seconds.
# total_seconds() covers the whole span; .dt.seconds returns only the
# seconds-within-a-day component and silently drops whole days.
# The dtype check (kind 'm' == timedelta64) makes the cell safe to re-run after
# the column has already been converted.
if OH_X_train['duration'].dtype.kind == 'm':
    OH_X_train['duration'] = OH_X_train['duration'].dt.total_seconds()

In [None]:
# Print the label, then train and score the model on the one-hot encoded data.
print("MAE from Approach 3 (One-Hot Encoding):")
one_hot_mae = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
print(one_hot_mae)