In [118]:
import numpy as np
import pandas as pd
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [119]:
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt

In [120]:
# The full training file consists of 55M rows; only the first 400K are read for now.
train = pd.read_csv(
    "./iiitb2019nyctaxifare/train.csv/train.csv",
    nrows=400000,
)
# The test file is read in full.
test = pd.read_csv(
    "./iiitb2019nyctaxifare/test.csv/test.csv",
)

In [121]:
# Only the last expression of a cell is rendered, so the head() preview is
# discarded here; the output below comes from info(), which prints directly.
train.head()
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 8 columns):
key                  400000 non-null object
fare_amount          400000 non-null float64
pickup_datetime      400000 non-null object
pickup_longitude     400000 non-null float64
pickup_latitude      400000 non-null float64
dropoff_longitude    399997 non-null float64
dropoff_latitude     399997 non-null float64
passenger_count      400000 non-null int64
dtypes: float64(5), int64(1), object(2)
memory usage: 24.4+ MB


In [122]:
# Only the last expression of a cell is rendered, so the head() preview is
# discarded here; the output below comes from info(), which prints directly.
test.head()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 7 columns):
key                  object
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(4), int64(1), object(2)
memory usage: 592.0+ MB


In [60]:
# change the format of datetime
def change_datetime_format(data):
    """Parse the ``pickup_datetime`` column into pandas datetimes, in place.

    The column is expected to hold strings such as
    ``'2009-01-01 00:01:04 UTC'``; the trailing ``UTC`` is matched as literal
    text by the format string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``pickup_datetime`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``pickup_datetime`` converted.
    """
    raw_stamps = data['pickup_datetime']
    parsed_stamps = pd.to_datetime(raw_stamps, format='%Y-%m-%d %H:%M:%S UTC')
    data['pickup_datetime'] = parsed_stamps
    return data

In [61]:
# split the datetime
def split_datetime(data):
    """Derive calendar features from the ``pickup_datetime`` column, in place.

    Adds the columns ``pickup_date``, ``pickup_day``, ``pickup_hour``,
    ``pickup_day_of_week`` (day name), ``pickup_month`` and ``pickup_year``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``pickup_datetime`` column already has a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the new columns added.
    """
    # Use the vectorised .dt accessor instead of one Python-level lambda call
    # per row and per feature.
    stamps = data['pickup_datetime'].dt

    data['pickup_date'] = stamps.date
    data['pickup_day'] = stamps.day
    data['pickup_hour'] = stamps.hour
    # day_name() yields English names regardless of the process locale, which
    # is what the day-name -> number mapping in encodeDays is keyed on.
    data['pickup_day_of_week'] = stamps.day_name()
    data['pickup_month'] = stamps.month
    data['pickup_year'] = stamps.year
    return data

In [62]:
# removing outliers in latitude and longitude
def remove_outliers(data, lat_bounds=(40.5, 41.8), lon_bounds=(-74.5, -72.8)):
    """Drop trips whose pickup or dropoff lies outside a bounding box.

    A row is removed when any of its four coordinates is strictly below the
    lower bound or strictly above the upper bound. Rows with missing
    coordinates are kept (comparisons against NaN are False).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pickup_latitude``, ``dropoff_latitude``,
        ``pickup_longitude`` and ``dropoff_longitude`` columns.
    lat_bounds : tuple of float, optional
        ``(min, max)`` latitude that is still accepted.
    lon_bounds : tuple of float, optional
        ``(min, max)`` longitude that is still accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the in-bounds rows; the original index
        labels are preserved and ``data`` itself is not modified.
    """
    lat_min, lat_max = lat_bounds
    lon_min, lon_max = lon_bounds

    lats = data[['pickup_latitude', 'dropoff_latitude']]
    lons = data[['pickup_longitude', 'dropoff_longitude']]

    lat_outside = ((lats < lat_min) | (lats > lat_max)).any(axis=1)
    lon_outside = ((lons < lon_min) | (lons > lon_max)).any(axis=1)

    # Select with a boolean mask rather than collecting enumerate() positions
    # and passing them to drop(): drop() works on index *labels*, so positions
    # are only correct when the index happens to be 0..n-1.
    return data.loc[~(lat_outside | lon_outside)].copy()

In [63]:
def remove_null(data):
    """Return a new frame without the rows of ``data`` that contain a missing value."""
    return data.dropna()

In [64]:
def remove_neg_pass_count(data):
    """Return ``data`` without the rows whose ``passenger_count`` is <= 0.

    Despite the name, rows with a count of exactly zero are removed as well.
    Rows are removed by index label; ``data`` itself is not modified.
    """
    is_invalid = data['passenger_count'] <= 0
    invalid_labels = data.index[is_invalid].tolist()
    return data.drop(invalid_labels)

In [65]:
def remove_neg_fare_amount(data):
    """Return ``data`` without the rows whose ``fare_amount`` is <= 0.

    Despite the name, rows with a fare of exactly zero are removed as well.
    Rows are removed by index label; ``data`` itself is not modified.
    """
    is_invalid = data['fare_amount'] <= 0
    invalid_labels = data.index[is_invalid].tolist()
    return data.drop(invalid_labels)

In [66]:
#calculate trip distance in miles
def distance(lat1, lat2, lon1,lon2):
    """Haversine (great-circle) distance in miles between two points.

    Note the argument order: both latitudes first, then both longitudes.
    All inputs are in decimal degrees. Only NumPy ufuncs are used, so the
    arguments may be scalars or equally shaped arrays / Series.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    km_to_miles = 0.6213712
    earth_diameter_km = 12742

    lat_term = 0.5 - np.cos((lat2 - lat1) * deg_to_rad)/2
    lon_term = (
        np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)
        * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2
    )
    haversine = lat_term + lon_term
    return km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(haversine))

In [67]:
def calc_distance(data):
    """Add a ``trip_distance`` column computed by ``distance``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pickup_latitude``, ``dropoff_latitude``,
        ``pickup_longitude`` and ``dropoff_longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``trip_distance`` added.
    """
    # `distance` is built from NumPy ufuncs only, so it accepts whole columns;
    # this avoids a Python-level call per row via apply(axis=1).
    data['trip_distance'] = distance(
        data['pickup_latitude'],
        data['dropoff_latitude'],
        data['pickup_longitude'],
        data['dropoff_longitude'],
    )
    return data

In [68]:
# Let us encode day of the week to numbers
def encodeDays(day_of_week):
    """Map an English day name to its number, with Sunday=0 through Saturday=6.

    Raises
    ------
    KeyError
        If ``day_of_week`` is not one of the seven capitalised day names.
    """
    ordered_days = (
        'Sunday', 'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday',
    )
    day_number = {name: number for number, name in enumerate(ordered_days)}
    return day_number[day_of_week]

In [69]:
def pickupday_encode(data):
    """Replace the day names in ``pickup_day_of_week`` with their numbers, in place.

    Each value is converted with ``encodeDays``; the same frame object is
    returned.
    """
    day_names = data['pickup_day_of_week']
    data['pickup_day_of_week'] = day_names.apply(encodeDays)
    return data

In [76]:
def drop_columns(data):
    """Remove the ``pickup_datetime`` and ``pickup_date`` columns, in place.

    The caller's frame is modified and the same object is returned.
    """
    obsolete = ['pickup_datetime', 'pickup_date']
    data.drop(labels=obsolete, axis=1, inplace=True)
    return data

In [77]:
# clean the data
def cleandata(data):
    """Run the cleaning and feature-engineering steps on a trip frame.

    The steps are applied in order, each receiving the previous step's result:
    parse the pickup timestamp, split it into calendar features, drop
    out-of-bounds coordinates, drop rows with missing values, drop rows with
    a non-positive passenger count, add the trip distance, encode the day of
    week as a number and finally drop the raw timestamp/date columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    steps = (
        change_datetime_format,
        split_datetime,
        remove_outliers,
        remove_null,
        remove_neg_pass_count,
        # remove_neg_fare_amount is deliberately not applied here.
        # NOTE(review): presumably because the test frame has no fare_amount
        # column - confirm.
        calc_distance,
        pickupday_encode,
        drop_columns,
    )
    for step in steps:
        data = step(data)
    return data

In [72]:
# Overwrites `train` with its cleaned version. Not re-runnable as-is: cleandata
# drops `pickup_datetime`, which its first step reads, so reload `train` before
# running this cell again.
train = cleandata(train)

In [73]:
# Preview the first rows of the cleaned training frame.
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,pickup_hour,pickup_day_of_week,pickup_month,pickup_year,trip_distance
0,4.5,-73.98691,40.739538,-73.991381,40.745614,2,20,23,3,10,2010,0.480646
1,4.1,-73.961572,40.760283,-73.957438,40.769387,5,30,10,3,12,2009,0.665188
2,6.1,-73.979437,40.746517,-73.984195,40.732117,1,20,11,5,7,2012,1.02565
3,4.9,-73.964097,40.792508,-73.976422,40.785767,1,31,11,2,5,2011,0.795377
4,6.5,-74.003943,40.72567,-73.988915,40.74837,1,25,17,2,5,2010,1.754687


In [74]:
# Column dtypes and non-null counts after cleaning.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390181 entries, 0 to 399999
Data columns (total 12 columns):
fare_amount           390181 non-null float64
pickup_longitude      390181 non-null float64
pickup_latitude       390181 non-null float64
dropoff_longitude     390181 non-null float64
dropoff_latitude      390181 non-null float64
passenger_count       390181 non-null int64
pickup_day            390181 non-null int64
pickup_hour           390181 non-null int64
pickup_day_of_week    390181 non-null int64
pickup_month          390181 non-null int64
pickup_year           390181 non-null int64
trip_distance         390181 non-null float64
dtypes: float64(6), int64(6)
memory usage: 38.7 MB


In [78]:
# Overwrites `test` with its cleaned version (not re-runnable without reloading).
# NOTE(review): cleandata also removes rows (out-of-bounds coordinates, nulls,
# non-positive passenger counts), so the cleaned test frame has fewer keys than
# the file - confirm this is acceptable for the submission.
test = cleandata(test)

In [123]:
# Show the first 130 rows of `test`.
# NOTE(review): the execution count and the columns in the saved output indicate
# this ran on the freshly loaded frame, not on the result of cleandata above.
test.head(130)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-01-01 00:01:04.0000003,2009-01-01 00:01:04 UTC,-73.972484,40.742743,-73.918937,40.764496,1
1,2009-01-01 00:01:26.0000001,2009-01-01 00:01:26 UTC,-73.985850,40.722826,-73.986301,40.739347,1
2,2009-01-01 00:04:42.0000001,2009-01-01 00:04:42 UTC,-73.988917,40.740142,-73.982769,40.777291,1
3,2009-01-01 00:04:54.0000001,2009-01-01 00:04:54 UTC,-73.977163,40.764490,-73.914474,40.771575,1
4,2009-01-01 00:04:59.0000004,2009-01-01 00:04:59 UTC,-73.948849,40.778003,-73.977678,40.748692,2
5,2009-01-01 00:05:12.0000001,2009-01-01 00:05:12 UTC,-73.981879,40.752958,-73.986693,40.702947,2
6,2009-01-01 00:06:17.0000002,2009-01-01 00:06:17 UTC,-73.972463,40.749348,-73.967542,40.756304,1
7,2009-01-01 00:07:23.0000004,2009-01-01 00:07:23 UTC,-73.872793,40.774211,-73.964306,40.712804,1
8,2009-01-01 00:08:59.0000002,2009-01-01 00:08:59 UTC,-73.989072,40.735167,-73.970879,40.751365,2
9,2009-01-01 00:09:02.0000001,2009-01-01 00:09:02 UTC,-73.976336,40.749605,-73.958337,40.760519,1


In [80]:
# Column dtypes of the cleaned test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10811259 entries, 0 to 11084771
Data columns (total 12 columns):
key                   object
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
passenger_count       int64
pickup_day            int64
pickup_hour           int64
pickup_day_of_week    int64
pickup_month          int64
pickup_year           int64
trip_distance         float64
dtypes: float64(5), int64(6), object(1)
memory usage: 1.0+ GB


In [81]:
# Report (rows, columns) of both frames after cleaning.
print("Shape of Training Data after cleaning ",train.shape)
print("Shape of Testing Data after cleaning", test.shape)

Shape of Training Data after cleaning  (390181, 12)
Shape of Testing Data after cleaning (10811259, 12)


In [82]:
def processDataForModelling(data,target,is_train=True,split=0.25):
    """One-hot encode ``data`` and, for training data, split it for validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode with ``pd.get_dummies``.
    target : str
        Name of the column to predict; only used when ``is_train`` is True.
    is_train : bool, optional
        True to separate features/target and split them; otherwise the
        encoded frame is returned as is.
    split : float, optional
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)`` when ``is_train`` is True,
        otherwise the encoded frame.
    """
    # One hot Encoding
    encoded = pd.get_dummies(data)

    # The flag is compared with == True (not by truthiness), exactly as before.
    if not (is_train == True):
        print ("Shape of Test Data",encoded.shape)
        return encoded

    features = encoded.drop(columns=[target])
    labels = encoded[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=split, random_state=123
    )

    print("Shape of Training Features",X_train.shape)
    print("Shape of Validation Features ",X_test.shape)

    return X_train, X_test, y_train, y_test

In [83]:
# Encode the cleaned training frame and hold out 20% of it for validation
# (split=0.2 overrides the function's 0.25 default).
X_train, X_test, y_train, y_test=processDataForModelling(train,'fare_amount',is_train=True,split=0.2)

Shape of Training Features (312144, 11)
Shape of Validation Features  (78037, 11)


In [126]:
# Baseline model: predict the mean training fare (rounded to 2 decimals)
# for every validation row.
avg_fare = round(np.mean(y_train), 2)
baseline_pred = np.full(y_test.shape[0], avg_fare)

# Scoring of the baseline is currently disabled:
#baseline_rmse=np.sqrt(mean_squared_error(baseline_pred, y_test))
#print("Basline RMSE of Validation data :",baseline_rmse)

In [53]:
# Linear Regression Model
# Fit on the training split and score on the validation split; predictions are
# rounded to 2 decimals before the RMSE is computed.
lm = LinearRegression()
lm.fit(X_train, y_train)
y_pred = np.round(lm.predict(X_test), 2)
# scikit-learn's signature is mean_squared_error(y_true, y_pred); pass the
# ground truth first so the call matches the documented contract.
lm_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE for Linear Regression is ",lm_rmse)

RMSE for Linear Regression is  5.410930065232967


In [54]:
# Encode the test frame; with is_train=False the target argument is not used.
# NOTE(review): pd.get_dummies one-hot encodes string columns - confirm that
# `test` no longer carries the string `key` column when this runs.
test_data=processDataForModelling(test,'fare_amount',is_train=False)

Shape of Test Data (10811259, 11)


In [125]:
# NOTE(review): the saved output shows the raw 7-column frame, i.e. `test` had
# been reloaded from disk before this cell ran.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 7 columns):
key                  object
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(4), int64(1), object(2)
memory usage: 592.0+ MB


In [128]:
# NOTE(review): 11.33 is hard-coded - presumably the rounded mean training fare
# from the baseline cell; confirm before reusing with different data.
avg_fare = 11.33
# One constant prediction per row of `test`.
baseline_pred = np.full(test.shape[0], avg_fare)

In [129]:
# Display the constant prediction array.
baseline_pred


array([11.33, 11.33, 11.33, ..., 11.33, 11.33, 11.33])

In [130]:
# Wrap the predictions in a one-column frame; the column is labelled 0.
pred = pd.DataFrame(baseline_pred)

In [131]:
# NOTE(review): the positional 0 lands on the first parameter of
# DataFrame.info (`verbose` in pandas' documented signature) - probably meant
# to be a plain pred.info(); confirm.
pred.info(0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 1 columns):
0    float64
dtypes: float64(1)
memory usage: 84.6 MB


In [132]:
# Pair every test key with its prediction by position. `pred` has a fresh
# 0..n-1 index, whereas `test` keeps its original labels once rows have been
# dropped; concat(axis=1) aligns on labels, so the key index is reset first to
# avoid mismatched rows and NaNs.
datasets = pd.concat([test['key'].reset_index(drop=True), pred[0]], axis=1)

In [133]:
# Check row count and dtypes of the combined key/prediction frame.
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 2 columns):
key    object
0      float64
dtypes: float64(1), object(1)
memory usage: 169.1+ MB


In [134]:
# Rename the prediction column (labelled 0 so far) to `fare_amount`.
datasets.columns = ['key','fare_amount']

In [111]:
# NOTE(review): this cell's execution count is older than its neighbours', so
# it was not part of the last run. Dropping rows here would silently remove
# keys from the submission - confirm whether it is still needed.
datasets = datasets.dropna()

In [135]:
# Count missing values per column.
datasets.isnull().sum(axis = 0)

key            0
fare_amount    0
dtype: int64

In [136]:
# Final check of the submission frame before writing it out.
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 2 columns):
key            object
fare_amount    float64
dtypes: float64(1), object(1)
memory usage: 169.1+ MB


In [137]:
# Write the submission file; index=False keeps the row index out of the CSV.
datasets.to_csv('sub1.csv',index=False)

In [138]:
# Read the written file back to inspect what was saved.
d1 = pd.read_csv('sub1.csv')

In [139]:
# (rows, columns) of the saved submission.
d1.shape

(11084772, 2)

In [117]:
# Show the first 140 rows of the saved submission.
d1.head(140)

Unnamed: 0,key,fare_amount
0,2009-01-01 00:01:04.0000003,11.33
1,2009-01-01 00:01:26.0000001,11.33
2,2009-01-01 00:04:42.0000001,11.33
3,2009-01-01 00:04:54.0000001,11.33
4,2009-01-01 00:04:59.0000004,11.33
5,2009-01-01 00:05:12.0000001,11.33
6,2009-01-01 00:06:17.0000002,11.33
7,2009-01-01 00:07:23.0000004,11.33
8,2009-01-01 00:08:59.0000002,11.33
9,2009-01-01 00:09:02.0000001,11.33
