In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt

In [4]:
# train data consists of 55M rows, so only the first 400K rows are read initially.
# Both paths are relative to the notebook's working directory.
train = pd.read_csv("./TrainTest/train.csv", nrows = 400000)
# The test file is read in full (no nrows limit).
test = pd.read_csv("./TrainTest/test.csv")

In [29]:
# Summarise column dtypes and non-null counts of the raw training sample.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 8 columns):
key                  400000 non-null object
fare_amount          400000 non-null float64
pickup_datetime      400000 non-null object
pickup_longitude     400000 non-null float64
pickup_latitude      400000 non-null float64
dropoff_longitude    399997 non-null float64
dropoff_latitude     399997 non-null float64
passenger_count      400000 non-null int64
dtypes: float64(5), int64(1), object(2)
memory usage: 24.4+ MB


In [30]:
# NOTE: head() is not the last expression of the cell, so its result is not rendered.
test.head()
# Summarise column dtypes and memory usage of the raw test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 7 columns):
key                  object
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(4), int64(1), object(2)
memory usage: 592.0+ MB


In [31]:
# parse the pickup timestamp strings into datetime values
def change_datetime_format(data):
    """Convert the ``pickup_datetime`` column to datetimes, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``pickup_datetime`` column holding strings shaped like
        ``'2010-10-20 23:15:00 UTC'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``pickup_datetime`` replaced by the
        parsed values.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S UTC'
    parsed = pd.to_datetime(data['pickup_datetime'], format=timestamp_format)
    data['pickup_datetime'] = parsed
    return data

In [32]:
# split the datetime
def split_datetime(data):
    """Add calendar components of ``pickup_datetime`` as new columns, in place.

    Uses the vectorised ``.dt`` accessor instead of one Python lambda call per
    row, and ``dt.day_name()`` instead of ``calendar.day_name`` so the weekday
    names are English regardless of the process locale (``encodeDays`` looks
    them up by their English spelling).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``pickup_datetime`` column already has a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with these columns added: ``pickup_date``
        (``datetime.date`` objects), ``pickup_day``, ``pickup_hour``,
        ``pickup_day_of_week`` (English weekday name), ``pickup_month`` and
        ``pickup_year``.
    """
    pickup = data['pickup_datetime'].dt
    data['pickup_date'] = pickup.date
    data['pickup_day'] = pickup.day
    data['pickup_hour'] = pickup.hour
    data['pickup_day_of_week'] = pickup.day_name()
    data['pickup_month'] = pickup.month
    data['pickup_year'] = pickup.year
    return data

In [33]:
# removing outliers in latitude and longitude
def remove_outliers(data):
    """Return a copy of ``data`` without rows outside a fixed lat/lon bounding box.

    A row is removed when its pickup or dropoff latitude is below 40.5 or
    above 41.8, or its pickup or dropoff longitude is below -74.5 or above
    -72.8 (presumably the New York City area - confirm).

    The filter is a boolean mask, so it is correct for any index. The earlier
    implementation collected *positions* from ``enumerate`` and passed them to
    ``DataFrame.drop`` as *labels*, which only matched on a default
    ``RangeIndex``.

    Rows with a missing coordinate are kept: comparisons against NaN are
    False, so such rows are never flagged as out of bounds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pickup_latitude``, ``dropoff_latitude``,
        ``pickup_longitude`` and ``dropoff_longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the in-bounds rows; ``data`` itself is not modified.
    """
    lat_min, lat_max = 40.5, 41.8
    lon_min, lon_max = -74.5, -72.8

    pickup_lat = data['pickup_latitude']
    dropoff_lat = data['dropoff_latitude']
    pickup_lon = data['pickup_longitude']
    dropoff_lon = data['dropoff_longitude']

    # Written with explicit < / > (not `between`) so NaN never counts as out of bounds.
    out_of_bounds = (
        (pickup_lat < lat_min) | (pickup_lat > lat_max)
        | (dropoff_lat < lat_min) | (dropoff_lat > lat_max)
        | (pickup_lon < lon_min) | (pickup_lon > lon_max)
        | (dropoff_lon < lon_min) | (dropoff_lon > lon_max)
    )

    # Explicit copy: later pipeline steps assign new columns to the result.
    return data[~out_of_bounds].copy()

In [34]:
def remove_null(data):
    """Return a new frame without any row that contains a missing value.

    ``data`` itself is left unchanged.
    """
    return data.dropna(axis=0, how='any')

In [35]:
def remove_neg_pass_count(data):
    """Return a new frame without rows whose ``passenger_count`` is zero or negative.

    Rows are removed by index label, and ``data`` itself is left unchanged.
    """
    non_positive = data['passenger_count'] <= 0
    labels_to_drop = data.index[non_positive.values]
    return data.drop(index=labels_to_drop)

In [36]:
def remove_neg_fare_amount(data):
    """Return ``data`` without rows whose ``fare_amount`` is zero or negative.

    Frames that have no ``fare_amount`` column (for example an unlabeled test
    set) are returned unchanged instead of raising ``KeyError``, so the same
    cleaning pipeline can be applied to labeled and unlabeled data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain a ``fare_amount`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the offending rows dropped by index label, or the
        original ``data`` object when there is no ``fare_amount`` column.
    """
    if 'fare_amount' not in data.columns:
        # Nothing to filter on: there is no target column in this frame.
        return data
    data = data.drop(data[data['fare_amount'] <= 0].index.tolist())
    return data

In [37]:
#calculate trip distance in miles
def distance(lat1, lat2, lon1,lon2):
    """Great-circle (haversine) distance in miles between two points in degrees.

    Note the argument order: both latitudes first, then both longitudes.
    Inputs may be scalars or array-likes (numpy arrays, pandas Series); the
    arithmetic is element-wise.

    The haversine term is computed in its ``sin**2`` form rather than as
    ``(1 - cos) / 2``, which cancels badly for short trips, and is clipped to
    [0, 1] so rounding error can never push ``sqrt``/``arcsin`` out of their
    domain and produce NaN.

    Parameters
    ----------
    lat1, lat2 : float or array-like
        Latitudes of the first and second point, in degrees.
    lon1, lon2 : float or array-like
        Longitudes of the first and second point, in degrees.

    Returns
    -------
    float or array-like
        Distance in miles (NaN where any input is NaN).
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    earth_diameter_km = 12742
    km_to_miles = 0.6213712

    half_dlat = (lat2 - lat1) * deg_to_rad / 2
    half_dlon = (lon2 - lon1) * deg_to_rad / 2
    a = (np.sin(half_dlat) ** 2
         + np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad) * np.sin(half_dlon) ** 2)
    # Guard against values a hair outside [0, 1] caused by floating-point rounding.
    a = np.clip(a, 0.0, 1.0)
    return km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(a))

In [38]:
def calc_distance(data):
    """Add a ``trip_distance`` column (miles) computed from pickup/dropoff coordinates.

    ``distance`` is built from element-wise numpy operations, so it is called
    once on whole columns instead of once per row through
    ``DataFrame.apply(axis=1)``. This avoids a Python-level loop over every
    row and also works on an empty frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pickup_latitude``, ``dropoff_latitude``,
        ``pickup_longitude`` and ``dropoff_longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with ``trip_distance`` added.
    """
    data['trip_distance'] = distance(
        data['pickup_latitude'],
        data['dropoff_latitude'],
        data['pickup_longitude'],
        data['dropoff_longitude'],
    )
    return data

In [39]:
# Map an English weekday name to its number, counting from Sunday = 0
def encodeDays(day_of_week):
    """Return the numeric code of ``day_of_week`` (``'Sunday'`` -> 0 ... ``'Saturday'`` -> 6).

    Raises ``KeyError`` for any value that is not one of the seven
    capitalised English weekday names.
    """
    ordered_days = ('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
    codes = {name: position for position, name in enumerate(ordered_days)}
    return codes[day_of_week]

In [40]:
def pickupday_encode(data):
    """Replace the weekday names in ``pickup_day_of_week`` with their numeric codes.

    Each value is passed through ``encodeDays``. The column is overwritten in
    place and the same ``data`` object is returned.
    """
    day_names = data['pickup_day_of_week']
    data['pickup_day_of_week'] = day_names.apply(encodeDays)
    return data

In [41]:
def drop_columns(data):
    """Remove the ``key``, ``pickup_datetime`` and ``pickup_date`` columns in place.

    Returns the same ``data`` object. Raises ``KeyError`` if any of the three
    columns is missing.
    """
    helper_columns = ['key','pickup_datetime','pickup_date']
    data.drop(labels=helper_columns, axis=1, inplace=True)
    return data

In [42]:
# run every cleaning / feature step in order
def cleandata(data):
    """Apply the full cleaning and feature-engineering pipeline to ``data``.

    Steps, in order: parse ``pickup_datetime``, split it into date parts,
    remove coordinate outliers, drop rows with missing values, drop rows with
    non-positive passenger count, drop rows with non-positive fare, add
    ``trip_distance``, encode the weekday as a number, and drop the helper
    columns.

    The first two steps assign columns on the frame that was passed in, so the
    caller's object is modified before the row filters create new frames.

    Returns the cleaned frame.
    """
    pipeline = (
        change_datetime_format,
        split_datetime,
        remove_outliers,
        remove_null,
        remove_neg_pass_count,
        remove_neg_fare_amount,
        calc_distance,
        pickupday_encode,
        drop_columns,
    )
    for step in pipeline:
        data = step(data)

    return data

In [43]:
# Run the cleaning pipeline on the training sample and rebind `train` to the result.
# NOTE: not idempotent - the pipeline drops 'pickup_datetime', which its first step
# reads, so re-running this cell without reloading the data raises a KeyError.
train = cleandata(train)

In [44]:
# Preview the first rows of the cleaned training frame.
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,pickup_hour,pickup_day_of_week,pickup_month,pickup_year,trip_distance
0,4.5,-73.98691,40.739538,-73.991381,40.745614,2,20,23,3,10,2010,0.480646
1,4.1,-73.961572,40.760283,-73.957438,40.769387,5,30,10,3,12,2009,0.665188
2,6.1,-73.979437,40.746517,-73.984195,40.732117,1,20,11,5,7,2012,1.02565
3,4.9,-73.964097,40.792508,-73.976422,40.785767,1,31,11,2,5,2011,0.795377
4,6.5,-74.003943,40.72567,-73.988915,40.74837,1,25,17,2,5,2010,1.754687


In [25]:
# Check dtypes and non-null counts after cleaning.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390181 entries, 0 to 399999
Data columns (total 12 columns):
fare_amount           390181 non-null float64
pickup_longitude      390181 non-null float64
pickup_latitude       390181 non-null float64
dropoff_longitude     390181 non-null float64
dropoff_latitude      390181 non-null float64
passenger_count       390181 non-null int64
pickup_day            390181 non-null int64
pickup_hour           390181 non-null int64
pickup_day_of_week    390181 non-null int64
pickup_month          390181 non-null int64
pickup_year           390181 non-null int64
trip_distance         390181 non-null float64
dtypes: float64(6), int64(6)
memory usage: 38.7 MB


In [None]:
#test = cleandata(test)

In [26]:
# Inspect the test frame again; it has not been passed through cleandata.
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11084772 entries, 0 to 11084771
Data columns (total 7 columns):
key                  object
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(4), int64(1), object(2)
memory usage: 592.0+ MB


In [27]:
def processDataForModelling(data,target,is_train=True,split=0.3):
    """One-hot encode ``data`` and, for training data, split it for validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode with ``pd.get_dummies``.
    target : str
        Name of the label column; only used when ``is_train`` equals True.
    is_train : bool, default True
        When equal to True, separate ``target`` from the features and split
        them into train/validation parts with ``random_state=123``.
    split : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)`` when ``is_train`` equals True,
        otherwise the encoded frame.
    """
    # One hot Encoding
    encoded = pd.get_dummies(data)

    # Equality test kept on purpose: only values equal to True select the split path.
    if not (is_train == True):
        print ("Shape of Test Data",encoded.shape)
        return encoded

    features = encoded.drop([target],axis=1)
    labels = encoded[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=split, random_state=123
    )

    print("Shape of Training Features",X_train.shape)
    print("Shape of Validation Features ",X_test.shape)

    return X_train, X_test, y_train, y_test

In [28]:
# Hold out 20% of the cleaned training data for validation (overrides the 0.3 default).
X_train, X_test, y_train, y_test=processDataForModelling(train,'fare_amount',is_train=True,split=0.2)

Shape of Training Features (312144, 11)
Shape of Validation Features  (78037, 11)


In [17]:
# Mean fare of the training split, rounded to 2 decimals; used below as the constant baseline prediction.
avg_fare=round(np.mean(y_train),2)
avg_fare

In [30]:
# Baseline Model: predict the mean training fare for every validation row.
# The prediction vector must have one entry per row of y_test (not per row of the
# test frame); otherwise mean_squared_error rejects the mismatched lengths.
baseline_pred=np.repeat(avg_fare,y_test.shape[0])
# scikit-learn's argument order is (y_true, y_pred).
baseline_rmse=np.sqrt(mean_squared_error(y_test, baseline_pred))
print("Basline RMSE of Validation data :",baseline_rmse)

Basline RMSE of Validation data : 9.762140179591256


In [19]:
# Baseline prediction for the submission: one mean-fare value per row of the test frame.
baseline_pred=np.repeat(avg_fare,test.shape[0])
# No RMSE is computed here: the test frame has no fare_amount column to score against.

In [21]:
# Assemble and write the submission file (key + predicted fare).
p = pd.DataFrame(baseline_pred)
# pd.concat(axis=1) aligns on index labels and pads with NaN, so a prediction vector of
# the wrong length would silently yield missing fares. Fail loudly instead.
if len(p) != len(test):
    raise ValueError(
        "baseline_pred has %d rows but test has %d" % (len(p), len(test))
    )
# reset_index makes the pairing positional even if `test` carries a non-default index.
datasets = pd.concat([test['key'].reset_index(drop=True),p[0]],axis=1)
datasets.columns = ['key', 'fare_amount']
datasets.to_csv('submission1.csv',index = False)

In [22]:
# Read the written submission back to sanity-check its contents.
d1  = pd.read_csv('submission1.csv')

In [23]:
# Row/column count of the re-read submission.
d1.shape

(11084772, 2)

In [24]:
# Number of missing predictions in the submission.
d1['fare_amount'].isnull().sum()

0

In [9]:
# List every column of the re-read submission that has missing values, with its count.
cols = d1.columns

for i in cols:
    n_missing = d1[i].isnull().sum()
    if n_missing == 0:
        continue
    print("Column name is:", i)
    print(n_missing)

Column name is: fare_amount
11006735


In [25]:
# Number of rows with a non-missing prediction. The column is named explicitly
# rather than read from a loop variable left over from another cell.
d1.shape[0] - d1['fare_amount'].isnull().sum()

11084772

In [13]:
# Summary statistics of the numeric test columns.
test.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,11084770.0,11084770.0,11084690.0,11084690.0,11084770.0
mean,-72.50765,39.91602,-72.51082,39.92153,1.685516
std,13.13703,9.638166,12.98226,10.04393,1.322056
min,-3414.133,-3488.08,-3442.025,-3488.08,0.0
25%,-73.99206,40.73493,-73.9914,40.73404,1.0
50%,-73.9818,40.75265,-73.98016,40.75316,1.0
75%,-73.96705,40.76713,-73.96368,40.76809,2.0
max,3456.138,3378.013,3453.425,3390.614,208.0


In [15]:
# Count missing keys in the test frame.
test['key'].isnull().sum()

0