In [3]:
import pandas as pd
import numpy as np


In [4]:
df = pd.read_csv("data/train.csv")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1658099 entries, 0 to 1658098
Data columns (total 18 columns):
TID                   1658099 non-null object
vendor_id             1658099 non-null object
new_user              1658080 non-null object
tolls_amount          1658099 non-null float64
tip_amount            1492903 non-null float64
mta_tax               1658099 non-null float64
pickup_datetime       1658099 non-null object
dropoff_datetime      1658099 non-null object
passenger_count       1658099 non-null int64
pickup_longitude      1608290 non-null float64
pickup_latitude       1624998 non-null float64
rate_code             1658099 non-null int64
store_and_fwd_flag    1009977 non-null object
dropoff_longitude     1653112 non-null float64
dropoff_latitude      1649937 non-null float64
payment_type          1658099 non-null object
surcharge             1558926 non-null float64
fare_amount           1658099 non-null float64
dtypes: float64(9), int64(2), object(7)
memory usage

In [5]:
df = df.fillna('0')

In [5]:
df['new_user'].unique()

array(['NO', 'YES', '0'], dtype=object)

In [6]:
df.store_and_fwd_flag.unique()

array(['N', '0', 'Y'], dtype=object)

In [6]:
df  = df[df.fare_amount<600]

In [7]:
from sklearn.preprocessing import LabelEncoder

columnsToEncode = ['vendor_id','new_user','store_and_fwd_flag','payment_type']
le = {}
for col in columnsToEncode:
    print('Encoding....: ', col)
    le[col] = LabelEncoder()
    df[col] = le[col].fit_transform(df[col])
    le[col].classes_ = np.append(le[col].classes_, 'other')
    print('Encoded: ', col)

Encoding....:  vendor_id
Encoded:  vendor_id
Encoding....:  new_user
Encoded:  new_user
Encoding....:  store_and_fwd_flag
Encoded:  store_and_fwd_flag
Encoding....:  payment_type
Encoded:  payment_type


In [8]:
from math import cos, asin, sqrt
co_ordinates = ['pickup_latitude','pickup_longitude', 'dropoff_latitude','dropoff_longitude']

def distance(d):
    lat1, lon1, lat2, lon2 = d['pickup_latitude'],d['pickup_longitude'],d['dropoff_latitude'],d['dropoff_longitude']
    p = 0.017453292519943295     #Pi/180
    a = 0.5 - cos((lat2 - lat1) * p)/2 + cos(lat1 * p) * cos(lat2 * p) * (1 - cos((lon2 - lon1) * p)) / 2
    return 12742 * asin(sqrt(a)) #2*R*asin...

df['spatial_distance'] =df[co_ordinates].astype(float).apply(distance, axis=1)

df = df[df.spatial_distance < 15]

In [9]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'],format= '%Y-%m-%d %H:%M:%S')
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'],format= '%Y-%m-%d %H:%M:%S')


def time_diff(row):
    return (row.dropoff_datetime - row.pickup_datetime).total_seconds()

df['travel_time'] = df[['dropoff_datetime','pickup_datetime']].apply(time_diff,axis=1)


'''
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_year'] = df['pickup_datetime'].dt.year
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute
df['pickup_second'] = df['pickup_datetime'].dt.second
df['dropoff_day'] = df['dropoff_datetime'].dt.day
df['dropoff_month'] = df['dropoff_datetime'].dt.month
df['dropoff_year'] = df['dropoff_datetime'].dt.year
df['dropoff_hour'] = df['dropoff_datetime'].dt.hour
df['dropoff_minute'] = df['dropoff_datetime'].dt.minute
df['dropoff_second'] = df['dropoff_datetime'].dt.second

date_columns= ['pickup_hour' ,'pickup_minute','pickup_second','dropoff_day','dropoff_second','dropoff_minute','dropoff_hour','dropoff_year','dropoff_month'
           , 'pickup_day' ,'pickup_month','pickup_year']
'''

"\ndf['pickup_day'] = df['pickup_datetime'].dt.day\ndf['pickup_month'] = df['pickup_datetime'].dt.month\ndf['pickup_year'] = df['pickup_datetime'].dt.year\ndf['pickup_hour'] = df['pickup_datetime'].dt.hour\ndf['pickup_minute'] = df['pickup_datetime'].dt.minute\ndf['pickup_second'] = df['pickup_datetime'].dt.second\ndf['dropoff_day'] = df['dropoff_datetime'].dt.day\ndf['dropoff_month'] = df['dropoff_datetime'].dt.month\ndf['dropoff_year'] = df['dropoff_datetime'].dt.year\ndf['dropoff_hour'] = df['dropoff_datetime'].dt.hour\ndf['dropoff_minute'] = df['dropoff_datetime'].dt.minute\ndf['dropoff_second'] = df['dropoff_datetime'].dt.second\n\ndate_columns= ['pickup_hour' ,'pickup_minute','pickup_second','dropoff_day','dropoff_second','dropoff_minute','dropoff_hour','dropoff_year','dropoff_month'\n           , 'pickup_day' ,'pickup_month','pickup_year']\n"

In [14]:
from sklearn import preprocessing


df= (df[df.travel_time < 20000])

x = df.travel_time #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df['travel_time_norm'] = pd.DataFrame(x_scaled)



In [10]:
features = ['travel_time','vendor_id','new_user','tolls_amount','tip_amount','mta_tax','passenger_count','pickup_longitude','spatial_distance',
           'pickup_latitude','rate_code','store_and_fwd_flag','dropoff_longitude','dropoff_latitude','payment_type','surcharge']
df = df[df.spatial_distance < 15]
X = df[features]
y = df['fare_amount']

In [21]:
#df.select_dtypes(include=[np.number]).isnull().sum()

df[df.travel_time_norm.isnull()]

Unnamed: 0,TID,vendor_id,new_user,tolls_amount,tip_amount,mta_tax,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,...,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,surcharge,fare_amount,spatial_distance,travel_time,travel_time_norm
1517851,AIX0001862852,3,1,0.00,2.8,0.5,2016-03-14 18:55:54,2016-03-14 19:19:00,2,-73.9735,...,1,1,-73.9158,40.7797,0,0,22.60,5.462550,1386.0,
1517852,AIX0001862853,3,1,0.00,0,0.5,2016-03-16 22:03:56,2016-03-16 22:04:53,1,-73.7819,...,1,1,-73.7804,40.6453,1,0.5,3.80,0.142128,57.0,
1517853,AIX0001862854,3,1,0.00,0,0.5,2016-03-23 14:50:02,2016-03-23 15:10:12,2,-73.9779,...,1,1,-74.006,40.7345,1,0,14.30,3.104915,1210.0,
1517855,AIX0001862856,3,1,0.00,0.5,0.5,2016-03-31 08:05:25,2016-03-31 08:09:47,1,-73.9673,...,1,1,-73.9756,40.7638,0,0,6.30,0.805589,262.0,
1517856,AIX0001862857,3,1,0.00,3.36,0.5,2016-03-28 09:18:33,2016-03-28 09:42:43,1,-73.9554,...,1,1,-73.993,40.7143,0,0,20.16,3.203756,1450.0,
1517857,AIX0001862858,1,1,0.00,3.05,0.5,2016-03-30 18:53:17,2016-03-30 19:09:30,1,-74.0109,...,1,1,-73.9938,40.7343,0,1,18.35,3.886334,973.0,
1517858,AIX0001862859,3,1,0.00,2,0.5,2016-03-18 09:42:40,2016-03-18 10:01:03,6,-73.9915,...,1,1,-74.0152,40.7113,0,0,19.30,6.313569,1103.0,
1517860,AIX0001862861,1,1,0.00,2,0.5,2016-03-07 03:09:45,2016-03-07 03:31:01,4,-73.9975,...,1,1,-73.9534,40.7718,0,0.5,21.30,6.753730,1276.0,
1517861,AIX0001862862,1,1,0.00,0,0.5,2016-03-10 18:04:02,2016-03-10 18:33:00,1,-74.0166,...,1,1,-73.9985,40.7452,1,1,20.80,4.440864,1738.0,
1517862,AIX0001862863,3,1,0.00,0,0.5,2016-03-30 19:07:35,2016-03-30 19:19:37,1,-73.9493,...,1,1,-73.9722,40.7525,1,1,11.80,3.570959,722.0,


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test , y_train, y_test = train_test_split(X, y ,test_size =0.25 ,random_state =21)

In [12]:
from sklearn import linear_model

regr = linear_model.LinearRegression()
regr.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [25]:
regr.score(X_test, y_test)

0.87974778634590789

In [28]:
reg = linear_model.Ridge (alpha = .5)
reg.fit(X_train,y_train)


Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [29]:
reg.score(X_test,y_test)

#79715184769468506

0.87975405232408765

In [30]:
bayreg =linear_model.BayesianRidge()
bayreg.fit(X_train,y_train)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)

In [31]:
bayreg.score(X_test,y_test)

0.87974810046045215

In [13]:
from sklearn.svm import SVR

svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)


In [None]:
svr_rbf.fit(X_train,y_train)

In [None]:
svr_rbf.score(X_test,y_test)