## Imported libraries and function

In [16]:
import pandas as pd
from sklearn import datasets, linear_model
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
def get_train_data():
    data = pd.read_csv('train.csv')

    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"])
    data["dropoff_datetime"] = pd.to_datetime(data["dropoff_datetime"])
    data['pick_year'] = data["pickup_datetime"].dt.year
    data['pick_month'] = data["pickup_datetime"].dt.month
    data['pick_day'] = data["pickup_datetime"].dt.day

    data['pickup_year'] = data["pickup_datetime"].dt.year
    data['pickup_month'] = data["pickup_datetime"].dt.month
    data['pickup_day'] = data["pickup_datetime"].dt.day

    data['dropoff_year'] = data["dropoff_datetime"].dt.year
    data['dropoff_month'] = data["dropoff_datetime"].dt.month
    data['dropoff_day'] = data["dropoff_datetime"].dt.day

    pickup_hour =  data["pickup_datetime"].dt.hour
    pickup_minute = data["pickup_datetime"].dt.minute
    pickup_second = data["pickup_datetime"].dt.second

    dropoff_hour =  data["dropoff_datetime"].dt.hour
    dropoff_minute = data["dropoff_datetime"].dt.minute
    dropoff_second = data["dropoff_datetime"].dt.second

    pickup_time = pickup_hour*3600+pickup_minute*60+pickup_second
    data['pickup_time'] = pickup_time

    dropoff_time = dropoff_hour*3600+dropoff_minute*60+dropoff_second
    data['dropoff_time'] = dropoff_time

    data = data.drop(['pick_month'],axis=1)
    data =data.drop(['pick_day'],axis=1)
    data =data.drop(['pick_year'],axis=1)
    
    return data

def get_test_data():
    data = pd.read_csv('test.csv')

    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"])
    data['pick_year'] = data["pickup_datetime"].dt.year
    data['pick_month'] = data["pickup_datetime"].dt.month
    data['pick_day'] = data["pickup_datetime"].dt.day

    data['pickup_year'] = data["pickup_datetime"].dt.year
    data['pickup_month'] = data["pickup_datetime"].dt.month
    data['pickup_day'] = data["pickup_datetime"].dt.day

    pickup_hour =  data["pickup_datetime"].dt.hour
    pickup_minute = data["pickup_datetime"].dt.minute
    pickup_second = data["pickup_datetime"].dt.second


    pickup_time = pickup_hour*3600+pickup_minute*60+pickup_second
    data['pickup_time'] = pickup_time

    data = data.drop(['pick_month'],axis=1)
    data =data.drop(['pick_day'],axis=1)
    data =data.drop(['pick_year'],axis=1)
    
    return data

from math import sin, cos, sqrt, atan2, radians

# approximate radius of earth in km
def get_L2(lat1,lon1,lat2,lon2):
    R = 6373.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

# Dataframe for LR
### Create Standard x and y

In [2]:
# Read data frame
df_train = get_train_data()
df_test = get_test_data()

# Drop id, vendor_id, dropoff_month, dropoff_day, dropoff_time, store_and_fwd_flag
df_train= df_train.drop(['id'],axis=1)
df_train= df_train.drop(['vendor_id'],axis=1)
df_train= df_train.drop(['dropoff_month'],axis=1)
df_train= df_train.drop(['dropoff_day'],axis=1)
df_train= df_train.drop(['dropoff_year'],axis=1)
df_train= df_train.drop(['dropoff_time'],axis=1)
df_train= df_train.drop(['store_and_fwd_flag'],axis=1)
df_train= df_train.drop(['pickup_datetime'],axis=1)
df_train= df_train.drop(['dropoff_datetime'],axis=1)

test_id = df_test['id']
df_test= df_test.drop(['id'],axis=1)
df_test= df_test.drop(['vendor_id'],axis=1)
df_test= df_test.drop(['store_and_fwd_flag'],axis=1)
df_test= df_test.drop(['pickup_datetime'],axis=1)


# Make standard x,y
train_y = df_train['trip_duration']
train_x = df_train.drop(['trip_duration'],axis=1)


test_x = df_test
#test_id


### Basic - train

In [3]:
regr = linear_model.LinearRegression()
# Timer
import timeit
start_time = timeit.default_timer()

# Actual training
regr.fit(train_x, train_y)
pred_y = regr.predict(test_x)
elapsed = timeit.default_timer() - start_time
print (elapsed)






0.5097781080112327


### Basic -predict

In [4]:
# Make the dataframe
duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
    'id':id_content,
    'trip_duration':duration_content
})

# Create the csv file
df.to_csv('basic_LR.csv',index= False)

# Now, upload the csv file to
# https://www.kaggle.com/c/nyc-taxi-trip-duration/submit
print("Score is 0.86725")

Score is 0.86725


### L2 - train

In [5]:
x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t

regr = linear_model.LinearRegression()
# Timer
import timeit
start_time = timeit.default_timer()

# Actual training
regr.fit(x, y)
pred_y = regr.predict(test)
elapsed = timeit.default_timer() - start_time
print (elapsed)



0.6513427649915684


### L2 - predict

In [6]:
# Make the dataframe

duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
    'id':id_content,
    'trip_duration':duration_content
})

# Create the csv file
df.to_csv('L2_LR.csv',index= False)

# Now, upload the csv file to
# https://www.kaggle.com/c/nyc-taxi-trip-duration/submit
print("Score is 0.68746")

Score is 0.68746


### L1 - train

In [7]:
x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L1 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L1.append(abs(lat1-lat2)+abs(lon1-lon2))
L1_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L1_t.append(abs(lat1-lat2)+abs(lon1-lon2))
x['L1'] = L1
test['L1'] = L1_t

regr = linear_model.LinearRegression()
# Timer
import timeit
start_time = timeit.default_timer()

# Actual training
regr.fit(x, y)
pred_y = regr.predict(test)
elapsed = timeit.default_timer() - start_time
print (elapsed)



0.6367131829902064


In [8]:
# Make the dataframe

duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
    'id':id_content,
    'trip_duration':duration_content
})

# Create the csv file
df.to_csv('L1_LR.csv',index= False)

# Now, upload the csv file to
# https://www.kaggle.com/c/nyc-taxi-trip-duration/submit
print("Score is 0.69480")

Score is 0.69480


### Polynomial(degree2) with L2 - train

In [9]:
x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t

degree_list = [2,3]

for degree in degree_list:
    poly = PolynomialFeatures(degree=degree)
    x_ = poly.fit_transform(x)
    test_ = poly.fit_transform(test)
    clf = linear_model.LinearRegression()
    import timeit
    start_time = timeit.default_timer()
    clf.fit(x_, y)
    pred_y = clf.predict(test_)
    elapsed = timeit.default_timer() - start_time
    print (elapsed)
    duration_content = np.rint(pred_y)
    id_content = np.array(test_id)

    df = pd.DataFrame({
        'id':id_content,
        'trip_duration':duration_content
    })

    # Create the csv file
    filename = 'PolyL2_LR_' + str(degree) + '.csv'
    df.to_csv(filename,index= False)
"""
poly_2 = PolynomialFeatures(degree=2)
x = poly.fit_transform(x)
test = poly.fit_transform(test)

clf = linear_model.LinearRegression()
import timeit
start_time = timeit.default_timer()
clf.fit(x_, y)
pred_y = clf.predict(test_)
elapsed = timeit.default_timer() - start_time
print (elapsed)



"""




10.589969166001538
64.12128323598881


'\npoly_2 = PolynomialFeatures(degree=2)\nx = poly.fit_transform(x)\ntest = poly.fit_transform(test)\n\nclf = linear_model.LinearRegression()\nimport timeit\nstart_time = timeit.default_timer()\nclf.fit(x_, y)\npred_y = clf.predict(test_)\nelapsed = timeit.default_timer() - start_time\nprint (elapsed)\n\n\n\n'

In [10]:
# Make the dataframe

duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
    'id':id_content,
    'trip_duration':duration_content
})

# Create the csv file
df.to_csv('PolyL2_LR.csv',index= False)

# Now, upload the csv file to
# https://www.kaggle.com/c/nyc-taxi-trip-duration/submit
print("Score is 0.63158")

Score is 0.63158


### Lasso with L2 -Train

In [12]:

x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t

degree_list = [2]

for degree in degree_list:
    poly = PolynomialFeatures(degree=degree)
    x_ = poly.fit_transform(x)
    test_ = poly.fit_transform(test)
    clf = linear_model.Lasso(alpha=0.1)
    import timeit
    start_time = timeit.default_timer()
    clf.fit(x_, y)
    pred_y = clf.predict(test_)
    elapsed = timeit.default_timer() - start_time
    print (elapsed)
    duration_content = np.rint(pred_y)
    id_content = np.array(test_id)

    df = pd.DataFrame({
        'id':id_content,
        'trip_duration':duration_content
    })

    # Create the csv file
    filename = 'Lasso' + str(degree) + '.csv'
    df.to_csv(filename,index= False)



309.305736476992


In [13]:
print("0.64415")

0.64415


### Lasso(degree 3) with L2 -Train

In [14]:

x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t

degree_list = [3]

for degree in degree_list:
    poly = PolynomialFeatures(degree=degree)
    x_ = poly.fit_transform(x)
    test_ = poly.fit_transform(test)
    clf = linear_model.Lasso(alpha=0.1)
    import timeit
    start_time = timeit.default_timer()
    clf.fit(x_, y)
    pred_y = clf.predict(test_)
    elapsed = timeit.default_timer() - start_time
    print (elapsed)
    duration_content = np.rint(pred_y)
    id_content = np.array(test_id)

    df = pd.DataFrame({
        'id':id_content,
        'trip_duration':duration_content
    })

    # Create the csv file
    filename = 'Lasso' + str(degree) + '.csv'
    df.to_csv(filename,index= False)



1553.5645713300037


In [15]:
print("0.63360")

0.63360


In [17]:

x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t

degree_list = [2]

for degree in degree_list:
    poly = PolynomialFeatures(degree=degree)
    x_ = poly.fit_transform(x)
    test_ = poly.fit_transform(test)
    cv_list = [0.01,0.1,0.5,1.0,2.0,5.0,10.0]
    score_list =[]
    for cv in cv_list:
        clf = linear_model.Lasso(alpha=cv)
        scores = cross_val_score(clf, x_, y, cv=5)
        score_list.append(scores.mean())
    max_idx = np.argmax(np.array(score_list))
    print("best alpha is {}".format(cv_list[max_idx]))
    import timeit
    clf = linear_model.Lasso(alpha=cv_list[max_idx])
    start_time = timeit.default_timer()
    clf.fit(x_, y)
    pred_y = clf.predict(test_)
    elapsed = timeit.default_timer() - start_time
    print (elapsed)
    duration_content = np.rint(pred_y)
    id_content = np.array(test_id)

    df = pd.DataFrame({
        'id':id_content,
        'trip_duration':duration_content
    })

    # Create the csv file
    filename = 'Lasso' + str(degree) + '.csv'
    df.to_csv(filename,index= False)





best alpha is 10.0
191.65076293601305


In [18]:
clf = Ridge(alpha=0.1)
clf.fit(x_, y) 
pred_y = clf.predict(test_)
duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
    'id':id_content,
    'trip_duration':duration_content
})

# Create the csv file
filename = 'Ridge' + str(degree) + '.csv'
df.to_csv(filename,index= False)

NameError: name 'Ridge' is not defined

In [26]:

x = train_x.copy()
y = train_y.copy()
test = test_x.copy()

df_lat1 = np.array(x['pickup_latitude'])
df_lon1 = np.array(x['pickup_longitude'])
df_lat2 = np.array(x['dropoff_latitude'])
df_lon2 = np.array(x['dropoff_longitude'])

dft_lat1 = np.array(test['pickup_latitude'])
dft_lon1 = np.array(test['pickup_longitude'])
dft_lat2 = np.array(test['dropoff_latitude'])
dft_lon2 = np.array(test['dropoff_longitude'])

L2 = []
for idx in range(len(df_lat1)):
    lat1 = df_lat1[idx]
    lon1 = df_lon1[idx]
    lat2 = df_lat2[idx]
    lon2 = df_lon2[idx]
    L2.append(get_L2(lat1,lon1,lat2,lon2))
L2_t = []
for idx in range(len(dft_lat1)):
    lat1 = dft_lat1[idx]
    lon1 = dft_lon1[idx]
    lat2 = dft_lat2[idx]
    lon2 = dft_lon2[idx]
    L2_t.append(get_L2(lat1,lon1,lat2,lon2))
x['L2'] = L2
test['L2'] = L2_t



In [27]:
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor(hidden_layer_sizes=(20,),  activation='relu', solver='adam',    alpha=0.001,batch_size='auto',
               learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
               random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
               nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
               epsilon=1e-08)

import timeit
start_time = timeit.default_timer()
reg = reg.fit(x, y)
pred_y = reg.predict(test)
elapsed = timeit.default_timer() - start_time
print (elapsed)
duration_content = np.rint(pred_y)
id_content = np.array(test_id)

df = pd.DataFrame({
     'id':id_content,
     'trip_duration':duration_content
})

# Create the csv file
filename = 'MLPR_test' + str(degree) + '.csv'
df.to_csv(filename,index= False)

59.79876787198009
