### Libraries

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.preprocessing import StandardScaler
from math import radians, cos, sin, asin, sqrt

### Linear Regression using Gradient Descent

In [2]:
def h(dataset, thetas):
    """Linear hypothesis: the model's prediction for every row of ``dataset``.

    Evaluates ``np.dot(dataset, thetas)``, i.e. the weighted sum of each
    row's features with ``thetas`` as the weights.
    """
    predictions = np.dot(dataset, thetas)
    return predictions

def derv_cost_func(dataset, target, thetas):
    """Gradient of the squared-error cost with respect to ``thetas``.

    Computes ``X^T (h(X, thetas) - target)``. The result is NOT divided by
    the number of rows; ``gradient_descent`` applies that scaling itself.
    """
    # prediction error of the current thetas on every row
    residuals = h(dataset, thetas) - target
    return np.dot(np.transpose(dataset), residuals)

def gradient_descent(dataset, target, learning_rate = 0.01, num_itrs = 30000, seed = None):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    dataset : array of shape (n_rows, n_features)
        Feature matrix; only ``dataset.shape`` and matrix products are used.
    target : array-like of length n_rows
        Values to regress on.
    learning_rate : float
        Step size applied to the mean gradient.
    num_itrs : int
        Number of update steps; there is no early stopping.
    seed : int or None
        When given, the starting thetas are drawn from a private
        ``np.random.RandomState(seed)`` so the fit is reproducible. When
        ``None`` (the default) they come from NumPy's global generator,
        exactly as before, so ``np.random.seed`` still controls them.

    Returns
    -------
    numpy.ndarray
        The thetas (one per feature column) after ``num_itrs`` updates.

    Raises
    ------
    ZeroDivisionError
        If ``dataset`` has no rows.
    """
    n_rows, n_features = dataset.shape[0], dataset.shape[1]

    # randomly select thetas, uniform in [0, 1)
    if seed is None:
        thetas = np.random.rand(n_features,)
    else:
        thetas = np.random.RandomState(seed).rand(n_features,)

    # loop-invariant factor: learning rate averaged over the number of rows
    step = learning_rate * (1 / float(n_rows))

    # updating thetas
    for _ in range(num_itrs):
        thetas = thetas - step * derv_cost_func(dataset, target, thetas)

    return thetas

### Distance Formulas

In [3]:
# function that takes in pickup and dropoff coordinates and calculates distance
def haversine(lon_1, lat_1, lon_2, lat_2, radius = 6371):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lon_1, lat_1 : float
        Longitude and latitude of the first point, in degrees.
    lon_2, lat_2 : float
        Longitude and latitude of the second point, in degrees.
    radius : float
        Sphere radius; the result is in the same unit. Defaults to 6371,
        the mean Earth radius in kilometres.

    Returns
    -------
    float
        ``radius`` times the central angle between the two points.
    """
    lon_1, lat_1, lon_2, lat_2 = map(radians, [lon_1, lat_1, lon_2, lat_2])

    d_lon = lon_2 - lon_1
    d_lat = lat_2 - lat_1

    # haversine of the central angle
    a = sin(d_lat/2)**2 + cos(lat_1) * cos(lat_2) * sin(d_lon/2)**2

    # floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make asin() raise a domain error;
    # the comparison is False for NaN, so NaN inputs still propagate
    if a > 1.0:
        a = 1.0

    central_angle = 2 * asin(sqrt(a))

    return radius * central_angle
    
# function that calculates the distance of every row in one vectorized pass
def calculate_distances(dataset):
    """Haversine pickup-to-dropoff distance for every row of ``dataset``.

    Reads the ``pickup_longitude``, ``pickup_latitude``,
    ``dropoff_longitude`` and ``dropoff_latitude`` columns (decimal degrees)
    and applies the same formula as the scalar ``haversine`` helper, using
    NumPy so all rows are processed at once.

    Rows are taken by position, so the result is correct whatever the
    DataFrame's index looks like (shuffled, filtered, non-integer, ...).

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        Distances on a sphere of radius 6371 (mean Earth radius in km),
        in row order.
    """
    earth_radius_km = 6371

    lon_1 = np.radians(np.asarray(dataset['pickup_longitude'], dtype=float))
    lat_1 = np.radians(np.asarray(dataset['pickup_latitude'], dtype=float))
    lon_2 = np.radians(np.asarray(dataset['dropoff_longitude'], dtype=float))
    lat_2 = np.radians(np.asarray(dataset['dropoff_latitude'], dtype=float))

    d_lon = lon_2 - lon_1
    d_lat = lat_2 - lat_1

    # haversine of the central angle
    a = np.sin(d_lat / 2) ** 2 + np.cos(lat_1) * np.cos(lat_2) * np.sin(d_lon / 2) ** 2

    # guard against rounding just above 1 (near-antipodal points), which
    # would turn arcsin into NaN; np.minimum keeps genuine NaNs as NaN
    a = np.minimum(a, 1.0)

    return earth_radius_km * (2 * np.arcsin(np.sqrt(a)))

### Translating into meaningful data

In [4]:
def prepare_data(dataset):
    """Turn a raw trip table into the numeric feature table used for fitting.

    The returned frame keeps every input column except the ones listed
    below, followed (in this order) by:

    * ``pickup_month`` - month of ``pickup_datetime`` as a float
    * ``pickup_day``   - weekday of ``pickup_datetime`` as a float (Monday = 0)
    * ``distance``     - haversine pickup-to-dropoff distance
    * ``constant``     - a column of ones for the intercept term

    Dropped columns: the four coordinate columns, ``pickup_datetime``,
    ``id`` and ``store_and_fwd_flag``.

    The input DataFrame is not modified; a new frame is returned, so the
    calling cell can safely be re-run.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    # derive everything that needs the raw columns before they are dropped
    pickup_time = pd.to_datetime(dataset['pickup_datetime'])
    distances = calculate_distances(dataset)

    # dropping useless columns; drop() returns a new frame, leaving the
    # caller's DataFrame untouched
    prepared = dataset.drop(
        [
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'pickup_datetime',
            'id',
            'store_and_fwd_flag',
        ],
        axis=1,
    )

    # converting datetime to month and weekday
    prepared['pickup_month'] = pickup_time.dt.month.astype(float)
    prepared['pickup_day'] = pickup_time.dt.weekday.astype(float)

    # converting distances
    prepared['distance'] = distances

    # intercept term
    prepared['constant'] = np.ones(len(prepared))

    return prepared

### Function to write test predictions to CSV

In [5]:
def write_output(test_data, thetas, output_path = 'output.csv'):
    """Predict trip durations for ``test_data`` and save them as CSV.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Feature table plus an ``id`` column. Every column other than ``id``
        is treated as a feature, in column order.
    thetas : sequence of float
        One weight per feature column, in the same order.
    output_path : str
        Destination file; defaults to ``'output.csv'``.

    The file has two columns, ``id`` and ``trip_duration``, and no index
    column. ``test_data`` itself is left unchanged.
    """
    ids = test_data['id']

    # drop() returns a copy, so the caller's frame keeps its 'id' column
    features = test_data.drop('id', axis=1).values

    # generate predictions for all rows with one matrix-vector product
    predictions = np.dot(features, thetas)

    # prepare dataframe in given format
    df = pd.DataFrame({'id': ids.values, 'trip_duration': predictions})

    # save csv
    df.to_csv(output_path, encoding = 'utf-8', index = False)

### Calling Functions

In [28]:
# reading data
# Loads the training and test sets from CSV files in the current working
# directory; both frames are reshaped by prepare_data in the next cell.

training_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [29]:
# preparing data

# keep the test ids aside: prepare_data drops the 'id' column
test_id = test_data['id']

# split the label off the training frame, then engineer its features
target = training_data.pop('trip_duration')
training_data = prepare_data(training_data)

# engineer the test features and re-attach the ids for write_output
test_data = prepare_data(test_data)
test_data['id'] = test_id

In [30]:
# preview the engineered features; this column order is also the order of
# the thetas fitted below
training_data.head()

Unnamed: 0,vendor_id,passenger_count,pickup_month,pickup_day,distance,constant
0,1,1,2.0,3.0,3.904278,1.0
1,2,5,1.0,2.0,2.682692,1.0
2,2,1,6.0,6.0,3.721689,1.0
3,2,1,1.0,6.0,1.488411,1.0
4,2,1,6.0,1.0,1.23492,1.0


In [31]:
# running gradient descent (30K itrs, alpha 0.01) 5 times, each on a fresh
# random batch of 25K datapoints
# NOTE: an earlier version of this comment said 60K itrs, but the call has
# always used gradient_descent's default of 30000; raise NUM_ITRS if 60K
# was the intent.

N_RUNS = 5
BATCH_SIZE = 25000
LEARNING_RATE = 0.01
NUM_ITRS = 30000

# seed NumPy's global generator so both the batch sampling and the random
# starting thetas inside gradient_descent are reproducible on Restart & Run All
np.random.seed(42)

thetas = []

for i in range(N_RUNS):
    # draw the batch without touching training_data; target shares its
    # index, so the labels are looked up by the sampled row labels
    x = training_data.sample(n=min(BATCH_SIZE, len(training_data)))
    y = target.loc[x.index]

    theta = gradient_descent(x.values, y.values, learning_rate=LEARNING_RATE, num_itrs=NUM_ITRS)
    thetas.append(theta)
    print(theta)

[243.32103042 -15.27024682   3.47184041   9.59402932 130.74648807
 121.43993352]
[244.46403508 -11.50629868   3.95996064   5.91024681 130.29564449
 125.58395987]
[255.6488877  -12.67079971  12.8761593   10.78052929 130.75856785
  68.11841923]
[260.46530127  -9.52949341   1.06271849   8.95148214 127.5958789
 119.21630913]
[268.8365219  -17.70087005   1.97401554   1.16924351 130.26596599
 132.45687235]


In [32]:
# getting mean of thetas (element-wise across the runs)
# A separate name is used so `thetas` stays the list of per-run vectors and
# this cell gives the same result when it is run more than once.
mean_thetas = pd.DataFrame(thetas).mean().tolist()

# writing output of predictions to csv
write_output(test_data, mean_thetas)