In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
df = pd.read_csv('train.csv',nrows=1000000)

# Calculate distance with respect to latitude and longitude

In [2]:
from math import sin, cos, sqrt, atan2, radians
def calculateDistance(d_lon, d_lat,lat_p,lat_d):
    # approximate radius of earth in km
    R = 6373.0
    dlon = d_lon
    dlat = d_lat
    distance = []
    for i in range(len(d_lon)):
        a = sin(dlat[i] / 2)**2 + cos(lat_p[i]) * cos(lat_d[i]) * sin(dlon[i] / 2)**2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance.append(R*c)
    distance = np.asarray(distance)
    print(distance[:5])
    return distance

In [3]:
#data = np.load("train_numpy.npy")
print(df.head())

                             key  fare_amount          pickup_datetime  \
0    2009-06-15 17:26:21.0000001          4.5  2009-06-15 17:26:21 UTC   
1    2010-01-05 16:52:16.0000002         16.9  2010-01-05 16:52:16 UTC   
2   2011-08-18 00:35:00.00000049          5.7  2011-08-18 00:35:00 UTC   
3    2012-04-21 04:30:42.0000001          7.7  2012-04-21 04:30:42 UTC   
4  2010-03-09 07:51:00.000000135          5.3  2010-03-09 07:51:00 UTC   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0        -73.844311        40.721319         -73.841610         40.712278   
1        -74.016048        40.711303         -73.979268         40.782004   
2        -73.982738        40.761270         -73.991242         40.750562   
3        -73.987130        40.733143         -73.991567         40.758092   
4        -73.968095        40.768008         -73.956655         40.783762   

   passenger_count  
0                1  
1                1  
2                2  
3       

In [4]:
df['longitude_difference']= df['dropoff_longitude']-df['pickup_longitude']
df['latitude_difference'] = df['dropoff_latitude']-df['pickup_latitude']

In [5]:
distance = calculateDistance(df['longitude_difference'].values,df['latitude_difference'].values,df['pickup_latitude'].values,df['dropoff_latitude'].values)
df = df.drop(['dropoff_latitude', 'dropoff_longitude','pickup_latitude','pickup_longitude'], axis=1)

[ 60.09695026 507.40024081  87.02352385 161.47238614 123.98888534]


# Stack distance and other data

In [6]:
data = df.values
data = np.hstack((data,distance[:,None]))

# Drop missing rows, rows which have more than 6 passenger and fair less than 1

In [7]:
data = data[data[:,1]>0]
data = data[data[:,3]<6]
dataframe = pd.DataFrame(data)
f_data = dataframe.dropna(how = 'any',axis='rows')
f_data = f_data.values
print("After removing NAn and drop rows : ",f_data.shape)

After removing NAn and drop rows :  (978727, 7)


# Seprate the key, label and X_train

In [8]:
data = f_data
keys = data[:,0]
labels = data[:,1]
X_train = data[:,2:]

In [9]:
print("label shape  : ",labels.shape)
print("data shape : ", X_train.shape)

label shape  :  (978727,)
data shape :  (978727, 5)


# Preprocess the column pickup_datetime 

In [10]:
col = X_train[:,0]
string_date_rng = [str(x.strip(" UTC")) for x in col]

In [11]:
split_key = [x.split() for x in string_date_rng]
split_key = np.asarray(split_key)
date, time = split_key[:,0],split_key[:,1] 

# Function to get date i.e. day month and year in seprate column

In [12]:
def fun_date(date):
    split_date = [x.split("-") for x in date]
    split_date = np.asarray(split_date)
    year,month,day = split_date[:,0],split_date[:,1],split_date[:,2]
    year = np.asarray(list(map(int, year)))
    month = np.asarray(list(map(int, month)))
    day = np.asarray(list(map(int, day)))
    days = np.hstack((month[:,None],day[:,None]))
    final_date = np.hstack((year[:,None],days))
    return final_date

# Seprate time in shifts day and night

In [13]:
def getShift(hour):
    if (hour >= 0 and hour <= 6):
        return 1
    else: 
        return 0

In [14]:
def convert_into_hours(time):
    split_time = [x.split(":") for x in time]
    split_time = np.asarray(split_time)
    hour,minutes,sec = split_time[:,0],split_time[:,1],split_time[:,2]
    hours = []
    hour= np.asarray(list(map(int, hour)))
    for i in range(len(sec)):
        hours.append(getShift(hour[i]))
    hours = np.asarray(hours)
    return hours

# Stack date and time

In [15]:
final_date = fun_date(date)
hours = convert_into_hours(time)
date_time = np.hstack((final_date,hours[:,None]))

In [16]:
print("date time shape : ",date_time.shape)
total_data = np.hstack((date_time,np.absolute(X_train[:,1:])))

date time shape :  (978727, 4)


# Gradient Boosting Classifier Model

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
model_grad = GradientBoostingRegressor().fit(total_data, labels)
#model_grad = GradientBoostingRegressor(learning_rate=0.1,min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.75,random_state=12).fit(total_data, labels)

# Linear Regression Model

In [18]:
from sklearn.linear_model import LinearRegression
model_linear = LinearRegression().fit(total_data, labels)

# XGBoost Model

In [19]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(objective="reg:linear",random_state=42)
#xgb_model = xgb.XGBRegressor(learning_rate =0.1,n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, seed=27).fit(total_data,labels)
xgb_model.fit(total_data,labels)

  "because it will generate extra copies and increase memory consumption")


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

# Load test file and preprocess it like we do in training dataset

In [20]:
df_test = pd.read_csv('test.csv')
print("test data shape :",df_test.shape)
df_test['longitude_difference']= df_test['dropoff_longitude']-df_test['pickup_longitude']
df_test['latitude_difference'] = df_test['dropoff_latitude']-df_test['pickup_latitude']

distance_test = calculateDistance(df_test['longitude_difference'].values,df_test['latitude_difference'].values,df_test['pickup_latitude'].values,df_test['dropoff_latitude'].values)
df_test = df_test.drop(['dropoff_latitude', 'dropoff_longitude','pickup_latitude','pickup_longitude'], axis=1)

df_test = df_test.values

test data shape : (9914, 7)
[137.28955003 147.4781896   37.37425975 118.75587263 322.49623257]


# Segregate the dataset

In [21]:
keys = df_test[:,0]
X_test = df_test[:,1:]
X_test = np.hstack((X_test,distance_test[:,None]))

# Preprocess the column pickup_datetime 

In [22]:
col_test = X_test[:,0]
test_date_rng = [str(x.strip(" UTC")) for x in col_test]
split_key = [x.split() for x in test_date_rng]
split_key = np.asarray(split_key)
date, time = split_key[:,0],split_key[:,1] 
final_date_test = fun_date(date)
hours_test = convert_into_hours(time)
test_date_time = np.hstack((final_date_test,hours_test[:,None]))

In [23]:
print(test_date_time.shape)
test_data = np.hstack((test_date_time,np.absolute(X_test[:,1:])))
print(test_data.shape)

(9914, 4)
(9914, 8)


# Predict the result of Gradient Boosting Model and save in CSV

In [24]:
predicted = model_grad.predict(test_data)
predicted = np.around(predicted,decimals=1)
final_res = np.hstack((keys[:,None],predicted[:,None]))
fin_df = pd.DataFrame(final_res)
fin_df.to_csv("submissions_gradient.csv", sep=',',header=['key','fare_amount'],index=False)

# Predict the result of Linear Model and save in CSV

In [25]:
predicted = model_linear.predict(test_data)
predicted = np.around(predicted,decimals=1)
final_res = np.hstack((keys[:,None],predicted[:,None]))
fin_df = pd.DataFrame(final_res)
fin_df.to_csv("submissions_linear.csv", sep=',',header=['key','fare_amount'],index=False)

# Predict the result of XGBoost Classifier Model and save in CSV

In [26]:
predicted = xgb_model.predict(test_data)
predicted = np.around(predicted,decimals=1)
final_res = np.hstack((keys[:,None],predicted[:,None]))
fin_df = pd.DataFrame(final_res)
fin_df.to_csv("submissions_xgb.csv", sep=',',header=['key','fare_amount'],index=False)

In [27]:
#xgb.plot_importance(xgb_model)