In [None]:
import numpy as np
import pandas as pd

import os

# Unzip data

In [None]:
from zipfile import ZipFile

# Unpack each competition archive into the current working directory.
for archive_path in (
    '/kaggle/input/nyc-taxi-trip-duration/train.zip',
    '/kaggle/input/nyc-taxi-trip-duration/test.zip',
    '/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip',
):
    with ZipFile(archive_path, 'r') as file:
        file.extractall()

# Load data

In [None]:
# Read the extracted training file and convert both timestamp columns
# from text to datetime in a single chained expression.
train = pd.read_csv("./train.csv").assign(
    pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']),
    dropoff_datetime=lambda frame: pd.to_datetime(frame['dropoff_datetime']),
)

train.head()

In [None]:
# Read the extracted test file; only the pickup timestamp is converted here.
test = pd.read_csv("./test.csv").assign(
    pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']),
)

test.head()

In [None]:
# Load the extracted sample submission; this frame is reused later in the
# notebook as the container the predictions are written into.
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission.head()

In [None]:
# Count missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
test.isnull().sum()

# Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train.describe()

In [None]:
# 90th percentile of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where the default became
# False and the string columns (e.g. id, store_and_fwd_flag) would raise.
train.quantile(.9, numeric_only=True)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Histogram of four training columns on a 2x2 grid.
# Only every 500th row is plotted to keep the interactive figure light.
train_sample = train[::500]

# Column name -> requested number of bins (subplots are filled row by row).
histogram_bins = {
    'vendor_id': 10,
    'passenger_count': 10,
    'store_and_fwd_flag': 10,
    'trip_duration': 20,
}

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=list(histogram_bins),
)

# add_trace(row=, col=) replaces the deprecated append_trace call.
for position, (column, bins) in enumerate(histogram_bins.items()):
    fig.add_trace(
        go.Histogram(x=train_sample[column], nbinsx=bins),
        row=position // 2 + 1,
        col=position % 2 + 1,
    )

fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Box plot of four training columns on a 2x2 grid.
# Only every 500th row is plotted to keep the interactive figure light.
train_sample = train[::500]

# Subplots are filled row by row in this order.
box_columns = ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'trip_duration']

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=box_columns,
)

# add_trace(row=, col=) replaces the deprecated append_trace call.
for position, column in enumerate(box_columns):
    fig.add_trace(
        go.Box(y=train_sample[column]),
        row=position // 2 + 1,
        col=position % 2 + 1,
    )

fig.show()

In [None]:
# Drop outliers: keep only the rows whose trip_duration is below 2000.
is_regular_trip = train['trip_duration'] < 2000
train = train[is_regular_trip]

In [None]:
import plotly.express as px

# Pairwise scatter plots of four columns, coloured by trip_duration.
# Every 1000th row is used so the figure stays responsive.
matrix_columns = ['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'trip_duration']
fig = px.scatter_matrix(train[::1000], dimensions=matrix_columns, color="trip_duration")

fig.show()

# Train / Test Split

In [None]:
# Target: trip_duration as an (n, 1) column vector that does not share memory with `train`.
y = train["trip_duration"].to_numpy(copy=True).reshape(-1, 1)
# Features: every column except the identifier, the dropoff timestamp and the target.
X = train.drop(columns=["id", "dropoff_datetime", "trip_duration"]).copy()

# Util Functions

In [None]:
def distance(df):
    """Return a copy of ``df`` with a straight-line ``distance`` column.

    The distance is the Euclidean norm of the latitude and longitude
    differences between dropoff and pickup, expressed in the same units as
    the coordinate columns. The four coordinate columns are removed from the
    result; the input frame is left untouched.
    """
    coordinate_columns = [
        'dropoff_latitude', 'pickup_latitude', 'dropoff_longitude', 'pickup_longitude',
    ]
    delta_lat = df['dropoff_latitude'] - df['pickup_latitude']
    delta_lon = df['dropoff_longitude'] - df['pickup_longitude']
    straight_line = np.sqrt(delta_lat ** 2 + delta_lon ** 2)

    return df.assign(distance=straight_line).drop(columns=coordinate_columns)

In [None]:
def distance_v2(df):
    """Return a copy of ``df`` with an approximate ground ``distance`` column.

    Uses an equirectangular approximation: the north-south leg is
    proportional to the latitude difference, and the east-west leg is
    proportional to the longitude difference, shrunk by the cosine of the
    mean latitude. Coordinates are expected in decimal degrees.

    The scale constants 12430 and 24901 look like half the meridional and
    the full equatorial circumference of the Earth in miles, so the result
    is presumably in miles -- confirm before relying on the unit.

    The four coordinate columns are removed from the result; the input frame
    is left untouched.
    """
    df = df.copy()
    lat2 = df['dropoff_latitude']
    lat1 = df['pickup_latitude']
    lon2 = df['dropoff_longitude']
    lon1 = df['pickup_longitude']

    # North-south leg: fraction of a half meridian covered by the latitude change.
    dy = 12430 * (np.abs(lat2 - lat1) / 180)
    # East-west leg: must use the LONGITUDE change, and np.cos expects radians,
    # so the mean latitude is converted from degrees first.
    mean_lat_rad = np.radians((lat2 + lat1) / 2)
    dx = 24901 * (np.abs(lon2 - lon1) / 360) * np.cos(mean_lat_rad)
    df['distance'] = np.sqrt(np.power(dx, 2) + np.power(dy, 2))

    df = df.drop(['dropoff_latitude', 'pickup_latitude', 'dropoff_longitude', 'pickup_longitude'], axis=1)
    return df

In [None]:
def hour_of_the_day(df):
    """Return a copy of ``df`` where ``pickup_datetime`` is replaced by ``hour_day``.

    ``hour_day`` holds the hour component (0-23) of each pickup timestamp.
    The column may already be datetime typed or may still contain timestamp
    strings; ``pd.to_datetime`` normalises both before the vectorised
    ``.dt.hour`` accessor is applied. The input frame is left untouched.
    """
    df = df.copy()
    # Vectorised extraction instead of a Python-level loop over every row.
    df['hour_day'] = pd.to_datetime(df['pickup_datetime']).dt.hour

    df = df.drop(['pickup_datetime'], axis=1)
    return df

# Linear Regression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Derive the model inputs from the raw training split: trip distance,
# pickup hour, and the passenger count that is already present.
X_train = hour_of_the_day(distance_v2(X_train))
X_train = X_train.loc[:, ['hour_day', 'passenger_count', 'distance']]

# Ordinary least squares on the three engineered features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)


In [None]:
# Apply the same feature engineering to the held-out split.
X_test = hour_of_the_day(distance_v2(X_test))
X_test = X_test.loc[:, ['hour_day', 'passenger_count', 'distance']]

In [None]:
# Predictions on the rows the model was fitted to.
y_train_pred = lr_model.predict(X_train)

# Predictions on the held-out rows.
y_test_pred = lr_model.predict(X_test)

In [None]:
# Mean absolute error on the training split.
mae = mean_absolute_error(y_train, y_train_pred)

# Mean absolute error on the held-out split.
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f"mae: {mae}, test_mae: {test_mae}")

# Plot

In [None]:
# Put actual and predicted durations next to the test features, order the
# rows by the actual duration and renumber them so the index can serve as
# the x axis of the comparison plots.
pretty_plot = (
    X_test
    .assign(
        real=y_test.reshape(-1, 1),
        pred=y_test_pred.reshape(-1, 1),
    )
    .sort_values(by='real')
    .reset_index(drop=True)
)

In [None]:
import matplotlib.pyplot as plt

# Static comparison of actual vs predicted durations on the held-out split.
# pretty_plot is sorted by the actual value, so its index is the rank of
# each sample.
plt.figure(figsize=(20, 5))

plt.plot(pretty_plot.index, pretty_plot['real'], label="real")
plt.plot(pretty_plot.index, pretty_plot['pred'], label="pred")

# Label the figure so it can be read on its own.
plt.xlabel('test sample (sorted by real trip_duration)')
plt.ylabel('trip_duration')
plt.title('Linear regression: real vs predicted trip_duration')

plt.legend()

plt.show()

In [None]:
import plotly.graph_objects as go

# Interactive version of the real-vs-predicted comparison: one line per series.
fig = go.Figure()

sample_rank = pretty_plot.index
for series_name in ('real', 'pred'):
    fig.add_trace(
        go.Scatter(
            x=sample_rank,
            y=pretty_plot[series_name],
            mode='lines',
            name=series_name,
        )
    )

fig.show()

# Linear Regression Model (Using SGD)

In [None]:
from sklearn.model_selection import train_test_split

# Re-split from the raw X / y so this section starts from un-engineered
# features again (the previous section overwrote X_train and X_test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
from sklearn.linear_model import SGDRegressor

# Seeded so repeated runs give the same weights, matching the seeded split.
# Note: max_iter and tol only govern fit(); each partial_fit() call performs
# a single pass over the batch it is given.
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)

In [None]:
# Mini-batch training of the SGD regressor with a held-out check after
# every update.
BATCH_SIZE = 16      # rows passed to each partial_fit call
N_ITERATIONS = 10    # number of mini-batch updates to perform
FEATURE_COLUMNS = ['hour_day', 'passenger_count', 'distance']

# The held-out features are identical on every iteration, so build them once
# instead of recomputing them inside the loop.
X_test_small = hour_of_the_day(distance(X_test))[FEATURE_COLUMNS]
y_test_small = y_test.copy()

last_test_mae = -1

pointer = 0
for i in range(N_ITERATIONS):

    # Rows of the current mini-batch (positional slice).
    batch_start = pointer * BATCH_SIZE
    batch_stop = batch_start + BATCH_SIZE
    X_train_small = X_train.iloc[batch_start:batch_stop].copy()
    y_train_small = y_train[batch_start:batch_stop].copy()

    X_train_small = distance(X_train_small)
    X_train_small = hour_of_the_day(X_train_small)
    X_train_small = X_train_small[FEATURE_COLUMNS]

    sgd_model.partial_fit(X_train_small, y_train_small.ravel())

    mae = mean_absolute_error(y_train_small, sgd_model.predict(X_train_small))

    # Test
    test_mae = mean_absolute_error(y_test_small, sgd_model.predict(X_test_small))

    print(f"iteration: {i}, mae: {mae}, test_mae: {test_mae}")

    # Advance to the next mini-batch; previously the pointer never moved, so
    # every iteration re-trained on the same first 16 rows.
    pointer += 1
    # Wrap around once the next batch would run past the end of the training
    # data (uses the batch size, not an unrelated constant).
    if (pointer + 1) * BATCH_SIZE > len(X_train):
        pointer = 0

    # Stop early when the held-out error did not change at all.
    if last_test_mae == test_mae:
        break
    last_test_mae = test_mae

# Predictions

In [None]:
# Feature frame for the competition test set: everything except the identifier.
X_val = test.drop(columns=["id"]).copy()

In [None]:
# Apply the same feature engineering that the linear regression was trained with.
X_val = distance_v2(X_val)
X_val = hour_of_the_day(X_val)
X_val = X_val[['hour_day', 'passenger_count', 'distance']]

# The model was fitted on an (n, 1) target, so predict() returns an (n, 1)
# array; flatten it to one value per row. A linear model can also extrapolate
# below zero, which is not a meaningful duration, so negatives are clipped to 0
# before the values are truncated to integers.
val_predictions = lr_model.predict(X_val).ravel()
sample_submission['trip_duration'] = np.clip(val_predictions, 0, None).astype(int)

In [None]:
# Preview the submission frame after the predictions were filled in.
sample_submission.head()

In [None]:
# Write the submission without the index column.
# NOTE(review): this overwrites the sample_submission.csv extracted earlier in the working directory.
sample_submission.to_csv("sample_submission.csv", index=False)