In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Library
import matplotlib.pyplot as plt
import seaborn as sns
import math
# Global plotting defaults for the figures drawn later in the notebook.
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 12
# Set the default figure size through rcParams: a bare plt.figure(figsize=...)
# here would only create one empty figure and would not size later plots.
plt.rcParams['figure.figsize'] = (14, 7)
# Ten-colour 'Paired' palette reused by the count plots below.
palette = sns.color_palette('Paired', 10)

# map
import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime



In [None]:
#Reading File and see the data
# Only the first 1,000,000 rows of the training file are loaded.
df = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/train.csv", nrows=1000000)
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to be visible.
print(df.shape)
df.head()



In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded training rows.
df.info()

In [None]:
'''
The points we should notice:
    1)The null should be removed
    2)The duplicates should be removed
    3)The number of passengers is important 
    4)The distance is important
    5)The season is important
    6)The month is important
    7)The date is important
    8)The hour is important
    9)The location of pick up is important
    10)The location of drop off is important
    11)Unusable columns should be removed
    12)Fare amount should not be negative or zero
    13)Removing outliers in fare column
    14)The No. of passengers should not be more than 4 and should not be zero
    15)The distance should not be 0 or more than 783 in NYC
    16) Min fare is 2.5$  https://www.introducingnewyork.com/taxis
    17)Peak hour supplement, night supplement, city tax and an hour's wait money can be considered
   
'''


In [None]:
# Percentage of missing values per column, largest first.
null_counts = df.isnull().sum().sort_values(ascending=False)
100 * null_counts / len(df)

In [None]:
#Removing useless columns
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so the cell can be re-run without raising KeyError.
df = df.drop(columns=['key'], errors='ignore')

In [None]:
#Removing Nulls
# Discard every row that has at least one missing value, then show the
# remaining per-column null counts as a check.
df = df.dropna()
df.isnull().sum()

In [None]:
#Removing Duplicates
# Count rows before and after de-duplication and report the difference.
size_before_Removing = df.shape[0]
df = df.drop_duplicates()
size_after_Removing = df.shape[0]
print(f"{size_before_Removing - size_after_Removing} duplicates were removed.")

In [None]:
def plot_dist(series=None, title="Fare Distribution"):
    """Plot a density histogram with a KDE overlay for a numeric series.

    Parameters
    ----------
    series : pandas.Series, optional
        Values to plot. When omitted, ``df["fare_amount"]`` is looked up at
        call time. A ``df[...]`` default in the signature would be evaluated
        once, when the function is defined, and would keep pointing at the
        unfiltered data after ``df`` is reassigned.
    title : str
        Title drawn above the plot.
    """
    if series is None:
        series = df["fare_amount"]
    sns.histplot(series, kde=True, stat='density', discrete=True)
    sns.despine()
    plt.title(title)
    plt.show()
plot_dist()

In [None]:
#deleting  longitude or latitude equal to 0
# A row is dropped when any of its four coordinates is exactly 0.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
has_zero_coord = df[coord_cols].eq(0).any(axis=1)
drop_i = df.loc[has_zero_coord].index
df = df.drop(index=drop_i)

In [None]:
# Keep fares in the closed interval [2.5, 60], then re-plot the distribution.
fare_in_range = (df.fare_amount >= 2.5) & (df.fare_amount <= 60)
df = df[fare_in_range]
plot_dist(df.fare_amount)

In [None]:
#deleting rows with fare price less or equal to 2.5$   #Minimum fare: US$ 2.50.
# Rows whose fare equals exactly 2.5 are removed as well.
at_or_below_minimum = df.fare_amount.le(2.5)
drop_i = df.loc[at_or_below_minimum].index
df = df.drop(index=drop_i)

In [None]:
'''
df['fare-bin'] = pd.cut(df['fare_amount'], bins = list(range(2, 60, 5)), include_lowest=True).astype('str')

# Uppermost bin
df['fare-bin'] = df['fare-bin'].replace(np.nan, '[45+]')

# apply this to clean up the first bin's label
df['fare-bin'] = df['fare-bin'].apply(lambda x: x.replace('-0.001', '0'))

# sort by fare the correct look in the chart
df = df.sort_values(by='fare_amount')

sns.catplot(x="fare-bin", kind="count", palette=palette, data=df, height=5, aspect=3);
sns.despine()
plt.show()
'''

In [None]:
#print(round(df['fare_amount'].mean() + 5* df['fare_amount'].std(),2))

In [None]:
#Neglecting outliers: 5*standard deviation
# Show the rows whose fare exceeds mean + 5 * std (cutoff rounded to 2 decimals).
fare_mean = df['fare_amount'].mean()
fare_std = df['fare_amount'].std()
outlier_cutoff = round(fare_mean + 5 * fare_std, 2)
df[df.fare_amount > outlier_cutoff]




In [None]:
# Remove the rows whose fare exceeds mean + 5 * std (cutoff rounded to 2 decimals).
outlier_cutoff = round(df['fare_amount'].mean() + 5 * df['fare_amount'].std(), 2)
is_outlier = df.fare_amount.gt(outlier_cutoff)
drop_i = df.loc[is_outlier].index
df = df.drop(index=drop_i)

In [None]:
# Only the last expression of a cell is rendered, so the summary statistics
# have to be printed explicitly to be visible.
print(df.passenger_count.describe())
# Drop rides recorded with zero passengers or with more than five.
drop_i = df[(df.passenger_count == 0) | (df.passenger_count > 5)].index
df = df.drop(drop_i)

#The maximum amount of passengers allowed in a yellow taxicab by law is four (4) in a four (4) passenger taxicab 
#or five (5) passengers in a five (5) passenger taxicab, 
#except that an additional passenger must be accepted if such passenger is under the age of seven (7) and is held on the lap of an adult passenger seated in the rear.

In [None]:
#Find boundaries from test set and remove outliers from training set
# Print the minimum and maximum of every coordinate column in the test file.
data_test = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/test.csv")
coordinate_columns = ("pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude")
for col in coordinate_columns:
    lowest, highest = data_test[col].min(), data_test[col].max()
    print(col, lowest, highest)

In [None]:
# Keep only rides whose pickup AND dropoff fall inside one shared bounding box.
# The dropoff longitude previously used -74 as its lower bound while the pickup
# used -74.3, which discarded every dropoff west of -74.0; both now share the
# same limits.
LAT_BOUNDS = (40, 42)
LON_BOUNDS = (-74.3, -72.9)
coordinate_bounds = {
    "pickup_latitude": LAT_BOUNDS,
    "pickup_longitude": LON_BOUNDS,
    "dropoff_latitude": LAT_BOUNDS,
    "dropoff_longitude": LON_BOUNDS,
}
for col, (low, high) in coordinate_bounds.items():
    # between() is inclusive on both ends.
    df = df[df[col].between(left=low, right=high)]

# Base map; center_location is reused by the time-animated map further down.
center_location = [40.758896, -73.985130]
m = folium.Map(location=center_location, zoom_start=11, control_scale=True)

# Group the first 10,000 rows by pickup coordinates so that each distinct
# (latitude, longitude) pair yields one [lat, lon] entry for the heat map.
pickup_cols = ['pickup_latitude', 'pickup_longitude']
pickup_sample = df.head(10000)[pickup_cols]
heatmap_data = pickup_sample.groupby(pickup_cols).sum().reset_index().values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
pickup_layer = HeatMap(data=heatmap_data, radius=5, gradient=gradient, max_zoom=13)
pickup_layer.add_to(m)
m

In [None]:
#HAVERSINE FORMULA
def haversine_distance(pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude):
    """Great-circle distance between pickup and dropoff points.

    All four arguments are coordinates in decimal degrees; scalars, NumPy
    arrays and pandas Series are accepted (the computation is element-wise).

    Returns the distance on a sphere of radius 6373.0, i.e. in kilometres
    when that radius is read as the Earth's radius in km.
    """
    # Convert every coordinate to radians first: np.cos / np.sin expect radians,
    # so the latitudes must not be fed to np.cos in degrees.
    lon1, lat1, lon2, lat2 = (
        np.deg2rad(value)
        for value in (pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude)
    )
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    pre_dist = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(pre_dist), np.sqrt(1 - pre_dist))
    distance = 6373.0 * c
    return distance

In [None]:
# haversine_distance is built from NumPy ufuncs, so it can take whole columns
# at once; this avoids a Python-level call per row via DataFrame.apply(axis=1).
df['distance'] = haversine_distance(
    df['pickup_longitude'],
    df['pickup_latitude'],
    df['dropoff_longitude'],
    df['dropoff_latitude'],
)



In [None]:
# Truncate the distance to a whole number (int32), then drop implausible rows.
df['distance'] = df['distance'].to_numpy().astype(np.int32)
#Removing negative distances!
negative_rows = df.loc[df['distance'].lt(0)].index
df = df.drop(index=negative_rows)
#Removing none related distance
# NOTE(review): 783 appears to be NYC's area figure reused as a distance cap —
# confirm that this is the intended threshold.
too_far_rows = df.loc[df['distance'].gt(783)].index
df = df.drop(index=too_far_rows)

In [None]:
#new features from timedate
def time_features(df,time_col):
    """Parse ``time_col`` as datetimes and add calendar/clock feature columns.

    The frame is modified in place and also returned. ``time_col`` is replaced
    by its parsed datetime version, and the columns ``day``, ``dayofweek``,
    ``week`` (ISO week number), ``hour``, ``minute`` and ``month`` are added in
    that order.
    """
    stamps = pd.to_datetime(df[time_col])
    df[time_col] = stamps
    parts = stamps.dt
    df['day'] = parts.day
    df['dayofweek'] = parts.dayofweek
    df['week'] = parts.isocalendar().week
    df['hour'] = parts.hour
    df['minute'] = parts.minute
    df['month'] = parts.month
    return df

In [None]:
# Derive the calendar/clock columns from the pickup timestamp and preview them.
df = time_features(df, time_col='pickup_datetime')
df.head()

In [None]:
# One list of [lat, lon] pickup pairs per hour value. The hour values come from
# the whole frame, while the points are taken from the first 10,000 rows only.
pickup_cols = ['pickup_latitude', 'pickup_longitude']
__df__ = df.head(10000)
heatmap_data_by_hour = [
    __df__.loc[__df__.hour == hour, pickup_cols]
    .groupby(pickup_cols)
    .sum()
    .reset_index()
    .values
    .tolist()
    for hour in df.hour.sort_values().unique()
]

In [None]:
# Second map: the per-hour pickup lists are rendered as a time-animated heat map.
m2 = folium.Map(location=center_location, zoom_start=11, control_scale=True)
hourly_layer = HeatMapWithTime(
    heatmap_data_by_hour,
    radius=5,
    gradient=gradient,
    min_opacity=0.5,
    max_opacity=0.8,
    use_local_extrema=False,
)
hourly_layer.add_to(m2)
m2

In [None]:
#creating bins for minute

def min_bin(min):
    """Map a minute value to its bin index.

    0-15 -> 0, 16-30 -> 1, 31-45 -> 2, 46 and above -> 3.
    Values below 0 match no bin and yield None.
    """
    # Lower bounds are checked from the highest bin down; the first match wins.
    for lower_bound, label in ((46, 3), (31, 2), (16, 1), (0, 0)):
        if min >= lower_bound:
            return label
    return None

In [None]:
# Bucket each pickup minute into its bin and preview the result.
df['min_bin'] = df['minute'].apply(min_bin)
df.head()


In [None]:
# Print column dtypes and non-null counts again, now including the derived columns.
df.info()

In [None]:
# Store the month column as 32-bit integers.
df['month'] = df['month'].astype(np.int32)

In [None]:
# Number of rows per month, ordered by month number.
df.month.value_counts().sort_index()

In [None]:
# Column totals per month, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; on
# pandas 2.x a plain sum() raises TypeError for it instead of skipping it.
tp = df.groupby('month').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(12)
#April, May, June are the highest grossing months(maybe because of tourist)

In [None]:
# Column totals per pickup hour, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; on
# pandas 2.x a plain sum() raises TypeError for it instead of skipping it.
tp = df.groupby('hour').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(24)
# peak hours between 6 pm to 22 pm

In [None]:
# Count of rides for each pickup hour.
hour_grid = sns.catplot(data=df, x="hour", kind="count", palette=palette, height=5, aspect=3)
sns.despine()
plt.title('Hour of Day')
plt.show()

In [None]:
# Column totals per day of week, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; on
# pandas 2.x a plain sum() raises TypeError for it instead of skipping it.
tp = df.groupby('dayofweek').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(7)
#friday,saturday and sunday has minimum earning(wed,tue,thu has highest earning)

In [None]:
# Count of rides per day of week. The plot is titled 'Day of Week', so it has
# to use the `dayofweek` column; `day` holds the day of the month.
sns.catplot(x="dayofweek", kind="count", palette=palette, data=df, height=5, aspect=3);
sns.despine()
plt.title('Day of Week');
plt.show()

In [None]:
# Column totals per exact pickup coordinate pair, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; on
# pandas 2.x a plain sum() raises TypeError for it instead of skipping it.
tp = df.groupby(['pickup_longitude','pickup_latitude']).sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(5)
#highest earning and also passenger wise place is "Keith Mitchell Forest, Squantuck Road, Seymour"
#second highest "Maple Court, East 122nd Street"
#https://www.gps-coordinates.net/map/state/NY

In [None]:
# Column totals per exact dropoff coordinate pair, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; on
# pandas 2.x a plain sum() raises TypeError for it instead of skipping it.
tp = df.groupby(['dropoff_longitude','dropoff_latitude']).sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(5)
#dropoff also showing same places



In [None]:
# Correlation heat map of the non-coordinate columns.
fig, ax = plt.subplots()
fig.set_size_inches(12, 10)
corr_input = df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','minute'],axis=1)
# Only numeric columns can be correlated: the frame still holds the datetime
# column, which makes corr() fail on pandas 2.x if it is left in.
corr_matrix = corr_input.select_dtypes(include='number').corr()
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr_matrix, cmap='YlGnBu', linewidths=.5, annot=True, ax=ax);

In [None]:
# Feature matrix: everything except the target, the raw timestamp, the raw
# coordinates and the un-binned minute. Target: the fare.
non_feature_columns = [
    'fare_amount', 'pickup_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'minute',
]
X = df.drop(columns=non_feature_columns)
y = df['fare_amount']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares baseline and report its RMSE on the held-out split.
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_train, y_train)
y_pred = reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Linear regression, Root mean square is: ', rmse)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a decision tree and report its RMSE on the held-out split.
# random_state is fixed (same seed as the train/test split) so the tree, and
# therefore the reported error, is reproducible between runs.
reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
reg.score(X_train, y_train)
y_pred = reg.predict(X_test)
print('DecisionTreeRegressor, Root mean square is: ',np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
import lightgbm as lgb
# Fit a LightGBM regressor on the raw arrays and report its RMSE on the
# held-out split. This model is the one used for the submission below.
gbm = lgb.LGBMRegressor().fit(X_train.values, y_train.values)
gbm.score(X_train.values, y_train.values)
y_pred = gbm.predict(X_test.values)
# The label names the model that was actually trained (an LGBMRegressor, not a
# random forest).
print('LGBMRegressor, Root mean square is: ',np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
# Load the competition test set. The absolute /kaggle/input path is the one used
# everywhere else in this notebook; the relative '../input' form only resolves
# when the working directory is a sibling of the input directory.
test = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv')


In [None]:
# Preview the first five rows of the test set.
test.head()

In [None]:
# Build the same derived columns on the test set as on the training data.
# haversine_distance works on whole columns, so no row-wise apply is needed.
test['distance'] = haversine_distance(
    test['pickup_longitude'],
    test['pickup_latitude'],
    test['dropoff_longitude'],
    test['dropoff_latitude'],
)
# Same int32 truncation as applied to the training distances.
test['distance'] = np.int32(test['distance'])
test = time_features(test,'pickup_datetime')
# min_bin can be passed directly; wrapping it in a lambda adds nothing.
test['min_bin'] = test.minute.apply(min_bin)
test.isnull().sum()

In [None]:
# Drop the identifier, the raw timestamp, the raw coordinates and the un-binned
# minute so that only the model's feature columns remain.
non_feature_columns = [
    'key', 'pickup_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'minute',
]
pred = test.drop(columns=non_feature_columns)

In [None]:
# Predict fares for the test rows with the LightGBM model, fed as a plain array.
fare_amount = gbm.predict(pred.to_numpy())

In [None]:
# Two-column submission frame: the row key and its predicted fare.
submission = pd.DataFrame({
    'key': test['key'],
    'fare_amount': fare_amount,
})

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)