In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv',nrows=1000000)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

# Data Cleaning

In [None]:
# Remove rows with missing values and the 'key' column, which is not used as a
# model feature. Reassigning (instead of inplace=True) and passing
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# fails with KeyError because 'key' has already been removed.
df = df.dropna().drop(columns='key', errors='ignore')

### Null Island
Null Island is the name used to refer to the point on the Earth's surface where the Prime Meridian and the Equator intersect, at zero degrees latitude and zero degrees longitude ( 0°N 0°E).

Delete rows where any pickup or dropoff longitude or latitude is equal to 0.

In [None]:
df[(df.pickup_longitude == 0) | (df.pickup_latitude == 0) | (df.dropoff_longitude == 0) | (df.dropoff_latitude == 0)]

In [None]:
# A coordinate of exactly 0 in any of the four columns marks the row as a
# "Null Island" record; collect those row labels and drop them.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
drop_i = df.index[df[coord_cols].eq(0).any(axis=1)]
df = df.drop(index=drop_i)

Delete rows whose fare amount is less than or equal to 0.

In [None]:
# Row labels of fares that are zero or negative, then remove those rows.
drop_i = df.index[df['fare_amount'].le(0)]
df = df.drop(index=drop_i)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.histplot(df.fare_amount)

In [None]:
df.fare_amount.mean()

Neglecting outliers: fares above the mean plus 5 standard deviations are treated as outliers.

In [None]:
df['fare_amount'].mean() + 5* df['fare_amount'].std()

In [None]:
df[df.fare_amount > 60]

In [None]:
# Fares above 60 are treated as outliers (the cap used throughout this
# notebook); gather their row labels and drop them.
drop_i = df.index[df['fare_amount'].gt(60)]
df = df.drop(index=drop_i)

In [None]:
df.shape

In [None]:
df.passenger_count.value_counts()

The maximum amount of passengers allowed in a yellow taxicab by law is four (4) in a four (4) passenger taxicab or five (5) passengers in a five (5) passenger taxicab, except that an additional passenger must be accepted if such passenger is under the age of seven (7) and is held on the lap of an adult passenger seated in the rear.

https://www1.nyc.gov/site/tlc/passengers/passenger-frequently-asked-questions.page

Deleting rows where passenger_count == 0 or greater than 4

In [None]:
df[(df.passenger_count == 0) | (df.passenger_count > 4)]

In [None]:
# Rows reporting zero passengers or more than four are discarded.
passengers = df['passenger_count']
drop_i = df.index[passengers.eq(0) | passengers.gt(4)]
df = df.drop(index=drop_i)

In [None]:
df.shape

## USE THE HAVERSINE FORMULA

The Haversine formula calculates the great-circle distance between two points. Start by converting the coordinates to radians and calculating the change in latitude and longitude, then input the result into the Haversine formula (implemented below). The implementation uses NumPy's trigonometric functions, which expect angles in radians.

In [None]:
def cal_dist(pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude):
    """Return the Haversine (great-circle) distance in kilometres.

    Parameters
    ----------
    pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude :
        Coordinates in decimal degrees. Scalars and array-likes (NumPy arrays,
        pandas Series) are both accepted because only NumPy ufuncs are used.

    Returns
    -------
    Distance in kilometres, with the same shape as the inputs, computed with
    a sphere radius of 6373.0 km.
    """
    # Every trigonometric call below expects radians, so convert the latitudes
    # once and reuse them. The earlier version fed latitudes in degrees to
    # np.cos, which distorted the result and could make the Haversine term
    # negative (-> NaN after the square root).
    pickup_lat_rad = np.deg2rad(pickup_latitude)
    dropoff_lat_rad = np.deg2rad(dropoff_latitude)
    dlon = np.deg2rad(dropoff_longitude) - np.deg2rad(pickup_longitude)
    dlat = dropoff_lat_rad - pickup_lat_rad
    pre_dist = (np.sin(dlat / 2)**2
                + np.cos(pickup_lat_rad) * np.cos(dropoff_lat_rad) * np.sin(dlon / 2)**2)
    # Floating-point rounding can push the term marginally outside [0, 1];
    # clamp it so neither square root below sees a negative argument.
    pre_dist = np.clip(pre_dist, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(pre_dist), np.sqrt(1 - pre_dist))
    distance = 6373.0 * c
    return distance

In [None]:
# cal_dist is built entirely from NumPy ufuncs, so it can take whole columns at
# once; this avoids a Python-level call per row (df.apply with axis=1) and
# yields the same per-row values.
df['distance'] = cal_dist(df['pickup_longitude'], df['pickup_latitude'],
                          df['dropoff_longitude'], df['dropoff_latitude'])

In [None]:
df.head(3)

In [None]:
df.info()

In [None]:
df['distance'] = np.int32(df['distance'])

In [None]:
df.distance.max(),df.distance.min(),df.distance.std()

In [None]:
df = df.drop(df[df.distance < 0].index)

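New York City's land area is about 783.8 km²
* so we use 783 as a generous upper bound and delete trips with a distance over 783 km (note that this compares an area in km² with a distance in km, so it is only a rough sanity cap)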

In [None]:
df = df.drop(df[df.distance > 783].index)

In [None]:
# skipping the distance less than 1 km
except_zero = df[df.distance != 0]
sns.scatterplot(y = 'fare_amount',x = 'distance',data=except_zero,hue='distance')
plt.show()

## Creating new features from timedate columns

Creating new features from the `pickup_datetime` column


In [None]:
def time_features(df,time_col):
    """Add calendar/time-of-day feature columns derived from ``time_col``.

    The frame is modified in place (and also returned): ``time_col`` is
    converted to datetime, and the columns ``day``, ``dayofweek``, ``week``
    (ISO week number), ``hour``, ``minute`` and ``month`` are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``time_col``.
    time_col : str
        Name of the column holding timestamps (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns attached.
    """
    # Parse once and reuse the .dt accessor instead of re-parsing per feature.
    df[time_col] = pd.to_datetime(df[time_col])
    stamps = df[time_col].dt
    df['day'] = stamps.day
    df['dayofweek'] = stamps.dayofweek
    # isocalendar() yields the nullable UInt32 extension dtype. Convert it to a
    # plain int32 so every frame passed through this function (train and test)
    # ends up with the same NumPy dtype. The cast is skipped when missing
    # timestamps produced <NA> weeks, which a plain integer cannot hold.
    week = stamps.isocalendar().week
    if not week.isna().any():
        week = week.astype('int32')
    df['week'] = week
    df['hour'] = stamps.hour
    df['minute'] = stamps.minute
    df['month'] = stamps.month
    return df

In [None]:
df = time_features(df,'pickup_datetime')
df.head(5)

In [None]:
df.info()

In [None]:
df.week = np.int32(df.week)

creating bins for minute

In [None]:
def min_bin(min):
    """Map a minute-of-hour value to its quarter-hour bucket.

    Buckets: 0-15 -> 0, 16-30 -> 1, 31-45 -> 2, 46 and above -> 3.
    A value that is not >= 0 (negative, NaN) matches no bucket and yields None.
    """
    # Thresholds are checked from the highest floor down; first match wins.
    for floor, bucket in ((46, 3), (31, 2), (16, 1), (0, 0)):
        if min >= floor:
            return bucket
    return None

In [None]:
# Bucket every minute value into its quarter-hour bin.
df['min_bin'] = df['minute'].apply(min_bin)

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
df.month = np.int32(df.month)

In [None]:
df.month.value_counts().sort_index()

In [None]:
# Total of every numeric column per month, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; recent
# pandas versions raise instead of silently skipping columns that cannot be summed.
tp = df.groupby('month').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(12)

* April, May, June are the highest grossing months
* maybe because of tourist

In [None]:
# Total of every numeric column per pickup hour, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; recent
# pandas versions raise instead of silently skipping columns that cannot be summed.
tp = df.groupby('hour').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(24)

If I wanted to maximize earnings as a taxi (e.g. Uber) fleet owner, I would put the most cars on the road during the peak hours, between 6 pm and 10 pm.

In [None]:
# Total of every numeric column per day of week, highest total fare first.
# numeric_only=True keeps the datetime column out of the aggregation; recent
# pandas versions raise instead of silently skipping columns that cannot be summed.
tp = df.groupby('dayofweek').sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(7)

* Friday, Saturday and Sunday have the lowest earnings
* Wednesday, Tuesday and Thursday have the highest earnings

In [None]:
# Total of every numeric column per exact pickup coordinate pair, highest
# total fare first. numeric_only=True keeps the datetime column out of the
# aggregation; recent pandas versions raise instead of silently skipping it.
tp = df.groupby(['pickup_longitude','pickup_latitude']).sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(5)

* highest earning and also passenger wise place is "Keith Mitchell Forest, Squantuck Road, Seymour"
* second highest "Maple Court, East 122nd Street"

https://www.gps-coordinates.net/map/state/NY

In [None]:
# Total of every numeric column per exact dropoff coordinate pair, highest
# total fare first. numeric_only=True keeps the datetime column out of the
# aggregation; recent pandas versions raise instead of silently skipping it.
tp = df.groupby(['dropoff_longitude','dropoff_latitude']).sum(numeric_only=True).sort_values('fare_amount',ascending=False)
tp.head(5)

The dropoff locations also show the same places.

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
# Correlate only the engineered numeric features. The raw coordinates and the
# un-binned minute are left out as before; the datetime column is now dropped
# explicitly as well, since it is not a numeric feature and corr() is not
# guaranteed to skip it silently across pandas versions.
corr_input = df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','minute','pickup_datetime'],axis=1)
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr_input.corr(),cmap='YlGnBu',linewidths=.5,annot=True,ax=ax)

# Model

In [None]:
X = df.drop(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude','minute'],axis=1)
y = df.fare_amount

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least squares on the engineered features.
reg = LinearRegression().fit(X_train, y_train)
# A bare expression in the middle of a cell is never displayed, so the training
# R^2 is printed explicitly instead of being computed and thrown away.
print('Linear regression, training R^2 is: ',reg.score(X_train, y_train))
y_pred = reg.predict(X_test)
print('Linear regression, Root mean square is: ',np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so the fitted tree (and the RMSE below) is reproducible across
# runs; 42 matches the seed already used for the train/test split.
reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
# A bare expression in the middle of a cell is never displayed, so the training
# R^2 is printed explicitly instead of being computed and thrown away.
print('DecisionTreeRegressor, training R^2 is: ',reg.score(X_train, y_train))
y_pred = reg.predict(X_test)
print('DecisionTreeRegressor, Root mean square is: ',np.sqrt(mean_squared_error(y_test,y_pred)))

In [None]:
import lightgbm as lgb
# Gradient-boosted trees, fitted on plain NumPy arrays (.values).
gbm = lgb.LGBMRegressor().fit(X_train.values, y_train.values)
# A bare expression in the middle of a cell is never displayed, so the training
# R^2 is printed explicitly instead of being computed and thrown away.
print('LGBMRegressor, training R^2 is: ',gbm.score(X_train.values, y_train.values))
y_pred = gbm.predict(X_test.values)
# The label previously said 'RandomForestRegressor', but this model is LightGBM.
print('LGBMRegressor, Root mean square is: ',np.sqrt(mean_squared_error(y_test,y_pred)))

# Submission

In [None]:
test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')

In [None]:
test.head(5)

In [None]:
# Build the same engineered features for the test frame as for training.
# cal_dist only uses NumPy ufuncs, so whole columns can be passed at once
# instead of calling it row by row through apply(axis=1).
test['distance'] = cal_dist(test['pickup_longitude'], test['pickup_latitude'],
                            test['dropoff_longitude'], test['dropoff_latitude'])
test['distance'] = np.int32(test['distance'])
test = time_features(test,'pickup_datetime')
# Mirror the int32 casts applied to the training frame's 'week' and 'month'
# columns so both frames feed the model identical dtypes (an extension-dtype
# 'week' column may otherwise turn `.values` into an object array).
test['week'] = test['week'].astype('int32')
test['month'] = test['month'].astype('int32')
test['min_bin'] = test.minute.apply(lambda x:min_bin(x))

In [None]:
test.isnull().sum()

In [None]:
test.head(5)

In [None]:
X.columns

In [None]:
# The model is fed bare arrays (.values), so features are matched purely by
# position. Selecting through X.columns guarantees the test features are the
# training features, in the training order, and raises a KeyError if an
# engineered column is missing, rather than depending on two drop lists
# staying in sync.
pred = test[X.columns]

In [None]:
fare_amount = gbm.predict(pred.values)

In [None]:
# Two-column submission: the test row identifier and its predicted fare.
submission = pd.DataFrame({'key': test['key'], 'fare_amount': fare_amount})

In [None]:
submission.to_csv('submission.csv',index=False)

Submit this file

# I hope this notebook is helpful to you!!!