In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from math import radians, cos, sin, sqrt, asin

**Reading Data**

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv')
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(5)

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, memory usage).
df.info()

In [None]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isna().sum()

**Changing Data Types**

In [None]:
# Convert pickup_datetime to a pandas datetime dtype so the .dt accessor can be
# used for feature extraction.
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])

**Creating New Features**

In [None]:
# Derive calendar features from the pickup timestamp; each new column takes the
# name of the .dt attribute it is read from.
df = df.assign(**{
    part: getattr(df['pickup_datetime'].dt, part)
    for part in ('hour', 'day', 'month', 'year', 'dayofweek')
})

In [None]:
# function to calculate the travel distance from the longitudes and latitudes
def distance_transform(longitude1, latitude1, longitude2, latitude2):
    """Haversine (great-circle) distance in kilometres between paired points.

    Parameters
    ----------
    longitude1, latitude1 : array-like of float
        Coordinates of the first point of each pair, in decimal degrees.
    longitude2, latitude2 : array-like of float
        Coordinates of the second point of each pair, in decimal degrees.
        All four inputs are expected to have the same length.

    Returns
    -------
    list of float
        One distance per pair, computed with an Earth radius of 6371 km.
        A pair containing NaN yields NaN; empty inputs yield an empty list.

    Raises
    ------
    ValueError
        If the inputs have lengths that cannot be combined element-wise.
    """
    earth_radius_km = 6371

    # np.asarray discards any pandas index, so values are always paired by
    # position (label-based lookup on a filtered Series would misalign or fail).
    long1, lati1, long2, lati2 = (
        np.radians(np.atleast_1d(np.asarray(values, dtype=float)))
        for values in (longitude1, latitude1, longitude2, latitude2)
    )

    dist_long = long2 - long1
    dist_lati = lati2 - lati1
    a = np.sin(dist_lati / 2) ** 2 + np.cos(lati1) * np.cos(lati2) * np.sin(dist_long / 2) ** 2

    # Mathematically a lies in [0, 1]; clamp it so floating-point rounding can
    # never push sqrt/arcsin outside their domain. NaN passes through unchanged.
    a = np.clip(a, 0.0, 1.0)

    travel_dist = 2 * earth_radius_km * np.arcsin(np.sqrt(a))
    return travel_dist.tolist()

In [None]:
# Add the trip distance (km) computed from the pickup and drop-off coordinates.

df['travel_dist_km'] = distance_transform(
    *(df[name].to_numpy() for name in ('pickup_longitude',
                                       'pickup_latitude',
                                       'dropoff_longitude',
                                       'dropoff_latitude'))
)
df

**Treating Outliers**

In [None]:
# One box plot per column, starting from the fourth column of df.
for col_name in df.columns[3:]:
    sns.boxplot(x=df[col_name], data=df, palette='rainbow')
    plt.show()

In [None]:
# Default column position(s) capped by Outlier().
# NOTE(review): position 7 is presumably passenger_count - confirm against df.columns.
l=[7]
def Outlier(data, columns=None):
    """Cap outliers of selected numeric columns at the Tukey IQR fences.

    For each selected column, values below Q1 - 1.5*IQR are raised to that
    fence and values above Q3 + 1.5*IQR are lowered to it. Missing values are
    ignored when computing the quartiles and are left missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to process. It is modified in place.
    columns : sequence of int, optional
        Positional indices of the columns to cap. Defaults to the
        module-level list ``l``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining. Capped columns may be
        upcast to float because the fences are floats.
    """
    positions = l if columns is None else columns
    for column in data.columns[list(positions)]:
        series = data[column]
        # Quartiles are only meaningful for real-valued data; skip text, bool
        # and completely empty columns instead of failing on them.
        if (not pd.api.types.is_numeric_dtype(series)
                or pd.api.types.is_bool_dtype(series)
                or not series.notna().any()):
            continue
        # nanpercentile: a single NaN must not turn both fences into NaN,
        # which would silently disable the capping.
        Q1, Q3 = np.nanpercentile(series.to_numpy(dtype=float, na_value=np.nan), [25, 75])
        IQR = Q3 - Q1
        lower = Q1 - (1.5 * IQR)
        upper = Q3 + (1.5 * IQR)
        # Replace the whole column so an integer column can become float
        # without an incompatible-dtype assignment.
        data[column] = series.clip(lower=lower, upper=upper)
    return data
 
# Outlier() writes the capped values back into the frame it receives and returns
# that same object, so df2 and df refer to the same data from here on.
df2=Outlier(df)
df2.columns  # not the cell's last expression, so Jupyter's default display shows nothing for it
df2.head()

In [None]:
# Box plots of the same columns after outlier capping (fourth column onward of df2).
for col_name in df2.columns[3:]:
    sns.boxplot(x=df2[col_name], data=df2, palette='rainbow')
    plt.show()

In [None]:
# Correlation of each numeric column with the target. Non-numeric columns
# (such as the datetime column) are excluded explicitly, because
# DataFrame.corr() no longer drops them silently on pandas >= 2.0 and raises.
df2.select_dtypes(include='number').corr()['fare_amount']

In [None]:
# Histogram of the target variable (30 bins).

df2['fare_amount'].hist(bins=30, alpha=0.5)
plt.show()

In [None]:
# heatmap

# Correlate numeric columns only: DataFrame.corr() raises on pandas >= 2.0 when
# the frame still contains non-numeric columns (such as the datetime column).
corr = df2.select_dtypes(include='number').corr()
plt.figure(figsize=(14,14))
sns.heatmap(corr, annot=True, fmt= '.2f',annot_kws={'size': 15}, cmap= 'coolwarm')
plt.show()
print(corr)

**Splitting Data**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the fare; features are every remaining column except the key and
# the raw pickup timestamp.
y = df2['fare_amount']
X = df2.drop(columns=['fare_amount', 'key', 'pickup_datetime'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

**Building A Model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the forest's seed (same value as the train/test split) so that bootstrap
# sampling, and therefore every metric reported below, is reproducible when the
# notebook is re-run from a fresh kernel.
clf= RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_leaf = 12,min_samples_split=15,
                           random_state=3)

In [None]:
#Scaling the data

from sklearn.preprocessing import MinMaxScaler

min_max=MinMaxScaler()

# Fit the scaler on the training split only, so no test-set statistics are used.
X_train_transformed=min_max.fit_transform(X_train)

# Apply the scaling learned from the training split to the hold-out split.
X_test_transformed=min_max.transform(X_test)

In [None]:
# Train the model on the scaled training features.
clf.fit(X_train_transformed,y_train)

In [None]:
# Predictions for the hold-out (test) split.
pred=clf.predict(X_test_transformed)
pred

In [None]:
# Predictions for the training split, kept so train and test scores can be compared.
train_pred=clf.predict(X_train_transformed)
train_pred

**Evaluating The Model**

In [None]:
# R2 - the first printed line is the test split, the second the training split.

from sklearn.metrics import r2_score
for actual, predicted in ((y_test, pred), (y_train, train_pred)):
    print('R^2:', r2_score(actual, predicted))

In [None]:
# MSE - the first printed line is the test split, the second the training split.

from sklearn.metrics import mean_squared_error
for actual, predicted in ((y_test, pred), (y_train, train_pred)):
    print('MSE:', mean_squared_error(actual, predicted))

In [None]:
# RMSE - the first printed line is the test split, the second the training split.

for actual, predicted in ((y_test, pred), (y_train, train_pred)):
    print('RMSE:', np.sqrt(mean_squared_error(actual, predicted)))

In [None]:
# MAE - the first printed line is the test split, the second the training split.

from sklearn.metrics import mean_absolute_error
for actual, predicted in ((y_test, pred), (y_train, train_pred)):
    print('MAE:', mean_absolute_error(actual, predicted))