<a href="https://colab.research.google.com/github/simritha14/JOD-DS/blob/main/ENSEMBLE_P5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Learning
# Project: Building Basic predictive models over the NYC Taxi Trip dataset

Models to build :


*  Knn Model
*  Linear Model with Regularization
*  Random Forest Model
*  Gradient Boosting Decision Tree

#### Importing Libraries

In [None]:
#importing libraries 

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import r2_score
warnings.filterwarnings('ignore')

#### Importing Dataset

In [None]:
# Read the raw NYC taxi trip CSV into a DataFrame.
# NOTE(review): the path sits under /content/drive, so it presumably needs
# Google Drive to be mounted in Colab first; no mount step is visible in this
# notebook — confirm before a fresh "Run All".
path = "/content/drive/MyDrive/nyc_taxi_trip_duration.csv"
dat = pd.read_csv(path)

In [None]:
# (number of rows, number of columns) of the raw frame
dat.shape

(729322, 11)

## Cleaning the Dataset
To use the data we need to first clean it

In [None]:
# Column dtypes as loaded, before any typecasting
dat.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [None]:
# Preview the first rows of the raw data
dat.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


### Typecasting the Variables

In [None]:
# Store vendor_id as a pandas categorical instead of int64
dat['vendor_id'] = dat['vendor_id'].astype('category')

- converting vendor_id to a category as it is a categorical variable

In [None]:
# Parse both timestamp columns (loaded as strings) into datetime64 values
for timestamp_column in ('pickup_datetime', 'dropoff_datetime'):
    dat[timestamp_column] = pd.to_datetime(dat[timestamp_column])

Typecasting pickup_datetime and dropoff_datetime to the datetime datatype so that the following details can be extracted:

*   pickup hour
*   pickup day
*   pickup month
*   dropoff hour
*   dropoff day
*   dropoff month



In [None]:
# Derive hour, day-of-week and month features from both timestamps using the
# pandas `.dt` accessor. Columns are created in the order
# pickuphour, pickupday, pickupmonth, dropoffhour, dropoffday, dropoffmonth.
# NOTE(review): the dropoff* features come from dropoff_datetime, which is only
# known once a trip has ended; combined with the pickup features they largely
# determine trip_duration, so this looks like target leakage — confirm before
# trusting the model scores below.
for prefix in ('pickup', 'dropoff'):
    timestamps = dat[f'{prefix}_datetime'].dt
    dat[f'{prefix}hour'] = timestamps.hour
    dat[f'{prefix}day'] = timestamps.dayofweek
    dat[f'{prefix}month'] = timestamps.month

In [None]:
# Map the store_and_fwd_flag strings to integer codes in a new column;
# the original string column is left in place.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(dat['store_and_fwd_flag'])
dat['store_and_fwd_encoded'] = label_encoder.transform(dat['store_and_fwd_flag'])

- Encoding the store_and_fwd_flag variable to convert it into a numerical variable

### Missing Values

In [None]:
# Count the missing entries in every column
dat.isna().sum()

id                       0
vendor_id                0
pickup_datetime          0
dropoff_datetime         0
passenger_count          0
pickup_longitude         0
pickup_latitude          0
dropoff_longitude        0
dropoff_latitude         0
store_and_fwd_flag       0
trip_duration            0
pickuphour               0
pickupday                0
pickupmonth              0
dropoffhour              0
dropoffday               0
dropoffmonth             0
store_and_fwd_encoded    0
dtype: int64

* There are no missing values in our dataset so we can move forward with the modeling process

In [None]:
# Build the modelling frame: drop the trip id, the raw timestamps (already
# expanded into hour/day/month features) and the four coordinate columns.
unused_columns = [
    'pickup_datetime', 'dropoff_datetime', 'id',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
data = dat.drop(columns=unused_columns)

In [None]:
# Preview the columns that remain after dropping
data.head()

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,trip_duration,pickuphour,pickupday,pickupmonth,dropoffhour,dropoffday,dropoffmonth,store_and_fwd_encoded
0,2,1,N,400,16,0,2,16,0,2,0
1,1,2,N,1100,23,4,3,23,4,3,0
2,2,2,N,1635,17,6,2,18,6,2,0
3,2,6,N,1141,9,1,1,10,1,1,0
4,1,1,N,848,6,2,2,6,2,2,0


* This is the final set of features we are going to build the models upon

### Segregating variables: Independent and Dependent Variables

In [None]:
# One-hot encode the remaining categorical/string columns (vendor_id and
# store_and_fwd_flag) into dummy columns.
# NOTE(review): store_and_fwd_flag is still present in `data` here, so it gets
# dummy columns in addition to the existing store_and_fwd_encoded column —
# the same information is then encoded three times; consider dropping it first.
data=pd.get_dummies(data)

In [None]:
# Split the frame into the feature matrix (x) and the target vector (y),
# then show both shapes as a sanity check.
target_column = 'trip_duration'
y = data[target_column]
x = data.drop(columns=[target_column])
x.shape, y.shape

((729322, 12), (729322,))

### Scaling the data (Using MinMax Scaler)

In [None]:
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test-set minima/maxima influence the scaling —
# consider fitting on the training split only.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Wrap the scaled array back into a DataFrame, reusing the original feature
# names; this rebinds `x` to the scaled version.
x = pd.DataFrame(x_scaled, columns = x.columns)

In [None]:
# Preview the scaled feature matrix
x.head()

Unnamed: 0,passenger_count,pickuphour,pickupday,pickupmonth,dropoffhour,dropoffday,dropoffmonth,store_and_fwd_encoded,vendor_id_1,vendor_id_2,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,0.111111,0.695652,0.0,0.2,0.695652,0.0,0.166667,0.0,0.0,1.0,1.0,0.0
1,0.222222,1.0,0.666667,0.4,1.0,0.666667,0.333333,0.0,1.0,0.0,1.0,0.0
2,0.222222,0.73913,1.0,0.2,0.782609,1.0,0.166667,0.0,0.0,1.0,1.0,0.0
3,0.666667,0.391304,0.166667,0.0,0.434783,0.166667,0.0,0.0,0.0,1.0,1.0,0.0
4,0.111111,0.26087,0.333333,0.2,0.26087,0.333333,0.166667,0.0,1.0,0.0,1.0,0.0


In [None]:
# Split features and target into train and test sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state is fixed to make the split reproducible.
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y, random_state = 56)

In [None]:
# Importing the predictive models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

## KNeighbors Regressor model


In [None]:
# Fit a k-nearest-neighbours regressor (k = 2) on the training split
model1 = KNeighborsRegressor(n_neighbors=2)
model1.fit(train_x, train_y)

# Predict once on the held-out split; pred1 is reused later for the ensemble
pred1 = model1.predict(test_x)

# Show the first ten predictions together with the test R^2.
# r2_score on the stored predictions gives the same value as
# model1.score(test_x, test_y) without running predict a second time.
pred1[:10], r2_score(test_y, pred1)

(array([ 931.5,  721.5,  528. , 1178. ,  813. ,  874. ,  322.5, 1341.5,
        1010. ,  965.5]), 0.7441862413787768)

* The KNN regressor scored reasonably well (R² ≈ 0.74).

## Ridge Regression model


In [None]:
# Fit a ridge (L2-regularised linear) regression with alpha = 1.5
model2 = Ridge(alpha=1.5)
model2.fit(train_x, train_y)

# Predict once on the held-out split; pred2 is reused later for the ensemble
pred2 = model2.predict(test_x)

# Show the first ten predictions together with the test R^2.
# r2_score on the stored predictions gives the same value as
# model2.score(test_x, test_y) without running predict a second time.
pred2[:10], r2_score(test_y, pred2)

(array([ 794.47717293, 1075.34203686, 1034.47864732,  808.47986679,
         887.16881083, 1036.51022929, 1001.72529889, 1064.79402949,
        1034.3731249 ,  769.81664162]), 0.008935395769454013)

* The Ridge model performed poorly (R² ≈ 0.009): even with cleaned and scaled features, a linear model explains almost none of the variance in trip duration.

## Random Forest Regressor model


In [None]:
# Fit a random forest of 100 trees; random_state fixed for reproducibility
model3 = RandomForestRegressor(random_state=96, n_estimators=100)
model3.fit(train_x, train_y)

# Predict once on the held-out split; pred3 is reused later for the ensemble
pred3 = model3.predict(test_x)

# Show the first ten predictions together with the test R^2.
# r2_score on the stored predictions gives the same value as
# model3.score(test_x, test_y) without running predict a second time.
pred3[:10], r2_score(test_y, pred3)

(array([ 683.35042143, 1076.38076319,  602.54747879,  533.40283144,
         820.30899896, 1236.13548613,  601.54468516,  681.05748668,
        1234.66966303,  665.02916505]), 0.9241511426209903)

* The Random Forest model gave the best result of the four models (R² ≈ 0.92).

## Gradient Boosting Regressor model


In [None]:
# Fit a gradient boosting regressor: 200 boosting stages, each trained on a
# 70% subsample of the training rows; random_state fixed for reproducibility
model4 = GradientBoostingRegressor(random_state=96, n_estimators=200, subsample=0.7)
model4.fit(train_x, train_y)

# Predict once on the held-out split; pred4 is reused later for the ensemble
pred4 = model4.predict(test_x)

# Show the first ten predictions together with the test R^2.
# r2_score on the stored predictions gives the same value as
# model4.score(test_x, test_y) without running predict a second time.
pred4[:10], r2_score(test_y, pred4)

(array([ 621.87784718,  899.93331365,  749.91814257,  608.40697561,
         895.43757917,  975.57685477,  750.80571303,  749.46008197,
        1006.11704653,  649.9668565 ]), 0.655618575204646)

* The Gradient Boosting gave a decent score too.

# The Final Prediction Model using the Averaging technique

In [None]:
# Collect every model's test-set predictions next to the actual trip durations
df = pd.DataFrame({
    'KNN_MODEL': pred1,
    'LINEAR_MODEL': pred2,
    'RANDOMFOREST_MODEL': pred3,
    'GRADIENTBOOSTING_MODEL': pred4,
    'Actual': np.array(test_y),
})

In [None]:
# Compare the individual model predictions with the actual values
df.head()

Unnamed: 0,KNN_MODEL,LINEAR_MODEL,RANDOMFOREST_MODEL,GRADIENTBOOSTING_MODEL,Actual
0,931.5,794.477173,683.350421,621.877847,2406
1,721.5,1075.342037,1076.380763,899.933314,1457
2,528.0,1034.478647,602.547479,749.918143,163
3,1178.0,808.479867,533.402831,608.406976,2160
4,813.0,887.168811,820.308999,895.437579,402


In [None]:
# Final ensemble prediction: the element-wise mean of the four models'
# test-set predictions. The arrays are stacked and averaged along axis 0 in a
# single vectorised call; the previous per-row np.append loop copied the whole
# result array on every iteration, which is quadratic in the number of rows.
from statistics import mean  # no longer used here; kept in case a later cell relies on `mean`
final_pred = np.mean([pred1, pred2, pred3, pred4], axis=0)

In [None]:
#importing r2_score
from sklearn.metrics import r2_score

In [None]:
# Test-set R^2 of each model, in order: KNN, Ridge, Random Forest, Gradient Boosting
r2_score(test_y, pred1), r2_score(test_y, pred2), r2_score(test_y, pred3), r2_score(test_y, pred4)

(0.7441862413787768,
 0.008935395769454013,
 0.9241511426209903,
 0.655618575204646)

In [None]:
# Test-set R^2 of the averaged ensemble prediction
r2_score(test_y, final_pred)

0.7691518573947

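* The averaged ensemble reaches an R² of about 0.77. This is better than the KNN, Ridge and Gradient Boosting models individually, but lower than the Random Forest alone (≈ 0.92), since the simple mean weights all four models equally, including the weak Ridge model.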