In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Taxi.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18919 entries, 0 to 18918
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   fare_amount        18919 non-null  float64
 1   pickup_datetime    18919 non-null  object 
 2   pickup_longitude   18919 non-null  float64
 3   pickup_latitude    18919 non-null  float64
 4   dropoff_longitude  18919 non-null  float64
 5   dropoff_latitude   18919 non-null  float64
 6   passenger_count    18918 non-null  float64
dtypes: float64(6), object(1)
memory usage: 1.0+ MB


In [4]:
## Check missing value

In [5]:
df.isna().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      1
dtype: int64

In [6]:
### Handling missing value 

In [7]:
df.dropna(inplace=True)

In [8]:
df.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object

In [9]:
## Data types conversion

In [10]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18918 entries, 0 to 18917
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   fare_amount        18918 non-null  float64       
 1   pickup_datetime    18918 non-null  datetime64[ns]
 2   pickup_longitude   18918 non-null  float64       
 3   pickup_latitude    18918 non-null  float64       
 4   dropoff_longitude  18918 non-null  float64       
 5   dropoff_latitude   18918 non-null  float64       
 6   passenger_count    18918 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 1.2 MB


In [12]:
### data describe

In [13]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,18918.0,18918.0,18918.0,18918.0,18918.0,18918.0
mean,11.289221,-72.486834,39.925272,-72.498734,39.913116,1.657205
std,9.551667,10.503808,6.715228,10.46167,6.160671,1.282236
min,-3.0,-74.438233,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992223,40.734687,-73.991233,40.734589,1.0
50%,8.5,-73.981758,40.75262,-73.980234,40.753566,1.0
75%,12.5,-73.966788,40.767427,-73.963834,40.768066,2.0
max,180.0,40.766125,401.083332,40.802437,41.366138,6.0


In [14]:
### Data Preparation

### Adding features 

- Jarak (simplify)
- jam (jam pulang dan berangkat kantor)
- hari: hari kerja atau libur
- lokasi
- tahun 
- jumlah penumpang

In [15]:
### Feature extraction => datetime extract

In [16]:
pickup_dt = df['pickup_datetime'].dt

In [17]:
df['year'] = pickup_dt.year
df['month'] = pickup_dt.month
df['date'] = pickup_dt.day

In [18]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,date
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1.0,2009,6,15
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1.0,2010,1,5
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2.0,2011,8,18
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1.0,2012,4,21
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1.0,2010,3,9


In [19]:
df['day'] = pickup_dt.dayofweek

In [20]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,date,day
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1.0,2009,6,15,0
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1.0,2010,1,5,1
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2.0,2011,8,18,3
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1.0,2012,4,21,5
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1.0,2010,3,9,1


In [21]:
def WeekEnd(x):
    if x > 4:
        return 1
    else:
        return 0

In [22]:
df['Weekend'] = df['day'].apply(WeekEnd)

In [23]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,date,day,Weekend
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1.0,2009,6,15,0,0
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1.0,2010,1,5,1,0
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2.0,2011,8,18,3,0
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1.0,2012,4,21,5,1
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1.0,2010,3,9,1,0


In [24]:
df.drop(columns='pickup_datetime', inplace=True)

In [25]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,date,day,Weekend
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0,2009,6,15,0,0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0,2010,1,5,1,0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0,2011,8,18,3,0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0,2012,4,21,5,1
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0,2010,3,9,1,0


In [26]:
## Membuat Jarak - simplify

In [27]:
df['distance'] = abs((df['pickup_longitude'] - df['dropoff_longitude']) + 
                     (df['pickup_latitude'] - df['dropoff_longitude']))

In [28]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,date,day,Weekend,distance
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0,2009,6,15,0,0,114.560228
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0,2010,1,5,1,0,114.653791
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0,2011,8,18,3,0,114.761016
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0,2012,4,21,5,1,114.729147
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0,2010,3,9,1,0,114.713223


In [29]:
df.drop(columns=['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'], inplace=True)

In [30]:
df.head()

Unnamed: 0,fare_amount,passenger_count,year,month,date,day,Weekend,distance
0,4.5,1.0,2009,6,15,0,0,114.560228
1,16.9,1.0,2010,1,5,1,0,114.653791
2,5.7,2.0,2011,8,18,3,0,114.761016
3,7.7,1.0,2012,4,21,5,1,114.729147
4,5.3,1.0,2010,3,9,1,0,114.713223


In [31]:
### Machine Learning Modeling 

In [32]:
### Splitting Data 

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X = df.drop(columns='fare_amount')
y = df['fare_amount']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= .80, random_state = 42) ## random state fungsinya adalah lock 

In [36]:
X_train.shape

(15134, 7)

In [37]:
X_test.shape

(3784, 7)

In [38]:
X_train.head()

Unnamed: 0,passenger_count,year,month,date,day,Weekend,distance
7539,1.0,2011,6,14,1,0,0.0
578,3.0,2010,3,25,3,0,114.672915
8056,5.0,2010,4,12,0,0,114.7732
11761,1.0,2014,11,17,0,0,114.739487
15531,1.0,2012,12,29,5,1,114.730563


In [39]:
X_test.head()

Unnamed: 0,passenger_count,year,month,date,day,Weekend,distance
12565,1.0,2012,6,15,4,0,114.754729
5282,1.0,2012,1,19,3,0,114.745586
4525,2.0,2010,11,16,1,0,114.669423
17370,2.0,2011,2,9,2,0,114.664885
9239,1.0,2013,10,7,0,0,114.729194


In [40]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [41]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [42]:
## base model

In [43]:
Model_1 = LinearRegression(fit_intercept=False)

In [44]:
Model_1.fit(X_train, y_train)

LinearRegression(fit_intercept=False)

In [45]:
## Evaluation

In [46]:
## Training

In [47]:
y_pred_tr = Model_1.predict(X_train)

In [48]:
r2_score(y_train, y_pred_tr)

0.0012996536710235818

In [49]:
mean_absolute_error(y_train, y_pred_tr)

6.015884276163577

In [50]:
a1 =  mean_absolute_error(y_train, y_pred_tr)
a2 = y_train.mean()

In [51]:
a1 / a2 * 100 ## errornya mencapai 53%

53.12926895485742

In [52]:
mean_squared_error(y_train, y_pred_tr)

92.61119506399577

In [53]:
np.sqrt(mean_squared_error(y_train, y_pred_tr))

9.623471050717395

In [54]:
## testing

In [55]:
y_pred_ts = Model_1.predict(X_test)

In [56]:
r2_score(y_test, y_pred_ts)

0.0027070313473716023

In [57]:
mean_absolute_error(y_test, y_pred_ts)

5.806981849519223

In [58]:
mean_squared_error(y_test, y_pred_ts)

84.96797039888578

In [59]:
np.sqrt(mean_squared_error(y_test, y_pred_ts))

9.21780724461549

In [60]:
def Eva_Matrix_A(Model, X, y_true):
    y_pred = Model.predict(X)
    R2 = r2_score(y_true, y_pred)
    MAE = mean_absolute_error(y_true, y_pred)
    MSE = mean_squared_error(y_true, y_pred)
    RMSE = np.sqrt(MSE)
    return R2, MAE, MSE, RMSE

In [61]:
r2_train, MAE_train, MSE_train, RMSE_train = Eva_Matrix_A(Model_1, X_train, y_train)

In [62]:
r2_train

0.0012996536710235818

In [63]:
r2_test, MAE_test, MSE_test, RMSE_test = Eva_Matrix_A(Model_1, X_test, y_test)

In [64]:
r2_test

0.0027070313473716023

In [65]:
data = {
    "Training" : [r2_train, MAE_train, MSE_train, RMSE_train],
    "Testing" : [r2_test, MAE_test, MSE_test, RMSE_test ]
}

In [66]:
pd.DataFrame(data=data, index = ['R2','MAE','MSE','RMSE'])

Unnamed: 0,Training,Testing
R2,0.0013,0.002707
MAE,6.015884,5.806982
MSE,92.611195,84.96797
RMSE,9.623471,9.217807


def Eva_Matrix(Model, X_train, X_test, y_train, y_test, Nama):
    Model = Model.fit(X_train, y_train)
    y_pred_tr = Model.predict(X_train)
    R2_tr = r2_score(y_train, y_pred_tr)
    MAE_tr = mean_absolute_error(y_train, y_pred_tr)
    MSE_tr = mean_squared_error(y_train, y_pred_tr)
    RMSE_tr = np.sqrt(MSE_tr)
    y_pred_ts = Model.predict(X_test)
    R2_ts = r2_score(y_test, y_pred_ts)
    MAE_ts = mean_absolute_error(y_test, y_pred_ts)
    MSE_ts = mean_squared_error(y_test, y_pred_ts)
    RMSE_ts = np.sqrt(MSE_ts)
    
    data = {
        "Training" + " " + Nama : [R2_tr, MAE_tr, MSE_tr, RMSE_tr],
        "Testing" + " " + Nama : [R2_ts, MAE_ts, MSE_ts, RMSE_ts],

    }
    df = pd.DataFrame(data=data, index = ['R2','MAE','MSE','RMSE'])
    return df

In [67]:
df_LinReg = Eva_Matrix(Model_1, X_train, X_test, y_train, y_test, "LinReg")

NameError: name 'Eva_Matrix' is not defined

In [None]:
df_LinReg

In [None]:
model_2 = Lasso()

In [None]:
model_2.fit(X_train, y_train)

In [None]:
df_Lasso =  Eva_Matrix(model_2, X_train, X_test, y_train, y_test, "Lasso")

In [None]:
df_Lasso

In [None]:
model_3 = Ridge()
model_3.fit(X_train, y_train)
df_Ridge =  Eva_Matrix(model_3, X_train, X_test, y_train, y_test, "Ridge")

In [None]:
df_Ridge

In [None]:
model_4 = ElasticNet()
model_4.fit(X_train, y_train)
df_ElasticNet =  Eva_Matrix(model_4, X_train, X_test, y_train, y_test, "Elastic Net")

In [None]:
df_ElasticNet

In [None]:
pd.concat([df_LinReg, df_Lasso, df_Ridge, df_ElasticNet], axis=1)

## Optimize 1 

In [None]:
## Using Feature Engineering (polynomial - polynomial transformer)

In [68]:
X_train.head()

Unnamed: 0,passenger_count,year,month,date,day,Weekend,distance
7539,1.0,2011,6,14,1,0,0.0
578,3.0,2010,3,25,3,0,114.672915
8056,5.0,2010,4,12,0,0,114.7732
11761,1.0,2014,11,17,0,0,114.739487
15531,1.0,2012,12,29,5,1,114.730563


In [69]:
### Feature Engineering 

In [70]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

In [71]:
poly = PolynomialFeatures(degree = 3, include_bias=False)

In [72]:
### Power Transformer

In [73]:
yeo_pow = PowerTransformer(method='yeo-johnson')

In [74]:
num_col = df[['passenger_count','distance','year']]

In [75]:
num_poly = poly.fit_transform(num_col)

In [76]:
num_pow = yeo_pow.fit_transform(num_poly)

  loglike = -n_samples / 2 * np.log(x_trans.var())


In [77]:
df_pow = pd.DataFrame(num_pow)

In [78]:
df_pow.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,-0.541042,0.093165,2.590983e-14,-0.600677,-0.417044,-0.514831,0.093061,0.084938,-1.869616e-13,-0.623728,-0.417478,-0.51399,-0.415948,-0.412878,-0.518532,0.09306,0.088951,0.077515,-8.718026e-14
1,-0.541042,0.102143,2.590983e-14,-0.600677,-0.416036,-0.5141,0.102074,0.099149,-1.860734e-13,-0.623728,-0.416882,-0.513574,-0.414021,-0.41133,-0.517123,0.102074,0.100617,0.096336,-8.718026e-14
2,0.914194,0.11244,2.590983e-14,1.18588,0.547434,0.577272,0.112412,0.114674,-1.852962e-13,1.262003,0.63003,0.646762,0.52646,0.525857,0.557731,0.112412,0.113604,0.116442,-8.718026e-14
3,-0.541042,0.109378,2.590983e-14,-0.600677,-0.415224,-0.512641,0.109338,0.117056,-1.845191e-13,-0.623728,-0.416402,-0.512742,-0.41247,-0.409385,-0.514305,0.109338,0.113284,0.123923,-8.718026e-14
4,-0.541042,0.107849,2.590983e-14,-0.600677,-0.415395,-0.5141,0.107803,0.104762,-1.860734e-13,-0.623728,-0.416504,-0.513574,-0.412798,-0.41072,-0.517123,0.107803,0.106301,0.10173,-8.718026e-14


In [79]:
df.head()

Unnamed: 0,fare_amount,passenger_count,year,month,date,day,Weekend,distance
0,4.5,1.0,2009,6,15,0,0,114.560228
1,16.9,1.0,2010,1,5,1,0,114.653791
2,5.7,2.0,2011,8,18,3,0,114.761016
3,7.7,1.0,2012,4,21,5,1,114.729147
4,5.3,1.0,2010,3,9,1,0,114.713223


In [80]:
df_1 = pd.concat([df.drop(columns=['passenger_count','year','distance']), df_pow], axis=1)
df_1.head()

Unnamed: 0,fare_amount,month,date,day,Weekend,0,1,2,3,4,...,9,10,11,12,13,14,15,16,17,18
0,4.5,6,15,0,0,-0.541042,0.093165,2.590983e-14,-0.600677,-0.417044,...,-0.623728,-0.417478,-0.51399,-0.415948,-0.412878,-0.518532,0.09306,0.088951,0.077515,-8.718026e-14
1,16.9,1,5,1,0,-0.541042,0.102143,2.590983e-14,-0.600677,-0.416036,...,-0.623728,-0.416882,-0.513574,-0.414021,-0.41133,-0.517123,0.102074,0.100617,0.096336,-8.718026e-14
2,5.7,8,18,3,0,0.914194,0.11244,2.590983e-14,1.18588,0.547434,...,1.262003,0.63003,0.646762,0.52646,0.525857,0.557731,0.112412,0.113604,0.116442,-8.718026e-14
3,7.7,4,21,5,1,-0.541042,0.109378,2.590983e-14,-0.600677,-0.415224,...,-0.623728,-0.416402,-0.512742,-0.41247,-0.409385,-0.514305,0.109338,0.113284,0.123923,-8.718026e-14
4,5.3,3,9,1,0,-0.541042,0.107849,2.590983e-14,-0.600677,-0.415395,...,-0.623728,-0.416504,-0.513574,-0.412798,-0.41072,-0.517123,0.107803,0.106301,0.10173,-8.718026e-14


In [81]:
X_1 = df_1.drop(columns='fare_amount')
y_1 = df_1['fare_amount']               ### membuat splitting

In [82]:
def Eva_Matrix_Main(model, X_train, X_test, y_train, y_test, Nama):
    Model = model.fit(X_train, y_train)
    y_pred_tr = Model.predict(X_train)
    R2_tr1 = r2_score(y_train, y_pred_tr)
    MAE_tr1 = mean_absolute_error(y_train, y_pred_tr)
    MSE_tr1 = mean_squared_error(y_train, y_pred_tr)
    RMSE_tr1 = np.sqrt(MSE_tr1)
    y_pred_ts = Model.predict(X_test)
    R2_ts = r2_score(y_test, y_pred_ts)
    MAE_ts = mean_absolute_error(y_test, y_pred_ts)
    MSE_ts = mean_squared_error(y_test, y_pred_ts)
    RMSE_ts = np.sqrt(MSE_ts)
    
    data = {
        "Training" + " " + Nama : [R2_tr1, MAE_tr1, MSE_tr1, RMSE_tr1],
        "Testing" + " " + Nama : [R2_ts, MAE_ts, MSE_ts, RMSE_ts],

    }
    df = pd.DataFrame(data=data, index = ['R2','MAE','MSE','RMSE'])
    return df

In [83]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, train_size = .80, random_state=42)

In [84]:
X_train_1.head()

Unnamed: 0,month,date,day,Weekend,0,1,2,3,4,5,...,9,10,11,12,13,14,15,16,17,18
7539,6,14,1,0,-0.541042,-5.987394,2.590983e-14,-0.600677,-3.426598,-0.51337,...,-0.623728,-3.58955,-0.513158,-3.401458,-3.461283,-0.515713,-5.985315,-6.13936,-6.543795,-8.718026e-14
578,3,25,3,0,1.486464,0.103979,2.590983e-14,1.62154,1.219548,1.287266,...,1.650726,1.272142,1.325114,1.200433,1.197569,1.27687,0.103917,0.102446,0.098072,-8.718026e-14
8056,4,12,0,0,1.942077,0.11361,2.590983e-14,1.821819,2.207469,2.267931,...,1.801995,2.115025,2.180444,2.221259,2.21146,2.296936,0.113587,0.112039,0.107175,-8.718026e-14
11761,11,17,0,0,-0.541042,0.110372,2.590983e-14,-0.600677,-0.415113,-0.511182,...,-0.623728,-0.416337,-0.511911,-0.412257,-0.408107,-0.511488,0.110335,0.119734,0.14566,-8.718026e-14
15531,12,29,5,1,-0.541042,0.109514,2.590983e-14,-0.600677,-0.415209,-0.512641,...,-0.623728,-0.416393,-0.512742,-0.412441,-0.40937,-0.514305,0.109475,0.113419,0.124052,-8.718026e-14


In [85]:
df_LinReg2 = Eva_Matrix_Main(LinearRegression(),X_train_1, X_test_1, y_train_1, y_test_1, "LinReg 2")



In [86]:
df_LinReg2

Unnamed: 0,Training LinReg 2,Testing LinReg 2
R2,0.017572,0.013191
MAE,5.94261,5.763171
MSE,91.102201,84.074723
RMSE,9.544747,9.169227


In [87]:
### Optimize 2 
# Using Feature Engineering + Others Algorithms

In [88]:
df_Ridge2 = Eva_Matrix_Main(Ridge(),X_train_1, X_test_1, y_train_1, y_test_1, "Ridge")
df_Ridge2



Unnamed: 0,Training Ridge,Testing Ridge
R2,0.003615,0.004551
MAE,6.009968,5.809481
MSE,92.396471,84.810847
RMSE,9.612308,9.20928


In [89]:
df_Lasso2 = Eva_Matrix_Main(Lasso(),X_train_1, X_test_1, y_train_1, y_test_1, "Ridge")
df_Lasso2



Unnamed: 0,Training Ridge,Testing Ridge
R2,0.0,-0.000337
MAE,6.020683,5.806663
MSE,92.731714,85.227306
RMSE,9.629731,9.231864
