# Feature Selection and Training Models

**Import Libraries**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error,r2_score

In [35]:
# Read the CSV at ../data/clean.csv into the working DataFrame.
df = pd.read_csv(
    "../data/clean.csv",
)

In [36]:
# Show the column labels of the loaded frame so features can be chosen below.
df.columns

Index(['Unnamed: 0', 'TRIP_DURATION', 'DISTANCE_TRAVELED', 'NUM_OF_PASSENGERS',
       'FARE', 'TIP', 'MISCELLANEOUS_FEES', 'TOTAL_FARE', 'SURGE_APPLIED'],
      dtype='object')

*Since we're predicting FARE only and not TOTAL_FARE, we drop the columns that should not be used to predict it and keep the necessary features:* **'TRIP_DURATION', 'DISTANCE_TRAVELED', 'NUM_OF_PASSENGERS', 'TIP', 'MISCELLANEOUS_FEES', 'SURGE_APPLIED'**
*We drop TOTAL_FARE because TOTAL_FARE = FARE + TIP + MISCELLANEOUS_FEES, so keeping it would leak the target into the features. The leftover index column 'Unnamed: 0' is dropped as well.*

In [37]:
# Remove TOTAL_FARE and the 'Unnamed: 0' column.
# `df` is rebound instead of mutated in place, and errors="ignore" makes a
# second execution of this cell a no-op rather than a KeyError.
df = df.drop(columns=["TOTAL_FARE", "Unnamed: 0"], errors="ignore")

In [38]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,TRIP_DURATION,DISTANCE_TRAVELED,NUM_OF_PASSENGERS,FARE,TIP,MISCELLANEOUS_FEES,SURGE_APPLIED
0,1848,5,1,143,17,26,1
1,319,2,1,41,0,27,1
2,962,13,1,180,37,6,0
3,718,5,1,94,0,6,0
4,626,3,2,68,19,26,1


Separate Features and Target

In [39]:
# FARE is the prediction target; every other remaining column is a feature.
x = df.drop(columns="FARE")
y = df["FARE"]

Split into Testing and Training Sets

In [41]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

Scale Features for Linear and Distance-Based Models

In [42]:
# Fit the scaler on the training split only, then apply those same statistics
# to both splits so no test-set information reaches the scaler.
st = StandardScaler()
X_train_scaled = st.fit_transform(X_train)
# Transform the held-out features (X_test), not the training features again.
X_test_scaled = st.transform(X_test)

**Linear Regression**

In [None]:
# Fit a linear regression on the standardized training features.
lg = LinearRegression()
lg.fit(X_train_scaled, y_train)

# predict() takes feature rows, never the target. The held-out features are
# scaled with the scaler that was fitted on the training split.
y_pred_lg = lg.predict(st.transform(X_test))

print("RMSE LINEAR REGRESSION \n", root_mean_squared_error(y_test, y_pred_lg))
print("R2 LINEAR REGRESSION:\n", r2_score(y_test, y_pred_lg))

**Support Vector Regression**

In [None]:
# Fit a support vector regressor (default hyperparameters) on the
# standardized training features.
sv = SVR()
sv.fit(X_train_scaled, y_train)

# predict() takes feature rows, never the target. The held-out features are
# scaled with the scaler that was fitted on the training split.
y_pred_sv = sv.predict(st.transform(X_test))

print("RMSE SVR: \n", root_mean_squared_error(y_test, y_pred_sv))
print("R2 SVR:\n", r2_score(y_test, y_pred_sv))

**Random Forest**

In [None]:
# Fit a random forest on the unscaled training features. The seed matches the
# one used for the train/test split so the reported scores are reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# predict() takes feature rows, never the target: score on the held-out
# features X_test.
y_pred_rf = rf.predict(X_test)

print("RMSE RANDOM FOREST: \n", root_mean_squared_error(y_test, y_pred_rf))
print("R2 RANDOM FOREST:\n", r2_score(y_test, y_pred_rf))

**Xgboost**

In [None]:
# Fit a gradient-boosted regressor (default hyperparameters) on the unscaled
# training features.
xg = XGBRegressor()
xg.fit(X_train, y_train)

# predict() takes feature rows, never the target: score on the held-out
# features X_test.
y_pred_xg = xg.predict(X_test)

print("RMSE XGBOOST: \n", root_mean_squared_error(y_test, y_pred_xg))
print("R2 XGBOOST:\n", r2_score(y_test, y_pred_xg))