## Model selection: comparing different regressors

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [9]:
def eveluate_model(model,true,pred):
    """Print and return R2, MSE and MAE for one model's predictions.

    Parameters
    ----------
    model : object
        Only ``model.__class__.__name__`` is read, for the report header.
    true : array-like
        Ground-truth target values (pandas Series, NumPy array, list, ...).
    pred : array-like
        Predicted target values, with as many entries as ``true``.

    Returns
    -------
    dict
        ``{"r2": ..., "mse": ..., "mae": ...}`` holding exactly the values
        that were printed, so callers can tabulate or compare models.
    """
    print("=="*20)
    print("Model: ",model.__class__.__name__)
    # np.asarray accepts Series, ndarrays and plain lists alike, so the
    # function no longer depends on `true` having `.values` or on `pred`
    # having `.reshape`. Both are flattened to a single column as before.
    true=np.asarray(true).reshape(-1,1)
    pred=np.asarray(pred).reshape(-1,1)
    print("--"*20)
    metrics={}
    metrics["r2"]=r2_score(true,pred)
    print("R2 Score: ",metrics["r2"])
    metrics["mse"]=mean_squared_error(true,pred)
    print("MSE: ",metrics["mse"])
    metrics["mae"]=mean_absolute_error(true,pred)
    print("MAE: ",metrics["mae"])
    print("=="*20)
    return metrics

In [10]:
# Load the cleaned dataset. The column at position 15 is separated out as the
# target `y`; every other column stays in the feature frame `x`.
data = pd.read_csv('cleaned_data.csv')
target_column = data.columns[15]
x = data.drop(columns=[target_column])
y = data.iloc[:, 15]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Candidate regressors, all with default hyper-parameters.
# The two tree-based estimators are randomised, so their seed is pinned to
# make the reported scores reproducible across Restart & Run All.
# NOTE(review): SVR, KNN, Ridge and Lasso are sensitive to feature scale and
# the features are passed in as loaded -- confirm cleaned_data.csv is already
# scaled, otherwise these baselines are handicapped.
models = {
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Support Vector Regression": SVR(),
    "KNN Regression": KNeighborsRegressor(),
}

In [12]:
# Fit every candidate on the training split and report its metrics on both
# splits; a large train/test gap points to overfitting.
# Iterating the dict values directly avoids rebuilding a list of the models
# on every pass just to index into it.
for model in models.values():
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    print("Evaluating model on Training data")
    eveluate_model(model,y_train,y_train_pred)
    print("=="*20)
    print("Evaluating model on Testing data")
    eveluate_model(model,y_test,y_test_pred)
    print("=="*20)

Evaluating model on Training data
Model:  RandomForestRegressor
----------------------------------------
R2 Score:  0.9743366983796437
MSE:  20813830337.366215
MAE:  39580.50333996615
Evaluating model on Testing data
Model:  RandomForestRegressor
----------------------------------------
R2 Score:  0.9327865860845653
MSE:  50597034071.74996
MAE:  100876.25498648104
Evaluating model on Training data
Model:  DecisionTreeRegressor
----------------------------------------
R2 Score:  0.9994666998284044
MSE:  432524990.53644824
MAE:  5164.819922128488
Evaluating model on Testing data
Model:  DecisionTreeRegressor
----------------------------------------
R2 Score:  0.8871592578767633
MSE:  84944455894.98415
MAE:  122610.9498324143
Evaluating model on Training data
Model:  LinearRegression
----------------------------------------
R2 Score:  0.6219505790641252
MSE:  306611231200.9522
MAE:  268209.07931516797
Evaluating model on Testing data
Model:  LinearRegression
------------------------------

## Hyperparameter Tuning on KNN Regression and Random Forest Regression

In [16]:
# Search space for KNeighborsRegressor: 19 * 2 * 4 * 2 = 304 combinations.
# NOTE(review): `algorithm` presumably only changes how neighbours are looked
# up, not which ones are found -- confirm; if so it multiplies the number of
# fits by four without affecting the scores.
kNN_params = dict(
    n_neighbors=list(range(1, 20)),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    p=[1, 2],
)

# Search space for RandomForestRegressor: 8 * 5 * 4 * 3 = 480 combinations.
rf_params = dict(
    n_estimators=[5, 10, 25, 50, 100, 200, 500, 1000],
    max_depth=[None, 2, 5, 10, 15],
    min_samples_split=[2, 5, 10, 20],
    max_features=[None, "sqrt", "log2"],
)

In [17]:
# (display name, estimator, parameter grid) triples fed to the grid search.
# The random forest is seeded so that repeated searches pick the same
# best parameters instead of varying with the forest's internal randomness.
grid_models=[
    ("Random Forest Regressor",RandomForestRegressor(random_state=42),rf_params),
    ("K nearest neighbour",KNeighborsRegressor(),kNN_params)
]

In [18]:
# Best hyper-parameters per model, keyed by the display name in `grid_models`.
model_prams = dict()

# Settings shared by every search: 3-fold CV, all CPU cores, progress output.
search_settings = {"cv": 3, "n_jobs": -1, "verbose": 1}

for name, model, params in grid_models:
    grid_search = GridSearchCV(estimator=model, param_grid=params, **search_settings)
    grid_search.fit(x_train, y_train)
    # Keep only the winning combination; the fitted search is overwritten
    # on the next iteration.
    model_prams[name] = grid_search.best_params_
    

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 3 folds for each of 304 candidates, totalling 912 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [19]:
# Display the best hyper-parameter combination stored for each model.
model_prams

{'Random Forest Regressor': {'max_depth': None,
  'max_features': None,
  'min_samples_split': 10,
  'n_estimators': 25},
 'K nearest neighbour': {'algorithm': 'auto',
  'n_neighbors': 4,
  'p': 1,
  'weights': 'distance'}}

In [20]:
# Rebuild the two tuned models from the stored best parameters.
# The forest is seeded so its scores are reproducible; this is safe because
# `rf_params` does not itself contain a `random_state` entry to clash with.
models={
    "Random Forest Regressor": RandomForestRegressor(random_state=42,**model_prams["Random Forest Regressor"]),
    "K nearest neighbour": KNeighborsRegressor(**model_prams["K nearest neighbour"]),
}

In [21]:
# Refit each tuned model and report its metrics on the training and testing
# splits, for comparison with the untuned results.
# Iterating the dict values directly avoids rebuilding a list of the models
# on every pass just to index into it.
for model in models.values():
    model.fit(x_train,y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    print("Evaluating model on Training data")
    eveluate_model(model,y_train,y_train_pred)
    print("=="*20)
    print("Evaluating model on Testing data")
    eveluate_model(model,y_test,y_test_pred)
    print("=="*20)

Evaluating model on Training data
Model:  RandomForestRegressor
----------------------------------------
R2 Score:  0.9493428504787946
MSE:  41084710420.60421
MAE:  67473.01528088134
Evaluating model on Testing data
Model:  RandomForestRegressor
----------------------------------------
R2 Score:  0.9341154309673906
MSE:  49596703841.58231
MAE:  102237.31147866213
Evaluating model on Training data
Model:  KNeighborsRegressor
----------------------------------------
R2 Score:  0.9994659171143871
MSE:  433159798.8319273
MAE:  5169.3975773307375
Evaluating model on Testing data
Model:  KNeighborsRegressor
----------------------------------------
R2 Score:  0.6809946285710928
MSE:  240141434678.01385
MAE:  227442.63921070044
