In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 
import warnings
warnings.filterwarnings("ignore")
df=pd.read_csv("cardekho_imputed.csv")
df.head()
df.drop(columns=['Unnamed: 0'],inplace=True,axis=1)
df.head()
#Data Cleaning
df.isnull().sum()
#Remmove unnessary columns 
df.drop(columns=["car_name","brand"],axis=1,inplace=True)
df.head()
len(df["model"].unique())
#Getting all different Types of  features 
num_features=[feature for feature in df.columns if df[feature].dtype!="O"]
print(f"number of numeric features are: {len(num_features)}")
cat_features=[feature for feature in df.columns if df[feature].dtype=="O"]
print(f"number of categorical features are: {len(cat_features)}")
#If a Column contains categories less than or equal to 25 than it is said to be discreate feature
discreate_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print(f"number of Discreate features are: {len(discreate_features)}")
continous_features=[feature for feature in num_features if len(df[feature].unique())>25]
print(f"number of Continous features are: {len(continous_features)}")
#Splitting Dataset into Dependent and independent Featurees
X=df.drop("selling_price",axis=1)
y=df["selling_price"]
X.head()
df["model"].value_counts()
print(len(df["model"].value_counts()))
len(df["model"].unique())
#Applying label Encoding to the Model features
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X["model"]=le.fit_transform(X["model"])
X.head()
#only 3 categories 
X["seller_type"].unique()
X["fuel_type"].unique()
X["transmission_type"].unique()
#These above three features have very less categories so we can apply OneHotEncoding to these three
num_features=X.select_dtypes(exclude="object").columns 
one_hot_features=['seller_type','fuel_type','transmission_type']
#Create a column transfoemer with three types of Transformers
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_transformer=StandardScaler()
Oh_transformers=OneHotEncoder(drop="first")
preprocessor=ColumnTransformer(
    transformers=[
        ("numeric_transformer",numeric_transformer,num_features),
        ("oh_transformer",Oh_transformers,one_hot_features)
        ],remainder="passthrough"
)
X=preprocessor.fit_transform(X)
import sys
import numpy as np 
np.set_printoptions(threshold=sys.maxsize)
pd.DataFrame(X)
#Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)
#MODEL TRAINING
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

#Creating a function to evaluate Model
import math
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=math.sqrt(mse)
    r2score=r2_score(true,predicted)
    return mae,rmse,r2score 
#Beggining the Model Training
models={
    "Linear Regression":LinearRegression(),
    "Decision Tree Regressor":DecisionTreeRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "K-Nearest-Regressor":KNeighborsRegressor(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "AdaboostRegressor":AdaBoostRegressor()
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #make Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    #Evaluate train and test Datasets
    model_train_mae,model_train_rmse,model_train_r2=evaluate_model(y_train,y_train_pred)

    model_test_mae,model_test_rmse,model_test_r2=evaluate_model(y_test,y_test_pred)


    print(list(models.keys())[i])

    print("Model Perfomance For Training Set ")
    print("Root mean Squared Error {:.4f}".format(model_train_rmse))
    print("Mean Absolute Error {:.4f}".format(model_train_mae))
    print("r2_score {:.4f}".format(model_train_r2))

    print("--------------------------------------")

    print("Model Perfomance For Testing Set ")
    print("Root mean Squared Error {:.4f}".format(model_test_rmse))
    print("Mean Absolute Error {:.4f}".format(model_test_mae))
    print("r2_score {:.4f}".format(model_test_r2))

    print("="*30)


number of numeric features are: 7
number of categorical features are: 4
number of Discreate features are: 2
number of Continous features are: 5
120
Linear Regression
Model Perfomance For Training Set 
Root mean Squared Error 553855.6665
Mean Absolute Error 268101.6071
r2_score 0.6218
--------------------------------------
Model Perfomance For Testing Set 
Root mean Squared Error 502543.5930
Mean Absolute Error 279618.5794
r2_score 0.6645
Decision Tree Regressor
Model Perfomance For Training Set 
Root mean Squared Error 20797.2352
Mean Absolute Error 5164.8199
r2_score 0.9995
--------------------------------------
Model Perfomance For Testing Set 
Root mean Squared Error 304571.9567
Mean Absolute Error 125817.2965
r2_score 0.8768
Random Forest Regressor
Model Perfomance For Training Set 
Root mean Squared Error 139406.2256
Mean Absolute Error 40178.8654
r2_score 0.9760
--------------------------------------
Model Perfomance For Testing Set 
Root mean Squared Error 227373.1359
Mean Absol

In [4]:
ada_params={
    "n_estimators":[50,60,70,80],
    "loss":["linear","square","exponential"]
}

In [5]:
randomcv_models=[
    ("ADABoost",AdaBoostRegressor(),ada_params)
]

In [6]:
#hyper parameter tuning using Randomized Cv
from sklearn.model_selection import RandomizedSearchCV
model_param={}
for name,model,params in randomcv_models:
    random=RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=3,verbose=2,n_jobs=-1)
    random.fit(X_train,y_train)
    model_param[name]=random.best_params_


for model_name in model_param:
    print(f"-----Best Params for {model_name}----------")
    print(model_param[model_name])

Fitting 3 folds for each of 12 candidates, totalling 36 fits
-----Best Params for ADABoost----------
{'n_estimators': 70, 'loss': 'linear'}


In [7]:
#Beggining the Model Training
models={
    "Adaboost":AdaBoostRegressor(n_estimators=70,loss="linear")
}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #make Predictions
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    #Evaluate train and test Datasets
    model_train_mae,model_train_rmse,model_train_r2=evaluate_model(y_train,y_train_pred)

    model_test_mae,model_test_rmse,model_test_r2=evaluate_model(y_test,y_test_pred)


    print(list(models.keys())[i])

    print("Model Perfomance For Training Set ")
    print("Root mean Squared Error {:.4f}".format(model_train_rmse))
    print("Mean Absolute Error {:.4f}".format(model_train_mae))
    print("r2_score {:.4f}".format(model_train_r2))

    print("--------------------------------------")

    print("Model Perfomance For Testing Set ")
    print("Root mean Squared Error {:.4f}".format(model_test_rmse))
    print("Mean Absolute Error {:.4f}".format(model_test_mae))
    print("r2_score {:.4f}".format(model_test_r2))

    print("="*30)

Adaboost
Model Perfomance For Training Set 
Root mean Squared Error 478302.6068
Mean Absolute Error 379278.0082
r2_score 0.7179
--------------------------------------
Model Perfomance For Testing Set 
Root mean Squared Error 506892.5932
Mean Absolute Error 394770.0377
r2_score 0.6587
