# CAR PRICE ASSIGNMENT

<img src="https://web-assets.cdn.dealersolutions.com.au/modular.multisite.dealer.solutions/wp-content/uploads/2018/07/30122827/MK2-GenericBanner-1-414x311.jpg" width="400" height="400">

## Import Libraries

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Reading Dataset

In [None]:
# Load the car-price CSV; index_col=0 uses the file's first column as the row index.
df= pd.read_csv("/kaggle/input/car-data/CarPrice_Assignment.csv",index_col=0)

##### DATASET SOURCE: https://archive.ics.uci.edu/ml/datasets/Automobile

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Summarise column dtypes and non-null counts of the raw data.
df.info()

## Data Analysis

* Symboling: The car's assigned insurance risk rating; a value of +3 indicates that the car is risky.
* Fueltype: Car fuel type i.e gas or diesel
* Aspiration: Aspiration used in a car (standard,turbo)
* Doornumber: Number of doors in a car (2,4)
* Carbody: Body of car (hatchback,sedan..)
* Drivewheel: Type of drive wheel ( rwd= rear wheel drive, fwd= front wheel drive, 4wd= four wheel drive )
* Enginelocation: Location of car engine (rear,front)
* Wheelbase: Wheelbase of car
* Carlength: Length of car
* Carwidth: Width of car
* Carheight: Height of car
* Curbweight: The weight of a car without occupants or baggage
* Enginetype: Type of engine
* Cylindernumber: Number of cylinders in the engine
* Enginesize: Size of car engine
* Fuelsystem: Fuel system of car
* Boreratio: Boreratio of car 
* Stroke: Stroke or volume inside the engine
* Compressionratio: Compression ratio of car
* Horsepower: Horsepower
* Peakrpm: Car peak rpm
* Citympg: Mileage in city 
* Highwaympg: Mileage on highway 
* Price(Dependent variable): Price of car 

<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSUrnQLFO9PaU9um6MFUwlw6QbQU29XmH9m_Q&usqp=CAU" width="400" height="400">

<img src="https://4.bp.blogspot.com/-ZZPbnqf3CvI/XG6N9RrR7dI/AAAAAAAAHSE/WB-qcK1WcLU2RpE-zp_99RRzwhPhz-u7ACLcBGAs/s640/cfdf.jpg" width="400" height="400">

## Feature Engineering

#### Converting to categorical variable

In [None]:
# Names of the columns currently stored with the generic "object" dtype.
objects= df.select_dtypes(include="object").columns

In [None]:
# Convert the object-dtype columns to pandas categoricals. Selecting by the
# saved `objects` list (instead of calling select_dtypes again) keeps this
# cell re-runnable: on a second run select_dtypes would return no columns and
# the assignment would fail.
df[objects] = df[objects].astype("category")

In [None]:
# Check the dtypes again after the categorical conversion.
df.info()

#### Categorize vehicles by make and model

In [None]:
# How often each full car name occurs.
df["CarName"].value_counts()

#### Classifying cars by brands and models

In [None]:
# The brand is the first whitespace-separated token of CarName.
df["carbrands"]= df["CarName"].apply(lambda x: x.split()[0])

In [None]:
# Preview the frame with the new carbrands column.
df.head()

In [None]:
# Frequency of each extracted brand string (exposes spelling variants).
df["carbrands"].value_counts()

##### Some vehicle brands have the same name but under different variables

In [None]:
# Lower-case all brand names so differently-cased spellings become identical.
df["carbrands"]= df["carbrands"].str.lower()

In [None]:
def replace_brands(old,new):
    """Replace the brand spelling ``old`` with ``new`` in ``df["carbrands"]``.

    Args:
        old: Misspelled or abbreviated brand name to look for.
        new: Canonical brand name to store instead.
    """
    # Assign the result back to the column. replace(..., inplace=True) on
    # ``df.carbrands`` mutates an intermediate Series; pandas warns about this
    # pattern and it leaves ``df`` unchanged when Copy-on-Write is enabled.
    df["carbrands"] = df["carbrands"].replace(old, new)


replace_brands("maxda","mazda")
replace_brands("vw","volkswagen")
replace_brands("vokswagen","volkswagen")
replace_brands("porcshce","porsche")
replace_brands("toyouta","toyota")

In [None]:
# Distinct brand names after the spelling fixes.
df["carbrands"].unique()

In [None]:
# Split every CarName into its whitespace-separated tokens (one list per row).
carmodels= df["CarName"].apply(lambda x: x.split())

In [None]:
# Display the token lists.
carmodels

##### Although some values are the same, these values seem to be different from each other due to 
##### the incorrect creation of the data set. For example, 100 ls and 100ls mean the same thing.

In [None]:
# Accumulator for the cleaned model names (one entry per row of carmodels).
carmodelsnew=[]

In [None]:
# Number of tokens in each car name.
lengths = [len(tokens) for tokens in carmodels]

In [None]:
# Largest number of tokens found in any car name.
np.array(lengths).max()

In [None]:
# Build the model name for every row: the first token is the brand, and all
# remaining tokens are concatenated without spaces (so "100 ls" and "100ls"
# collapse to the same value). Names without a model part get NaN.
# Joining tokens[1:] handles any number of tokens; the previous if/elif ladder
# appended nothing for names with 0 or more than 5 tokens, which would leave
# the list shorter than the DataFrame.
for tokens in carmodels:
    carmodelsnew.append("".join(tokens[1:]) if len(tokens) > 1 else np.nan)

##### Since the models of some cars are not included in the data set, the value nan is entered in the model variable.

In [None]:
# Number of model names collected.
len(carmodelsnew)

In [None]:
# Store the cleaned model names as a new column (assigned positionally from the list).
df["carmodels"]= carmodelsnew

In [None]:
# Preview the frame with the new carmodels column.
df.head()

##### Car prices vary a lot depending on the models so we can't ignore the carmodels. But we no longer need the variable "CarName"

In [None]:
# CarName has been split into carbrands and carmodels, so remove it in place.
df.drop(columns="CarName", inplace=True)

##### Although some values are of numeric type, they are written in string type. Fixing these will be good for the model.

In [None]:
# Distinct model names.
df["carmodels"].unique()

In [None]:
# Number words used in the "cylindernumber" column and their integer values.
cylindernum= {"eight":8,
              "five":5,
              "six":6,
              "three":3,
              "twelve":12,
              "two":2,
              "four":4}
# Map through plain Python objects and assign the result back to the column.
# replace(..., inplace=True) on df["cylindernumber"] mutates an intermediate
# Series (no effect on df under pandas Copy-on-Write) and, on a categorical
# column, only relabels the categories instead of producing numbers.
# Values missing from the mapping are kept unchanged.
df["cylindernumber"] = df["cylindernumber"].astype(object).map(
    lambda word: cylindernum.get(word, word)
)

In [None]:
# Number words used in the "doornumber" column and their integer values.
doornum={"four":4,
         "two":2}
# Map through plain Python objects and assign the result back to the column.
# replace(..., inplace=True) on df["doornumber"] mutates an intermediate
# Series (no effect on df under pandas Copy-on-Write) and, on a categorical
# column, only relabels the categories instead of producing numbers.
# Values missing from the mapping are kept unchanged.
df["doornumber"] = df["doornumber"].astype(object).map(
    lambda word: doornum.get(word, word)
)

In [None]:
# Check the dtypes after converting the number words.
df.info()

##### Detailed information about the dimensions of the car is given. Let's simplify these dimensions by gathering them under a single value.

In [None]:
# Single size feature: product of the three outer dimensions.
df["carvolume"]=df["carlength"]*df["carwidth"]*df["carheight"]

In [None]:
# Mean volume and dimensions for each body style.
df.groupby("carbody")[["carvolume","carlength","carwidth","carheight"]].mean()

In [None]:
carsizes=["carvolume","carlength","carwidth","carheight"]
plt.figure(figsize=(10,10),facecolor="goldenrod")
# One density plot per size column on a 2x2 grid (subplot slots are 1-based).
for a, i in enumerate(carsizes, start=1):
    plt.subplot(2,2,a)
    # fill= replaces kdeplot's deprecated shade= argument; x= names the
    # variable explicitly instead of relying on the positional slot.
    sns.kdeplot(x=df[i],color="orangered",fill=True)

In [None]:
# Correlation of every numeric column with carlength, sorted ascending.
# The frame still contains categorical columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to numeric columns.
df.select_dtypes(include="number").corr()["carlength"].sort_values()

In [None]:
# Correlation of every numeric column with carwidth, sorted ascending.
# The frame still contains categorical columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to numeric columns.
df.select_dtypes(include="number").corr()["carwidth"].sort_values()

In [None]:
# Correlation of every numeric column with carheight, sorted ascending.
# The frame still contains categorical columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to numeric columns.
df.select_dtypes(include="number").corr()["carheight"].sort_values()

In [None]:
# Correlation of every numeric column with carvolume, sorted ascending.
# The frame still contains categorical columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to numeric columns.
df.select_dtypes(include="number").corr()["carvolume"].sort_values()

In [None]:
# Remove the individual dimension columns and wheelbase in place; carvolume stays.
df.drop(columns=["carlength","carwidth","carheight","wheelbase"], inplace=True)

##### Let's examine the engine information of the cars

##### "Compression ratio" value is directly proportional to "boreratio" and "stroke" values

<img src="https://qph.fs.quoracdn.net/main-qimg-765b76c39cfee15ee53df139c677f7a4.webp" width="400" height="400">

##### The compression ratio is the mathematical ratio used to predict the performance of an engine (such as an internal combustion or Stirling engine).

##### In addition, there is a formula that gives the engine volume. And from this formula, it is seen that the number of cylinders, boreratio value, stroke value is directly proportional to enginesize value.

##### The "compressionratio" value varies according to fuel types.
##### And the risk value (symboling) increases as the "compressionratio" value increases.

In [None]:
# Mean compression ratio and mean risk rating (symboling) per fuel type.
df.groupby("fueltype")[["compressionratio","symboling"]].mean()

* enginesize= 0.7854 x boreratio x boreratio x stroke x number of cylinders.

##### When both formulas are examined, there is no harm in deleting the "boreratio","stroke","cylindernumber" values.

In [None]:
# Bars: cylinder number per body style. Lines: engine size and price per body
# style, divided by 10 and 1000 respectively so they fit on the bars' axis.
plt.figure(figsize=(5,5),facecolor="greenyellow")
plt.subplot(1,1,1)
sns.barplot(x="carbody",y="cylindernumber",data=df);
sns.lineplot(x=df["carbody"],y=df["enginesize"]/10,color="purple");
sns.lineplot(x=df["carbody"],y=df["price"]/1000,color="red");

In [None]:
# Remove the three engine-geometry columns in place.
df.drop(columns=["cylindernumber","boreratio","stroke"], inplace=True)

In [None]:
# Preview the frame after the feature-engineering drops.
df.head()

## Data Visualization

In [None]:
# Bars: price per brand, split by body style. The dashed purple line overlays
# the raw price of every row against its brand.
plt.figure(figsize=(20,20),facecolor="pink")
plt.subplot(1,1,1)
sns.barplot(x="carbrands",y="price",hue="carbody",data=df);
plt.plot(df["carbrands"],df["price"],marker="o",linestyle="--",color="purple");
plt.title("Price comparison by brands",fontsize=20);

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# The frame still contains categorical columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to numeric columns.
plt.figure(figsize=(7,7),facecolor="yellow")
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True,fmt=".001g",linewidths=5,linecolor="yellow");

In [None]:
# Bars: city mpg per brand. The red dash-dot line overlays the highway mpg of
# every row against its brand.
plt.figure(figsize=(20,20),facecolor="white")
plt.subplot(1,1,1)
sns.barplot(x="carbrands",y="citympg",data=df);
plt.plot(df["carbrands"],df["highwaympg"],marker="*",linestyle="-.",color="red");
plt.title("The amount of fuel used by cars.",fontsize=20);

##### As the mpg value increases, the amount of fuel consumed decreases. The brand that consumes the most fuel over the same distance is Jaguar.

In [None]:
# Two side-by-side panels of compression ratio per fuel type:
# left split by aspiration, right split by body style.
plt.figure(figsize=(10,10),facecolor="orange")
plt.subplot(1,2,1)
sns.barplot(x=df["fueltype"],y=df["compressionratio"],hue=df["aspiration"],color="red");
plt.subplot(1,2,2)
sns.barplot(x=df["fueltype"],y=df["compressionratio"],hue=df["carbody"],color="blue");

In [None]:
# Compression ratio per fuel type split by fuel system, with the risk rating
# (symboling) per fuel type drawn on the same axes in red.
plt.figure(figsize=(5,5),facecolor="orange")
plt.subplot(1,1,1)
sns.barplot(x=df["fueltype"],y=df["compressionratio"],hue=df["fuelsystem"],color="green")
sns.barplot(x=df["fueltype"],y=df["symboling"],color="red");

##### Diesel vehicles use a single fuel system, and the "compressionratio" value of diesel vehicles is much higher than that of gas (gasoline) vehicles. As the compression ratio increases, the risk of the engine also increases.

In [None]:
# Bars: price per brand split by fuel type. City and highway mpg are multiplied
# by 100 so they are visible on the price axis; the area under the highway
# curve is filled in yellow.
plt.figure(figsize=(15,15),facecolor="orange")
plt.subplot(1,1,1)
sns.barplot(x=df["carbrands"],y=df["price"],hue=df["fueltype"],color="green")
plt.plot(df["carbrands"],df["citympg"]*100,linestyle="-.",marker="*",color="red")
plt.plot(df["carbrands"],df["highwaympg"]*100,linestyle="--",marker="o",color="blue")
plt.fill_between(df["carbrands"],df["highwaympg"]*100,color="yellow");

##### For most car brands, gas (gasoline) models are more expensive than diesel models.

In [None]:
# Bars: price per brand. Red line: horsepower per brand, multiplied by 100 so
# it is visible on the price axis.
plt.figure(figsize=(15,15),facecolor="orange")
plt.subplot(1,1,1)
sns.barplot(x=df["carbrands"],y=df["price"],color="green")
sns.lineplot(x=df["carbrands"],y=df["horsepower"]*100,color="red")
plt.title("Price and horsepower chart by car brands",fontsize=15);

##### Although the price of some cars is high, the horsepower is average. Such cars do not gain from horsepower, but from the luxury design of the interior of the car. Buick can be shown as a brand that sells cars for its luxury design.

In [None]:
# Price and horsepower (times 100, to share the price scale) along the row index.
row_index = df["price"].index
plt.figure(figsize=(7,7),facecolor="pink")
plt.plot(row_index, df["price"], linestyle="--", marker=".", color="red")
plt.plot(row_index, df["horsepower"]*100, linestyle="-", marker=".", color="green")
plt.title("Price and Horsepower")
plt.xlabel("Index")
plt.ylabel("Values")
plt.legend(["Price","Horsepower"],loc="best");

In [None]:
# Engine size and horsepower along the row index.
row_index = df["enginesize"].index
plt.figure(figsize=(7,7),facecolor="white")
plt.plot(row_index, df["enginesize"], linestyle="-.", marker=".", color="red")
plt.plot(row_index, df["horsepower"], linestyle="-", marker=".", color="blue")
plt.title("Enginesize and Horsepower")
plt.xlabel("Index")
plt.ylabel("Values")
plt.legend(["Enginesize","Horsepower"],loc="best");

In [None]:
# Curb weight per body style, split by engine location.
plt.figure(figsize=(7,7),facecolor="slateblue")
plt.subplot(1,1,1)
sns.barplot(x="carbody",y="curbweight",hue="enginelocation",data=df);

In [None]:
# Engine size per engine type, split by drive wheel.
plt.figure(figsize=(7,7),facecolor="mediumspringgreen")
plt.subplot(1,1,1)
sns.barplot(x="enginetype",y="enginesize",hue="drivewheel",data=df);

## Handle Outliers

In [None]:
# Snapshot of the int64/float64 columns, used for the outlier analysis below.
df_num= df.select_dtypes(include=["int64","float64"])

In [None]:
# Box plot of every numeric column before outlier handling.
plt.figure(figsize=(15,15),facecolor="mediumspringgreen")
# Six plots per row; add rows if there are more than 12 numeric columns. The
# previous hand-rolled counter reset at 12, so slot 12 was never used and a
# 12th column would have been drawn over the first plot.
plots_per_row = 6
plot_rows = max(2, -(-len(df_num.columns) // plots_per_row))
for a, i in enumerate(df_num, start=1):
    plt.subplot(plot_rows, plots_per_row, a)
    # Pass the data as x= explicitly: the meaning of a positional Series
    # differs between seaborn versions.
    sns.boxplot(x=df[i], color="red")

##### Some car prices are much higher than normal, the reason for this should be investigated

In [None]:
# Price of every row coloured by brand and marked by body style, with
# horsepower (times 100, to share the price scale) as a dashed yellow line.
plt.figure(figsize=(10,10))
sns.scatterplot(x=df["price"].index,y=df["price"],hue=df["carbrands"],style=df["carbody"])
plt.plot(df["horsepower"].index,df["horsepower"]*100,linestyle="--",marker=".",color="yellow");

##### Prices of cars vary according to horsepower and car body.

##### If you want to see the amount of outliers in variables

In [None]:
# Per-column IQR statistics, appended in the column order of df_num. They stay
# plain lists because later cells turn them into a summary table.
lower_quarter=[]
upper_quarter=[]

lower_thresholds=[]
upper_thresholds=[]

lower_values=[]
upper_values=[]

percent=[]

for i in df_num:
    value= df_num[i]
    Q1= value.quantile(0.25)
    Q3= value.quantile(0.75)
    lower_quarter.append(Q1)
    upper_quarter.append(Q3)

    # Tukey fences: anything further than 1.5 * IQR outside the quartiles
    # counts as an outlier.
    IQR= Q3-Q1
    lower_threshold= Q1-(1.5*IQR)
    upper_threshold= Q3+(1.5*IQR)
    lower_thresholds.append(lower_threshold)
    upper_thresholds.append(upper_threshold)

    # Count the outliers on each side (each mask is computed once).
    n_lower= int((value<lower_threshold).sum())
    n_upper= int((value>upper_threshold).sum())
    lower_values.append(n_lower)
    upper_values.append(n_upper)

    percent.append(((n_lower+n_upper)/len(value))*100)

    # Cap the outliers at the fences and assign the column back to df.
    # The former `df[i].loc[index] = threshold` was chained assignment: it
    # writes to an intermediate Series, which pandas warns about and which
    # leaves df unchanged when Copy-on-Write is enabled.
    df[i]= df[i].clip(lower=lower_threshold, upper=upper_threshold)

In [None]:
# One-column frame holding the names of the analysed numeric columns.
values = pd.DataFrame(df_num.columns)

In [None]:
# Wrap each per-column statistics list in a one-column DataFrame so they can be
# concatenated side by side. Note that the outlier counts (lower_values /
# upper_values) are stored under the singular names lower_value / upper_value.
lower_quarter= pd.DataFrame(lower_quarter)
upper_quarter= pd.DataFrame(upper_quarter)
lower_thresholds= pd.DataFrame(lower_thresholds)
upper_thresholds= pd.DataFrame(upper_thresholds)
lower_value= pd.DataFrame(lower_values)
upper_value= pd.DataFrame(upper_values)
percent=pd.DataFrame(percent)

In [None]:
# Join the statistics column-wise (axis=1): one row per analysed variable.
handle_outliers= pd.concat([values,lower_quarter,upper_quarter,lower_thresholds,upper_thresholds,lower_value,upper_value,percent],axis=1)

In [None]:
# Name the summary columns in the order they were concatenated, then add the
# total outlier count as the sum of the lower and upper counts.
handle_outliers.columns=["Value","Lower Quarter (Q1)","Upper Quarter (Q3)","Lower Threshold","Upper Threshold","Number of lower outliers","Number of upper outliers",
                        "Percentage of outlier"]
handle_outliers["Total number of outliers"]= handle_outliers["Number of lower outliers"]+handle_outliers["Number of upper outliers"]

In [None]:
# Display the outlier summary table.
handle_outliers

In [None]:
# Box plot of every numeric column after the outliers were capped.
df_num_new= df.select_dtypes(include=["int64","float64"])
plt.figure(figsize=(15,15),facecolor="yellow")
# Six plots per row; add rows if there are more than 12 numeric columns. The
# previous hand-rolled counter reset at 12, so slot 12 was never used and a
# 12th column would have been drawn over the first plot.
plots_per_row = 6
plot_rows = max(2, -(-len(df_num_new.columns) // plots_per_row))
for a, i in enumerate(df_num_new, start=1):
    plt.subplot(plot_rows, plots_per_row, a)
    # Pass the data as x= explicitly: the meaning of a positional Series
    # differs between seaborn versions.
    sns.boxplot(x=df[i], color="purple")

## Let's fill in the missing values

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Rows whose carmodels value is missing.
df[df["carmodels"].isnull()==True]

In [None]:
# Remember the index labels of the rows with a missing model, to inspect them after filling.
index_miss= df[df["carmodels"].isnull()==True].index

In [None]:
# Fill missing model names with the most frequent model (first mode).
# Assign the result back: fillna(..., inplace=True) on df["carmodels"] mutates
# an intermediate Series, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df["carmodels"] = df["carmodels"].fillna(df["carmodels"].mode()[0])

In [None]:
# Re-count missing values per column after the fill.
df.isnull().sum()

In [None]:
# Show the rows that previously had no model name.
df.loc[index_miss]

##### Missing values are filled with the car model named "504".

## One Hot Encoding

In [None]:
# Frequency of each fuel type.
df["fueltype"].value_counts()

In [None]:
# Frequency of each aspiration type.
df["aspiration"].value_counts()

In [None]:
# Frequency of each engine location.
df["enginelocation"].value_counts()

In [None]:
# Replace fueltype with its indicator encoding; drop_first=True omits the first level.
# NOTE(review): assigning the result to one column presumably relies on the
# variable having exactly two levels — confirm against the value counts.
df["fueltype"]=pd.get_dummies(df["fueltype"],drop_first=True)

In [None]:
# Replace aspiration with its indicator encoding; drop_first=True omits the first level.
# NOTE(review): assigning the result to one column presumably relies on the
# variable having exactly two levels — confirm against the value counts.
df["aspiration"]=pd.get_dummies(df["aspiration"],drop_first=True)

In [None]:
# Replace enginelocation with its indicator encoding; drop_first=True omits the first level.
# NOTE(review): assigning the result to one column presumably relies on the
# variable having exactly two levels — confirm against the value counts.
df["enginelocation"]=pd.get_dummies(df["enginelocation"],drop_first=True)

In [None]:
# Preview the frame after the indicator encoding.
df.head()

## Label Encoder

In [None]:
# Number of rows per brand.
df["carbrands"].value_counts()

In [None]:
# Report how many distinct brands there are (fixes the "Totel" typo in the message).
print("Total number of car brands: " + str(len(df["carbrands"].value_counts())))

In [None]:
# Distinct model names after filling the missing ones.
df["carmodels"].unique()

In [None]:
# Report how many distinct models there are (fixes the "Totel" typo in the message).
print("Total number of car models: " + str(len(df["carmodels"].unique())))

##### The number of models of vehicles is very large. It would be ridiculous to convert the "carmodels" variable with the LabelEncoder. Therefore, we need to reduce the number of models with a logical method.

In [None]:
# Distinct brand names.
df["carbrands"].unique()

In [None]:
# One panel per brand on a 5x5 grid: price per model as bars and horsepower
# (times 100, to share the price scale) as a line.
plt.figure(figsize=(20,20),facecolor="palegreen")
brands = df["carbrands"].unique()  # computed once instead of on every iteration
for a, value in enumerate(brands, start=1):
    brand_rows = df[df["carbrands"]==value]
    plt.subplot(5,5,a)
    # Take x and y from the same brand subset; the bar plot previously paired
    # the subset's models with the price column of the whole frame.
    sns.barplot(x=brand_rows["carmodels"], y=brand_rows["price"], color="aqua")
    # x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally,
    # so the former positional call raises a TypeError there.
    sns.lineplot(x=brand_rows["carmodels"], y=brand_rows["horsepower"]*100, color="crimson")
    plt.legend(["Horsepower"],loc="best",labelcolor="crimson",edgecolor="crimson")

In [None]:
# Integer-encode each remaining categorical/text column with its own, freshly
# fitted LabelEncoder (same columns and order as before).
for column in ("carbody", "drivewheel", "enginetype", "fuelsystem", "carbrands", "carmodels"):
    df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
# Preview the fully encoded frame.
df.head()

In [None]:
# Number of rows per encoded model label.
df["carmodels"].value_counts()

In [None]:
# Density of the encoded model labels. fill= replaces kdeplot's deprecated
# shade= argument; x= names the variable explicitly.
sns.kdeplot(x=df["carmodels"],fill=True,color="green");

## Separating the dataset into train and test

In [None]:
# Features: every column except the target.
X=df.drop("price",axis=1)
# Target: price, kept as a one-column DataFrame (double brackets).
y=df[["price"]]

In [None]:
# Use 75% of the rows for training; random_state fixes the split for reproducibility.
X_train,X_test,y_train,y_test= train_test_split(X,y,train_size=0.75,random_state=42)

## Feature Scaling

In [None]:
# Preview the training features before scaling.
X_train.head()

In [None]:
# Columns to standardise (zero mean, unit variance).
scale_value= ["citympg","highwaympg","carvolume","curbweight","enginesize","horsepower","peakrpm","carmodels"]
X_train_scaled=X_train.copy()
X_test_scaled=X_test.copy()
for i in scale_value:
    # Learn mean and standard deviation from the training column only and
    # apply that same transformation to the test column. Fitting a second
    # scaler on the test set (as before) scales the two sets inconsistently
    # and leaks test-set statistics into the evaluation.
    scaler = StandardScaler().fit(X_train[[i]])
    X_train_scaled[i] = scaler.transform(X_train[[i]]).ravel()
    X_test_scaled[i] = scaler.transform(X_test[[i]]).ravel()

In [None]:
# The unscaled training features are left untouched by the scaling step.
X_train.head()

In [None]:
# Preview the standardised copy of the training features.
X_train_scaled.head()

##### The dataset is sorted by car brand. `train_test_split` shuffles the rows by default, so the training data is mixed and the cross-validation scoring works well.

In [None]:
# Dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Preview the (shuffled) training features.
X_train.head()

## MODEL

### RANDOM FOREST

In [None]:
# First random-forest search grid: tree depth and number of trees.
rf_params_1={"max_depth":[5,10,20],
           "n_estimators":[100,500,1000]}

In [None]:
# 10-fold grid search over rf_params_1, scored by R^2, using all CPU cores.
# np.ravel turns the one-column y_train frame into the 1-D target scikit-learn
# expects (a column vector triggers DataConversionWarning); random_state makes
# the forests, and therefore the selected parameters, reproducible.
rf_grid_1= GridSearchCV(RandomForestRegressor(random_state=42),rf_params_1,cv=10,n_jobs=-1,scoring="r2").fit(X_train,np.ravel(y_train))

In [None]:
# Best depth / tree-count combination found by the first search.
rf_grid_1.best_params_

In [None]:
# Second random-forest search grid: minimum samples per split and per leaf.
rf_params_2={"min_samples_split":[2,4,6],
             "min_samples_leaf":[2,4,6]}

In [None]:
# 10-fold grid search over rf_params_2 with depth and tree count held fixed.
# np.ravel turns the one-column y_train frame into the 1-D target scikit-learn
# expects (a column vector triggers DataConversionWarning); random_state makes
# the forests, and therefore the selected parameters, reproducible.
rf_grid_2= GridSearchCV(RandomForestRegressor(max_depth=10,n_estimators=100,random_state=42),rf_params_2,cv=10,n_jobs=-1,scoring="r2").fit(X_train,np.ravel(y_train))

In [None]:
# Best split / leaf size combination found by the second search.
rf_grid_2.best_params_

In [None]:
# Final random forest with hard-coded hyper-parameters, fitted on the unscaled
# training features. np.ravel passes the target as the 1-D array scikit-learn
# expects instead of a column vector (avoids DataConversionWarning).
rf_tuned= RandomForestRegressor(max_depth=10,n_estimators=100,min_samples_split=4,min_samples_leaf=2,random_state=42).fit(X_train,np.ravel(y_train))

In [None]:
# Predict prices for the held-out test features.
y_pred_rf= rf_tuned.predict(X_test)

In [None]:
# R^2 of the random-forest predictions on the test set.
r2_score(y_test,y_pred_rf)

In [None]:
# Wrap the predictions in a DataFrame that shares y_test's index labels.
y_pred_rf= pd.DataFrame(y_pred_rf,index=y_test.index)

In [None]:
# Real test prices (line) against random-forest predictions (red points).
plt.figure(figsize=(7,7),facecolor="mediumspringgreen")
plt.subplot(1,1,1)
# y_test is in shuffled order after the split; sort it by index so the line
# runs left to right instead of zig-zagging across the plot.
y_test_sorted = y_test.sort_index()
plt.plot(y_test_sorted.index, np.ravel(y_test_sorted), label="Real Prices")
plt.scatter(x=y_test.index,y=y_pred_rf,color="red",label="Predicted Price")
# Explicit labels tie each legend entry to its artist instead of relying on
# matplotlib's internal artist ordering.
plt.legend()
plt.title("Random Forest Prediction");

### GBM

In [None]:
# First gradient-boosting search grid: learning rate and tree depth.
gbm_params={"learning_rate": [0.001,0.01,0.1],
              "max_depth":  [5,20,35]}

In [None]:
# 10-fold grid search over gbm_params, scored by R^2 (not fitted yet).
gbm_grid= GridSearchCV(GradientBoostingRegressor(),gbm_params,cv=10,n_jobs=-1,scoring="r2")

In [None]:
# Run the search. np.ravel passes the target as the 1-D array scikit-learn
# expects instead of a column vector (avoids DataConversionWarning).
gbm_grid.fit(X_train,np.ravel(y_train))

In [None]:
# Best learning-rate / depth combination found by the first search.
gbm_grid.best_params_

In [None]:
# Second gradient-boosting search: number of boosting stages and row subsampling,
# with learning rate and depth held fixed.
gbm_params_1={"n_estimators": [100,1000,2000],
              "subsample": [1,0.5,0.75]}
# random_state makes the search reproducible: with subsample < 1 every stage is
# fitted on a random subset of rows.
gbm_grid_1= GridSearchCV(GradientBoostingRegressor(learning_rate=0.1,max_depth=5,random_state=42),gbm_params_1,cv=10,n_jobs=-1,scoring="r2")
# np.ravel passes the target as the 1-D array scikit-learn expects instead of a
# column vector (avoids DataConversionWarning).
gbm_grid_1.fit(X_train,np.ravel(y_train))
gbm_grid_1.best_params_

In [None]:
# Final gradient-boosting model with hard-coded hyper-parameters.
# NOTE(review): learning_rate is 0.01 here, while the n_estimators/subsample
# search was run with learning_rate=0.1 — confirm which value is intended.
# np.ravel passes the target as the 1-D array scikit-learn expects instead of a
# column vector (avoids DataConversionWarning).
gbm_model= GradientBoostingRegressor(learning_rate=0.01,max_depth=5,n_estimators=1000,subsample=0.5,random_state=42).fit(X_train,np.ravel(y_train))

In [None]:
# Predict prices for the held-out test features.
y_pred_gbm= gbm_model.predict(X_test)

In [None]:
# R^2 of the gradient-boosting predictions on the test set.
r2_score(y_test,y_pred_gbm)

In [None]:
# Real test prices (line) against gradient-boosting predictions (red points).
plt.figure(figsize=(7,7),facecolor="mediumspringgreen")
plt.subplot(1,1,1)
# y_test is in shuffled order after the split; sort it by index so the line
# runs left to right instead of zig-zagging across the plot.
y_test_sorted = y_test.sort_index()
plt.plot(y_test_sorted.index, np.ravel(y_test_sorted), label="Real Prices")
# The predictions are in y_test's original row order, so pair them with the
# unsorted index.
plt.scatter(x=y_test.index,y=y_pred_gbm,color="red",label="Predicted Price")
# Explicit labels tie each legend entry to its artist instead of relying on
# matplotlib's internal artist ordering.
plt.legend()
plt.title("Gradient Boosting Machine Prediction");