In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw car listings from the CSV in the working directory and preview
# the first rows.
data = pd.read_csv("car_price_prediction.csv")
data.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Summary statistics of the raw frame.
data.describe()

In [None]:
# Discard the ID column before any analysis.
data = data.drop(columns=["ID"])

In [None]:
# 50-bin histograms of the frame's columns, drawn on one 10x6 figure.
data.hist(bins=50, figsize=(10, 6))
plt.show()

In [None]:
# One bar chart of value counts per categorical column, stacked vertically
# in a single 7-row figure.
plt.figure(figsize=(20, 20))

categorical_columns = [
    "Manufacturer",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Wheel",
    "Color",
]
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(7, 1, position)
    data[column].value_counts().plot(kind="bar")
plt.show()

In [None]:
# Box plot of the Price column.
sns.boxplot(data=data["Price"])

In [None]:
# Interquartile-range fences for Price: ll / ul sit 1.5 * IQR below the first
# quartile and above the third quartile respectively.
p25 = data["Price"].quantile(0.25)
p75 = data["Price"].quantile(0.75)
iqr = p75 - p25
fence = 1.5 * iqr
ul = p75 + fence
ll = p25 - fence

In [None]:
# Keep only the rows whose Price lies strictly between the ll and ul bounds.
# The explicit .copy() makes data1 an independent frame: the following cells
# assign columns on data1, and doing that on a filtered slice of `data`
# triggers SettingWithCopyWarning on pandas versions without copy-on-write.
data1 = data[(data["Price"] < ul) & (data["Price"] > ll)].copy()
data1

In [None]:
# Replace the "-" entries in Levy with 0.
# NOTE(review): presumably "-" marks a missing levy — confirm against the data source.
data1["Levy"] = data1["Levy"].replace("-", 0)

In [None]:
# Cast Levy to an integer dtype.
data1["Levy"] = data1["Levy"].astype(int)

In [None]:
# Fold the levy into the price: from here on Price holds price + levy.
data1["Price"] = data1["Price"] + data1["Levy"]

In [None]:
# reset_index returns a new frame rather than modifying data1; the result
# used to be discarded, so data1 kept the gapped index left by the row filter.
# Assign it back so the index is a clean 0..n-1 range, then preview the frame.
data1 = data1.reset_index(drop=True)
data1.head()

In [None]:
# Combine manufacturer and model into one label, separated by a single space.
data1["Vehicle name"] = data1["Manufacturer"] + " " + data1["Model"]

In [None]:
# Remove the columns that are no longer used as features.
data1 = data1.drop(columns=["Levy", "Manufacturer", "Model", "Doors"])

In [None]:
# Show the four least frequent categories (value_counts sorts descending).
data1["Category"].value_counts().tail(4)

In [None]:
# Keep only rows whose Category occurs more than 100 times in data1.
category_sizes = data1["Category"].map(data1["Category"].value_counts())
data1 = data1[category_sizes > 100]

In [None]:
# Convert Mileage to strings so string operations can be applied to it.
data1["Mileage"] = data1["Mileage"].astype(str)

In [None]:
def remove_km(x):
    """Return ``x`` with every occurrence of the substring " km" removed."""
    return "".join(x.split(" km"))


def remove_turbo(x):
    """Return ``x`` with every occurrence of the substring " Turbo" removed."""
    return "".join(x.split(" Turbo"))

In [None]:
# Strip the " km" unit text from every Mileage string.
data1["Mileage"] = data1["Mileage"].apply(remove_km)

In [None]:
# Cast Mileage to an integer dtype.
data1["Mileage"] = data1["Mileage"].astype(int)

In [None]:
# Engine volume: stringify, strip the " Turbo" marker, then parse as float.
data1["Engine volume"] = (
    data1["Engine volume"]
    .astype(str)
    .apply(remove_turbo)
    .astype(float)
)

In [None]:
# Count how many rows carry each distinct Wheel value.
data1["Wheel"].value_counts()

In [None]:
def encode_wheel(x):
    """Return 0 when ``x`` equals "Left wheel" and 1 for any other value."""
    return 0 if x == "Left wheel" else 1


# Stringify Wheel, map each label to its binary code, and store it as int.
data1["Wheel"] = data1["Wheel"].astype(str).apply(encode_wheel).astype(int)

In [None]:
# Re-check column dtypes and non-null counts of data1 at this point.
data1.info()

In [None]:
# Replace each distinct Vehicle name with an integer code using a LabelEncoder.
label_encoder = preprocessing.LabelEncoder()
data1["Vehicle name"] = label_encoder.fit_transform(data1["Vehicle name"])

In [None]:
# Integer-encode the remaining categorical columns. The shared encoder is
# refitted for every column, so only the mapping of the last one stays on it.
for column in [
    "Gear box type",
    "Category",
    "Leather interior",
    "Fuel type",
    "Drive wheels",
    "Color",
]:
    data1[column] = label_encoder.fit_transform(data1[column])

In [None]:
# One-hot encode the categorical features and replace the original columns
# with the resulting indicator columns.
onehot_columns = [
    "Prod. year",
    "Category",
    "Leather interior",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Color",
    "Vehicle name",
]

enc = OneHotEncoder(handle_unknown="ignore")

# fit_transform returns a SciPy sparse matrix by default; handing that
# straight to pd.DataFrame does not produce one column per category, so it is
# densified first (this trades memory for simplicity).
encoded = enc.fit_transform(data1[onehot_columns])

# Reuse data1's index: data1 has a gapped index after the earlier row filters,
# and join() aligns on the index, so a fresh 0..n-1 index would pair rows
# incorrectly and introduce NaNs. String column names from the encoder also
# keep every feature name in the final frame a string.
enc_data = pd.DataFrame(
    encoded.toarray(),
    index=data1.index,
    columns=enc.get_feature_names_out(),
)

data1 = data1.drop(onehot_columns, axis=1).join(enc_data)

In [None]:
# Display the current state of data1.
data1

In [None]:
# Scatter each feature against Price, then show a correlation heatmap.
#
# Several of the candidate columns are dropped by the one-hot encoding cell,
# and indexing data1 with a missing column raises KeyError, so only the
# candidates that are still present in data1 are plotted.
candidate_features = [
    "Prod. year",
    "Category",
    "Leather interior",
    "Fuel type",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Gear box type",
    "Drive wheels",
    "Wheel",
    "Color",
    "Airbags",
    "Vehicle name",
]
plot_features = [name for name in candidate_features if name in data1.columns]

n_cols = 2
n_rows = max(1, (len(plot_features) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, name in zip(flat_axes, plot_features):
    ax.scatter(data1[name], data1["Price"])
    ax.set(title=f"Price vs {name}", xlabel=name, ylabel="Price")

# Hide any grid cells that did not receive a plot.
for ax in flat_axes[len(plot_features):]:
    ax.set_visible(False)
fig.tight_layout()

# Correlate only the plotted features and Price: an annotated heatmap over
# every one-hot indicator column would be unreadable, and non-numeric columns
# cannot be correlated at all.
corr_columns = plot_features + ["Price"]
plt.figure(figsize=(16, 10))
sns.heatmap(
    data1[corr_columns].select_dtypes("number").corr(),
    cmap="YlGnBu",
    annot=True,
    center=0,
)
plt.show()


In [None]:
# Separate the target (Price) from the predictors (every other column).
data3 = data1["Price"]
data2 = data1.drop(columns="Price")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data2,
    data3,
    test_size=0.2,
    random_state=52,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fit a linear regression baseline on the training split.
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_train)

# The training R^2 used to be computed and then never shown. Report it next
# to the held-out R^2 so the gap between the two is visible.
score = r2_score(y_train, pred)
print("Linear regression R^2 (train):", score)
print("Linear regression R^2 (test):", model.score(X_test, y_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 50 trees and a fixed seed; fit() returns the estimator
# itself, so construction and fitting can be chained.
regressor = RandomForestRegressor(random_state=0, n_estimators=50).fit(
    X_train, y_train
)

# R^2 on the held-out split.
print(regressor.score(X_test, y_test))