<a href="https://colab.research.google.com/github/shakeraema/Dataset_Manipulation/blob/main/Task_PDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [None]:
import kagglehub

# Download the latest version of the Kaggle car-price dataset via kagglehub.
# NOTE(review): `path` is only printed here; the later read_csv cell loads the
# CSV from a Google Drive path instead — confirm which source is intended.
path = kagglehub.dataset_download("sukhmandeepsinghbrar/car-price-prediction-dataset")

print("Path to dataset files:", path)

In [None]:
# Raw-GitHub URL for the dataset CSV.
# NOTE(review): this variable is not referenced by any later cell in this notebook.
dataset_url = "https://raw.githubusercontent.com/sukhmandeepsinghbrar/car-price-prediction-dataset/master/car%20price%20prediction.csv"

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV read below is reachable.
# This only works inside a Google Colab runtime.
drive.mount('/content/drive')

In [None]:
# Location of the CSV inside the mounted Drive (requires the mount cell above).
data = '/content/drive/MyDrive/PDS-Data/cardekho.csv'

# Load the whole dataset into the working frame used by every later cell.
df = pd.read_csv(data)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Total number of cells in the frame (rows x columns).
df.size

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Selling price by fuel type. `fuel` still holds its raw labels here (it is
# integer-encoded in a later cell), so each category forms one vertical strip.
plt.figure(figsize=(8, 5))
plt.scatter(df['fuel'], df['selling_price'])
plt.xlabel('Fuel')
plt.ylabel('Selling price')
plt.title('Fuel vs. Selling price')
# Render explicitly, as the other plotting cells do, so the cell does not echo
# the Text repr returned by plt.title().
plt.show()

In [None]:
# Selling price against model year, drawn semi-transparent so dense regions show up.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="year", y="selling_price", alpha=0.5, ax=ax)
ax.set_xlabel("Year")
ax.set_ylabel("Selling Price")
ax.set_title("Price Depreciation Over the Years")
plt.show()

In [None]:
# Selling price against engine capacity.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="engine", y="selling_price", alpha=0.5, ax=ax)
ax.set_xlabel("Engine Capacity (CC)")
ax.set_ylabel("Selling Price")
ax.set_title("Impact of Engine Size on Selling Price")
plt.show()

In [None]:
# Selling price by transmission type. The column is label-encoded only in a
# later cell, so the axis shows the raw category labels here; the old title's
# "M=1, A=0" legend described codes that do not exist yet at this point.
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df["transmission"], y=df["selling_price"], alpha=0.5)
plt.xlabel("Transmission")
plt.ylabel("Selling Price")
plt.title("Impact of Transmission on Selling Price")
plt.show()

In [None]:
# Selling price by ownership category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="owner", y="selling_price", alpha=0.5, ax=ax)
ax.set_xlabel("Owner")
ax.set_ylabel("Selling Price")
ax.set_title("Impact of Owner on Selling Price")
plt.show()

In [None]:
# Same year-vs-price view as the earlier depreciation plot, under a different title.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="year", y="selling_price", alpha=0.5, ax=ax)
ax.set_xlabel("Year")
ax.set_ylabel("Selling Price")
ax.set_title("Newer car Selling Price")
plt.show()

In [None]:
# Selling price against mileage (fully opaque markers, unlike the plots above).
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="mileage(km/ltr/kg)", y="selling_price", ax=ax)
ax.set_xlabel("Mileage")
ax.set_ylabel("Selling Price")
ax.set_title("Impact of Mileage on Selling Price")
plt.show()

In [None]:
# Mileage vs. price again, with each point coloured by its selling price.
mileage = df['mileage(km/ltr/kg)']
price = df["selling_price"]

plt.figure(figsize=(8, 5))
plt.scatter(mileage, price, c=price, cmap='viridis')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.title('Mileage vs. Price')
# Assign the colorbar to `_` so the cell does not echo its repr.
_ = plt.colorbar(label='Selling Price')

In [None]:
# Distinct fuel labels before they are encoded.
df['fuel'].unique()

In [None]:
# Number of distinct fuel labels.
len(df['fuel'].unique())

In [None]:
# Integer codes assigned to the four fuel categories.
fuel_codes = {'Diesel': 0, 'Petrol': 1, 'LPG': 2, 'CNG': 3}

# Series.map turns every value that is not a dict key into NaN, so running this
# cell a second time (when the column already holds the codes) would wipe the
# column. Encode only while the column still holds raw labels, and fail loudly
# on a label the mapping does not cover instead of silently producing NaN.
if not pd.api.types.is_numeric_dtype(df['fuel']):
    unexpected = set(df['fuel'].dropna().unique()) - set(fuel_codes)
    if unexpected:
        raise ValueError(f"Unmapped fuel categories: {sorted(unexpected)}")
    df['fuel'] = df['fuel'].map(fuel_codes)

In [None]:
# Check the fuel values after the mapping above.
df['fuel'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Single encoder instance reused by the cells below. Each fit_transform call
# refits it, so it only remembers the classes of the last column it was fit on.
le = LabelEncoder()

In [None]:
# Replace seller_type with the integer codes produced by the label encoder.
df['seller_type'] = le.fit_transform(df['seller_type'])

In [None]:
# Inspect the encoded seller_type column.
df['seller_type']

In [None]:
# Distinct seller_type codes after encoding.
df['seller_type'].unique()

In [None]:
# Number of distinct values in the name column.
len(df['name'].unique())

In [None]:
# Remaining columns to label-encode in the next cell.
columns = ['transmission', 'owner', 'name']

In [None]:
# Label-encode each listed column in place. The shared encoder `le` is refit on
# every iteration, so after the loop it holds the classes of the last column only.
for column in columns:
    df[column] = le.fit_transform(df[column])

In [None]:
# Show the distinct codes present in each label-encoded column.
for encoded_col in columns:
    print(f"Unique values in {encoded_col}:", df[encoded_col].unique(), sep="\n")

In [None]:
# Preview the frame after the encoding steps.
df.head()

In [None]:
# Missing-value count per column before imputation.
df.isnull().sum()

In [None]:
# Mean-impute every numeric column. Assign the result back instead of calling
# fillna(..., inplace=True) on `df[column]`: that is chained assignment on a
# temporary Series, which emits a FutureWarning in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
for column in df.select_dtypes(include=np.number).columns:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# df["engine"]= df["engine"].fillna(df["engine"].mean())
# df["seats"]= df["seats"].fillna(df["seats"].mean())

In [None]:
# Missing-value count per column after the numeric imputation.
df.isnull().sum()

In [None]:
# max_power is not numeric yet at this point, so the mean-imputation loop above
# skipped it. Coerce it to numbers; unparseable entries become NaN.
max_power_numeric = pd.to_numeric(df["max_power"], errors='coerce')

# Impute with the column mean, matching how the other numeric columns were
# handled. The previous fillna(0).astype(int) recorded a power of 0 for missing
# values and truncated the fractional part of every reading.
df["max_power"] = max_power_numeric.fillna(max_power_numeric.mean())

In [None]:
# Confirm no missing values remain after cleaning max_power.
df.isnull().sum()

In [None]:
# Drop the name column before modelling. errors='ignore' keeps the cell
# re-runnable: without it a second execution raises KeyError because the
# column is already gone.
df = df.drop(columns=['name'], errors='ignore')

In [None]:
# Pairwise correlation between all columns of the cleaned frame.
# NOTE(review): relies on every remaining column being numeric after the
# encoding and cleaning cells above.
correlation_matrix = df.corr()

In [None]:
# Annotated heatmap of the correlation matrix computed above.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Feature matrix: every column except the target. Using the whole frame as X
# would hand the model the selling price it is supposed to predict (target
# leakage) and make the evaluation metrics meaningless.
X = df.drop(columns=['selling_price'])

# Target vector.
y = df['selling_price']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training split only, then apply the same transform to
# the test split.
# NOTE(review): X_train_scaled / X_test_scaled are not referenced by any later
# cell in this notebook; the pipeline below contains its own StandardScaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Sanity-check the shapes of the train/test splits.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Two-stage model: standardise the features, then fit ordinary least squares.
scaling_step = ("scaler", StandardScaler())
regression_step = ("model", LinearRegression())
pipeline = Pipeline(steps=[scaling_step, regression_step])

In [None]:
# Fit the scaler and regression on the (unscaled) training split; the pipeline scales internally.
pipeline.fit(X_train, y_train)

In [None]:
# Predict selling prices for the held-out test split.
y_predict = pipeline.predict(X_test)

In [None]:
# Predicted against actual prices for the test split.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, y_predict, marker="o")
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs. Predicted Prices")
plt.show()

In [None]:
# First five actual prices in the test split.
y_test[:5]

In [None]:
# Score the test-set predictions with three standard regression metrics.
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

for metric_label, metric_value in (
    ("Mean Absolute Error", mae),
    ("Mean Squared Error", mse),
    ("R2 Score", r2),
):
    print(f"{metric_label}: {metric_value}")

In [None]:
# Number of feature columns the pipeline was trained on.
num_features = X_train.shape[1]

In [None]:
# Allocate a 50-row synthetic input grid with one column per feature.
Xfit = np.zeros((50, num_features))

In [None]:
# Fill every column of the grid with the same 50 evenly spaced values from -1
# to 11, broadcasting one column vector across all feature columns.
Xfit[:] = np.linspace(-1, 11, 50)[:, np.newaxis]

In [None]:
# Predict prices for the synthetic grid.
# NOTE(review): each grid row sets every feature to the same value in [-1, 11];
# presumably that is outside the realistic range of most features — confirm
# this grid is what was intended.
yfit = pipeline.predict(Xfit)

In [None]:
# First five actual prices in the test split (same view as shown earlier).
y_test[:5]

In [None]:
# Mileage column, used as the x-axis of the final plot.
x = df['mileage(km/ltr/kg)']

In [None]:
# Actual prices against mileage, overlaid with the model's prediction as
# mileage alone is varied.
mileage_col = 'mileage(km/ltr/kg)'

# This cell used to plot an undefined `xfit` (only `Xfit` existed), which
# raised NameError. Build the mileage grid here, spanning the observed range.
xfit = np.linspace(x.min(), x.max(), 50)

# Hold every other feature at its training mean and sweep only mileage, so the
# line shows the effect of the plotted variable. A DataFrame carrying the
# training column names matches what the pipeline was fit on.
Xfit_mileage = pd.DataFrame(
    np.tile(X_train.mean().to_numpy(), (xfit.size, 1)),
    columns=X_train.columns,
)
Xfit_mileage[mileage_col] = xfit
yfit = pipeline.predict(Xfit_mileage)

plt.figure(figsize=(8, 5))
plt.scatter(x, y, color='blue', alpha=0.5, label="Actual Prices")
plt.plot(xfit, yfit, color='red', linewidth=2, label="Fitted Line")
# `x` holds mileage, not year, so label the axis accordingly.
plt.xlabel("Mileage (km/ltr/kg)")
plt.ylabel("Selling Price")
plt.title("Car Price Prediction using Linear Regression")
plt.legend()
plt.show()