In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the car price CSV from the Kaggle input directory into the
# notebook-level DataFrame `df`, which the cells below read from.
df = pd.read_csv('/kaggle/input/car-price-prediction/CarPrice_Assignment.csv')
# Show the first rows as the cell output.
df.head()

In [None]:
df.shape

In [None]:
df.info()

- There are 26 columns and 205 rows in this dataset. 
- There are no null values.

In [None]:
df.describe()

- Search for outliers in the following columns: curbweight, enginesize, and price.
    

In [None]:
def find_outliers(column, data=None, iqr_multiplier=1.5):
    """Print the IQR fences for `column` and return the rows outside them.

    A value counts as an outlier when it is greater than
    ``Q3 + iqr_multiplier * IQR`` or less than ``Q1 - iqr_multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column : str
        Name of the numeric column to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level `df` is used, so
        existing calls such as ``find_outliers('price')`` keep working.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond the quartiles the fences are placed.

    Returns
    -------
    pandas.DataFrame
        The rows of the inspected frame whose `column` value lies outside
        the fences (empty when there are none).
    """
    if data is None:
        # Fall back to the notebook-level frame for backward compatibility.
        data = df

    values = data[column]
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1

    upper_limit = q3 + iqr_multiplier * IQR
    lower_limit = q1 - iqr_multiplier * IQR

    print('upper limit:', upper_limit)
    print('lower limit:', lower_limit)
    outliers = data[(values > upper_limit) | (values < lower_limit)]
    print('There are {} rows contains outliers in the {} column.'.format(len(outliers), column))
    return outliers

In [None]:
find_outliers('price')

In [None]:
find_outliers('curbweight')

In [None]:
find_outliers('enginesize')

In [None]:
df['CarName'].unique()

In [None]:
# Split every CarName once at its first space: the text before it is the make,
# everything after it is the model (an empty string when there is no space).
name_parts = df['CarName'].str.partition(' ')
df['car_make'] = name_parts[0]
df['car_model'] = name_parts[2]

In [None]:
df.head()

In [None]:
df['car_make'].unique()

In [None]:
# Misspelled, abbreviated, or differently-cased makes mapped to the single
# spelling they should share.
make_corrections = {
    'vokswagen': 'volkswagen',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'vw': 'volkswagen',
    'toyouta': 'toyota',
    'maxda': 'mazda',
}
df['car_make'] = df['car_make'].replace(make_corrections)
# Display the cleaned set of makes to confirm the fixes.
df['car_make'].unique()

In [None]:
# value_counts() sorts by frequency (descending), so its index gives the bar
# order from the most to the least frequent make.
order = df['car_make'].value_counts().index
sns.countplot(data=df, x='car_make', order=order)
# Rotate the make names so the tick labels do not overlap.
plt.xticks(rotation=90)
plt.show()

- Toyota has the most cars in the dataset, while Mercury has the fewest.

In [None]:
# Names of all object-dtype columns in `df`.
cat_col = df.select_dtypes(include='object').columns
# Display the column names as the cell output.
cat_col

In [None]:
# Categorical column name -> human-readable title for its pie chart.
columns = {
    'fueltype': 'Fuel Type',
    'doornumber': 'Door Number',
    'carbody': 'Car Body',
    'enginelocation': 'Engine Location',
    'aspiration': 'Aspiration',
    'drivewheel': 'Drive Wheel',
    'enginetype': 'Engine Type',
    'cylindernumber': 'Cylinder Number',
    'fuelsystem': 'Fuel System'
}

plt.figure(figsize=(10, 10))

# One pie per column on a 3x3 grid, showing each category's share of the rows.
for i, (column, title) in enumerate(columns.items(), start=1):
    plt.subplot(3, 3, i)
    # Count the categories once and reuse the result for both the wedge
    # sizes and their labels.
    counts = df[column].value_counts()
    plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%')
    plt.title(title)

plt.tight_layout()
plt.show()


In [None]:
# Identify categorical columns
cat_col = ['fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
       'fuelsystem']

# Create a figure with a specific size
plt.figure(figsize=(20, 15))

# Loop through the categorical columns and their subplot positions
for i, column in enumerate(cat_col, start=1):
    plt.subplot(3, 3, i)  # Adjust subplot position
    sns.boxplot(x=column, y='price', data=df)
    plt.title(f'{column} vs price')  # Optional: Add a title for each subplot

# Adjust layout to prevent overlap
plt.tight_layout()

# Display the plot
plt.show()


- Cars with the diesel fuel type are relatively expensive compared to cars with the gas fuel type. About 90% of the cars in the dataset use gas.
- Turbo aspiration makes cars more expensive. Even so, approximately 18% of the cars in the dataset have turbo aspiration.
- The hatchback, sedan, and wagon body types are relatively cheaper.
- Cars with the rwd drive wheel are more expensive than fwd and 4wd cars. Despite the higher price, rwd is still common at 37.1% of the cars, while 4wd is the least common at 4.4%.
- A rear engine makes cars significantly more expensive. Only 1.5% of the cars in the dataset have a rear engine, possibly because of the price.
- The ohc and ohcf engine types are cheaper than the other alternatives, while the ohcv engine type comes in a higher price range.

In [None]:
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the replacement the seaborn documentation gives for it.
sns.histplot(df['price'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Mean price per make, most expensive first, as a two-column frame.
df_avg_price = (
    df.groupby('car_make')['price']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# Each make has a single (already averaged) value, so no error bars are drawn.
sns.barplot(x='car_make', y='price', data=df_avg_price, errorbar=None)
plt.title('car make vs average price')
plt.xticks(rotation=90)
plt.show()

- On average, the most expensive makes are `Jaguar`, `Buick`, and `Porsche`, while `Chevrolet` is the most affordable.
- Most cars fall within a price range of 7,000 to 20,000.

In [None]:
# All non-object (numeric) columns of the dataset.
num_col = df.select_dtypes(exclude=['object'])

# `car_ID` is dropped from the model features later in the notebook and
# appears to be a row identifier, so its correlations carry no meaning;
# leave it out of the matrix (ignored if the column is absent).
corr_matrix = num_col.drop(columns=['car_ID'], errors='ignore').corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation between numeric columns')
# Render the figure explicitly instead of leaving the Axes repr as output.
plt.show()

- `carlength`, `carwidth`, `curbweight`, `enginesize` and `horsepower` seem to have a positive correlation with `price`.
- `citympg` and `highwaympg` seem to have a negative correlation with `price`.

In [None]:
# Work on a copy so the raw frame used by the plots above stays untouched.
df_copy = df.copy()
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(exclude='object').columns

# Integer-encode every string column. A fresh encoder is fitted per column and
# kept in `label_encoders`, so the code -> label mapping of any column can be
# recovered later with `label_encoders[column].inverse_transform(...)`.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df_copy[column] = label_encoder.fit_transform(df_copy[column])
    label_encoders[column] = label_encoder

# Feature scaling
# Standardize the numeric *features* only. The target (`price`) is left in its
# original units so the error metrics computed on it stay interpretable, and
# `car_ID` is skipped because it is dropped before modelling.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split. Fit it on the training rows only if a scale-sensitive model is used.
scaled_columns = numerical_columns.difference(['price', 'car_ID'])
scaler = StandardScaler()
df_copy[scaled_columns] = scaler.fit_transform(df_copy[scaled_columns])

In [None]:
# Splitting the dataset
# `price` is the target; `CarName` and `car_ID` are excluded from the features.
X = df_copy.drop(['price', 'CarName', 'car_ID'], axis=1)
y = df_copy['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
# Seed the forest as well as the split: without a fixed random_state the
# metrics and feature importances change on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions: R-squared and mean squared error.
r2_square = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f" R-squared: {r2_square}")
print(f'Mean Squared Error: {mse}')

In [None]:
# Importance score of each feature, in the column order of X.
importances = model.feature_importances_

# Pair every feature name with its score and rank them, highest first.
importance_table = pd.DataFrame(dict(Feature=X.columns, Importance=importances))
feature_importances = importance_table.sort_values('Importance', ascending=False)

print(feature_importances)

- `enginesize` and `curbweight` are the most important features.