In [None]:
#Understand the Problem Statement
#The objective is to model the price of cars using various independent variables to help the company understand pricing dynamics in the American market.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
data = pd.read_csv("C:\\Users\\HP\\OneDrive\\POWER BI\\CarPrice_Assignment.csv")

In [None]:
#CHEK THE DATA

In [3]:
# Basic info about the dataset
print(data.info())
print(data.describe())

# Check for missing values
print(data.isnull().sum())

# Check for duplicates
print(data.duplicated().sum())

# Unique values in each column
for column in data.columns:
    print(f'Column: {column}, Unique Values: {data[column].nunique()}')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [None]:
#DATA PRE-PROCESSING

In [None]:
# Drop car ID column
data.drop(columns=['car_ID'], inplace=True)

# Extract company name from car name and add to dataframe|
data['company'] = data['carName'].apply(lambda x: x.split()[0])
data.drop(columns=['carName'], inplace=True)

# Fix spelling mistakes in company names (example)
data['company'] = data['company'].replace({'Chrysler': 'Chrysler', 'Chevy': 'Chevrolet'})  # Adjust as necessary

# Label encoding for categorical variables
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Outlier detection and removal
for col in ['price', 'mileage', 'engine_size']:  # Adjust with relevant numeric columns
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    data = data[(data[col] >= (Q1 - 1.5 * IQR)) & (data[col] <= (Q3 + 1.5 * IQR))]

In [None]:
#FEACHER SELECTION

In [None]:
# Correlation matrix
correlation_matrix = data.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

# Remove multicollinearity (remove features with high correlation, e.g., > 0.85)
# Identify highly correlated features
high_correlation = set()
threshold = 0.85
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            high_correlation.add(colname)

data.drop(columns=high_correlation, inplace=True)


In [None]:
#DATA SPLITING

In [None]:
# Define features and target variable
X = data.drop(columns=['price'])  # Replace 'price' with your target variable
y = data['price']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#MODEL SELECTION AND IMPLIMENTATION

In [8]:
# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    'Support Vector Regressor': SVR()
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MSE': mse, 'R2': r2}

# Display results
results_df = pd.DataFrame(results).T
print(results_df)


NameError: name 'X_train' is not defined

In [None]:
#Model Evaluation
#Review Mean Squared Error (MSE) and R² scores to determine which model performs best.
#Consider plotting predicted vs actual prices for the best model.

In [7]:
# Plotting predicted vs actual for the best model (e.g., Random Forest)
best_model = models['Random Forest Regressor']
y_pred_best = best_model.predict(X_test)

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_best)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # Diagonal line
plt.show()


NameError: name 'models' is not defined