In [1]:
# Import necessary packages
import os
import sys
import logging
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# The correct URL for the raw data file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# The dataset does not have headers, so we provide them manually
column_names = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors",
                "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width",
                "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
                "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm",
                "city-mpg", "highway-mpg", "price"]

# Use pandas read_csv to load the data, specifying no header and providing column names
autos_df = pd.read_csv(url, header=None, names=column_names)





In [3]:
# Display the first 5 rows to understand how data looks like
autos_df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:

# Missing values in the data are marked by '?'
# Replace '?' with NaN in the DataFrame for proper missing value handling
autos_df.replace('?', np.nan, inplace=True)

In [5]:
# Identify Missing Data and Convert numeric data into correct data types

# 1. Use a pandas method on the DataFrame with NaN to count the number of missing values (NaN) in each column.
#    Print the resulting series.
missing_values = autos_df.isna().sum()
print("MIssing values in each column", missing_values)

# 2. Check the data types of all columns to see which numeric columns were loaded as 'object' (strings).
# Identify the columns that should be numeric but are currently 'object' type (due to '?' or other non-numeric entries)
# Convert them to numeric, coercing errors (non-numeric values will become NaN)
# Hint: Using errors='coerce' will turn any non-numeric values (including NaNs from previous '?' replacement) into NaN
print(autos_df.dtypes)
numeric_columns = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
for col in numeric_columns:
    autos_df[col] = pd.to_numeric(autos_df[col], errors='coerce')
print(autos_df.dtypes)

MIssing values in each column symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64
symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width         

In [6]:

# --- Select Variables with Highest Correlation to 'price' ---
# The objective is to identify which numeric variables can be used to predict 'price'.
# For this we need to identify the columns which have highest correlation with 'price', i.e., which columns can be used to predict price the most

# 1. Calculate the full correlation matrix for all numeric variables in the DataFrame.
# Hint: Use function corr to get correlation for numeric columns
corr_matrix = autos_df.corr()  

# 2. Extract the correlation values specifically with the 'price' column.
#    Use absolute values (abs()) to treat strong negative correlations the same as strong positive ones.
price_corr = corr_matrix['price'].abs()

# 3. Sort the correlation values in descending order (highest correlation first).
#    Drop the 'price' column itself from the list, as it will always be 1.0 (perfectly correlated with itself).
price_corr_sorted = price_corr.drop('price').sort_values(ascending=False)
price_corr_sorted = price_corr_sorted.drop('price')

# 4. Print the sorted list to see which variables are the best predictors.
print(price_corr_sorted)

# 5. Define a threshold for selection (e.g., correlation > 0.8) and select the variable names that meet this criteria.
top_features = price_corr_sorted[price_corr_sorted > 0.8].index.tolist()

# 6. Print the list of selected top features.
print("Top features correlated with price:", top_features)


ValueError: could not convert string to float: 'alfa-romero'

In [None]:
# Create a new dataframe with relevant columns for multiple regression which have correlation > 0.8
df_selected = autos_df[top_features + ['price']]

# Handle missing 'price' values by dropping rows. If 'price' is missing then such rows cannot be used for regression
# Hint: dropna can be used to drop rows
df_selected = df_selected.dropna(subset=['price'])
# For the rown which have missing values for columns, they can be either dropped or NaN can be replaced with appropriate value.
# If many rows have missing values, dropping them can reduce the data for model and NaN are replaced with Mean or Median value or with zeros.
# Check how many values are missing and replace/drop values accordingly
print(df_selected.isna().sum())

In [None]:
# --- Multiple Linear Regression ---

# 1. Define the features (X) and the target variable (y) using the cleaned dataframe (df_clean).
#    X should be a DataFrame with three columns: ['horsepower', 'engine-size', 'curb-weight']
#    y should be the 'price' column (a Series).
X = df_selected[top_features]
Y = df_selected['price']

# 2. Split the data into training and testing sets.
#    Use a test size of 30% (0.3) and a random state of 42 for reproducibility.
#    The output variables will be: X_train, X_test, y_train, y_test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# 3. Create an instance of the Linear Regression model object from sklearn.
# Hint: use 'LinearRegression'
model = LinearRegression()

# 4. Train the model using the training data (fit the model with X_train and y_train).
# Hint: use 'fit'
model.fit(X_train, Y_train)

# 5. Use the trained model to make predictions on the test features (X_test).
#    Store predictions in a variable called 'predictions'.
# Hint: use 'predict'
predictions = model.predict(X_test)

# 6. Evaluate the model performance: Calculate and print the Mean Squared Error (MSE) and R-squared score.
#    The R-squared score tells us how much variance in price our 3 variables explain together.
mse = mean_squared_error(Y_test, predictions)
r2 = r2_score(Y_test, predictions)

# 7. Print the model's intercept and coefficients for all variables.
print("intercept", model.intercept)
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
})
print(coefficients)
# 8. Compare the R-squared value from this *multiple* regression to the R-squared from a *simple* (single variable) regression.
#    (A brief interpretation can be added in a print statement or markdown cell below)
print(
    "The R-squared value from multiple regression is higher than that of a simple "
    "linear regression, showing that using horsepower, engine-size, and curb-weight "
    "together explains more variation in car price than a single variable alone."
)