# 1.	Import the necessary libraries and load the dataset



In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [2]:
# load the dataset
df=pd.read_csv('/content/imports-85.data')

# Set column names based on the "imports-85.names" file 
df.columns = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"]
df.head(5)  # To display first 5 rows of the dataset


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [3]:
# find the number of rows and columns in the dataset
df.shape

(204, 26)

# 2.	Perform data cleaning and handle missing values, duplicates, and outliers

In [4]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [5]:
# Convert the "normalized-losses", "num-of-doors", "bore", "stroke", "horsepower", "peak-rpm", and "price"  column to a numeric data type
df["horsepower"] = pd.to_numeric(df["horsepower"], errors='coerce')
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df["normalized-losses"] = pd.to_numeric(df["normalized-losses"], errors='coerce')
df["num-of-doors"] = pd.to_numeric(df["num-of-doors"], errors="coerce")
df["bore"] = pd.to_numeric(df["bore"], errors='coerce')
df["stroke"] = pd.to_numeric(df["stroke"], errors="coerce")
df["peak-rpm"] = pd.to_numeric(df["peak-rpm"], errors="coerce")

In [6]:
# Check for missing values
# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

print(df.isna().sum())

symboling              0
normalized-losses     40
make                   0
fuel-type              0
aspiration             0
num-of-doors         204
body-style             0
drive-wheels           0
engine-location        0
wheel-base             0
length                 0
width                  0
height                 0
curb-weight            0
engine-type            0
num-of-cylinders       0
engine-size            0
fuel-system            0
bore                   4
stroke                 4
compression-ratio      0
horsepower             2
peak-rpm               2
city-mpg               0
highway-mpg            0
price                  4
dtype: int64


In [7]:
# Fill missing values with the mean of the column
df.fillna(df.mean(), inplace=True)

# Verify that there are no more missing values
print(df.isna().sum())

symboling              0
normalized-losses      0
make                   0
fuel-type              0
aspiration             0
num-of-doors         204
body-style             0
drive-wheels           0
engine-location        0
wheel-base             0
length                 0
width                  0
height                 0
curb-weight            0
engine-type            0
num-of-cylinders       0
engine-size            0
fuel-system            0
bore                   0
stroke                 0
compression-ratio      0
horsepower             0
peak-rpm               0
city-mpg               0
highway-mpg            0
price                  0
dtype: int64


  df.fillna(df.mean(), inplace=True)


In [8]:
# Drop duplicates
df.drop_duplicates(inplace=True)

# 3.	Perform feature selection and remove the irrelevant features

In [9]:

# Print correlation matrix
corr_matrix = df.corr()
print(corr_matrix["price"])

symboling           -0.083136
normalized-losses    0.133999
num-of-doors              NaN
wheel-base           0.587607
length               0.683372
width                0.730130
height               0.136123
curb-weight          0.820831
engine-size          0.861753
bore                 0.532562
stroke               0.083115
compression-ratio    0.071058
horsepower           0.757943
peak-rpm            -0.100833
city-mpg            -0.668021
highway-mpg         -0.690937
price                1.000000
Name: price, dtype: float64


  corr_matrix = df.corr()


In [10]:
# symboling", "normalized-losses", "height", "stroke", "compression-ratio", and "peak-rpm" have low correlation with the target variable. 
# Therefore, we can consider removing these features.

df1= df.drop(columns=["symboling", "normalized-losses", "height", "stroke", "compression-ratio", "peak-rpm"])

# Print the first 5 rows to verify the removal of features
print(df1.head())

          make fuel-type aspiration  num-of-doors   body-style drive-wheels  \
0  alfa-romero       gas        std           NaN  convertible          rwd   
1  alfa-romero       gas        std           NaN    hatchback          rwd   
2         audi       gas        std           NaN        sedan          fwd   
3         audi       gas        std           NaN        sedan          4wd   
4         audi       gas        std           NaN        sedan          fwd   

  engine-location  wheel-base  length  width  curb-weight engine-type  \
0           front        88.6   168.8   64.1         2548        dohc   
1           front        94.5   171.2   65.5         2823        ohcv   
2           front        99.8   176.6   66.2         2337         ohc   
3           front        99.4   176.6   66.4         2824         ohc   
4           front        99.8   177.3   66.3         2507         ohc   

  num-of-cylinders  engine-size fuel-system  bore  horsepower  city-mpg  \
0          

# 4.	Perform feature scaling and transform the data features to a similar scale

In [11]:
# Perform feature scaling
scaler = StandardScaler()
cols_to_scale = ["wheel-base", "length", "width", "curb-weight", "engine-size", "horsepower", "city-mpg", "highway-mpg", "price"]
df1[cols_to_scale] = scaler.fit_transform(df1[cols_to_scale])

# 5.	Perform feature transformation and create new features by combining, extracting, or transforming existing features

In [12]:
# We can extract the make of the car from the "make" feature by splitting it at the space character and taking the first element.
df1['make_extract'] = df1['make'].str.split().str[0]

# We can transform the "num-of-doors" feature to a binary feature called "is_two_doors" which is True if the car has two doors and False if it has four doors.
df1['is_two_doors'] = df1['num-of-doors'].apply(lambda x: True if x == 'two' else False)

# Transform "horsepower" from object to integer and create a new feature "horsepower_squared"
df1["horsepower"] = df1["horsepower"].astype(int)
df1["horsepower_squared"] = df1["horsepower"] ** 2

# Create a binary feature "highway_efficient" based on the condition that a car has a highway MPG of 30 or higher
df1["highway_efficient"] = np.where(df1["highway-mpg"] >= 30, 1, 0)

# Group by "make" and calculate the average "price" to create a new feature "make_avg_price"
make_avg_price = df1.groupby("make")["price"].mean().reset_index(name="make_avg_price")
df1 = df1.merge(make_avg_price, on="make", how="left")


In [13]:
df.columns


Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [14]:
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors         float64
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [15]:
df1.columns

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width',
       'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
       'fuel-system', 'bore', 'horsepower', 'city-mpg', 'highway-mpg', 'price',
       'make_extract', 'is_two_doors', 'horsepower_squared',
       'highway_efficient', 'make_avg_price'],
      dtype='object')

In [16]:
df1.dtypes

make                   object
fuel-type              object
aspiration             object
num-of-doors          float64
body-style             object
drive-wheels           object
engine-location        object
wheel-base            float64
length                float64
width                 float64
curb-weight           float64
engine-type            object
num-of-cylinders       object
engine-size           float64
fuel-system            object
bore                  float64
horsepower              int64
city-mpg              float64
highway-mpg           float64
price                 float64
make_extract           object
is_two_doors             bool
horsepower_squared      int64
highway_efficient       int64
make_avg_price        float64
dtype: object

# 6.	Evaluate the performance of machine learning models before and after feature engineering

In [17]:
categorical_cols = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 
                    'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system']

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split the dataset into training and testing sets
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and test a linear regression model on the original dataset
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Mean squared error on original dataset:', mean_squared_error(y_test, y_pred))

Mean squared error on original dataset: 23136617.509014085


In [19]:
categorical_cols = ['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 
                    'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system', 'make_extract', 'is_two_doors']

# One-hot encode categorical columns
df1 = pd.get_dummies(df1, columns=categorical_cols, drop_first=True)

# Split the modified dataset into training and testing sets
X1 = df1.drop('price', axis=1)
y1 = df1['price']
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

# Train and test a linear regression model on the modified dataset
lr1 = LinearRegression()
lr1.fit(X1_train, y1_train)
y1_pred = lr1.predict(X1_test)
print('Mean squared error on modified dataset:', mean_squared_error(y1_test, y1_pred))

Mean squared error on modified dataset: 0.07366504266559477


# As we can see, the mean squared error on the modified dataset is lower than that on the original dataset. This indicates that feature engineering has improved the performance of the linear regression model on this dataset.