In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [17]:
# Read the car-sales CSV (it contains missing values) and print the whole
# frame so the gaps are visible before any cleaning is applied.
csv_path = 'car-sales-missing-data.csv'
df = pd.read_csv(csv_path)
print(df)

     Make Colour  Odometer  Doors    Price
0  Toyota  White  150043.0    4.0   $4,000
1   Honda    Red   87899.0    4.0   $5,000
2  Toyota   Blue       NaN    3.0   $7,000
3     BMW  Black   11179.0    5.0  $22,000
4  Nissan  White  213095.0    4.0   $3,500
5  Toyota  Green       NaN    4.0   $4,500
6   Honda    NaN       NaN    4.0   $7,500
7   Honda   Blue       NaN    4.0      NaN
8  Toyota  White   60000.0    NaN      NaN
9     NaN  White   31600.0    4.0   $9,700


In [18]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that emits a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Make      9 non-null      object 
 1   Colour    9 non-null      object 
 2   Odometer  6 non-null      float64
 3   Doors     9 non-null      float64
 4   Price     8 non-null      object 
dtypes: float64(2), object(3)
memory usage: 532.0+ bytes
None


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; at this point those are Odometer and Doors (Price is still text).
df.describe()

Unnamed: 0,Odometer,Doors
count,6.0,9.0
mean,92302.666667,4.0
std,76489.805168,0.5
min,11179.0,3.0
25%,38700.0,4.0
50%,73949.5,4.0
75%,134507.0,4.0
max,213095.0,5.0


In [20]:
# Preview the first three rows of the raw frame.
df.head(n=3)

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"


In [21]:
# Per-column count of missing values in the raw data. This cell runs before
# the imputation step, so the label says "before", not "after".
print("Missing values before imputation:\n", df.isnull().sum())

Missing values after imputation:
 Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64


In [22]:
# Impute missing numerical values.
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
# Odometer: fill gaps with the column mean.
imputer_odometer = SimpleImputer(strategy="mean")
df['Odometer'] = imputer_odometer.fit_transform(df[['Odometer']])

# Doors: fill gaps with the most frequent value.
imputer_doors = SimpleImputer(strategy="most_frequent")
df['Doors'] = imputer_doors.fit_transform(df[['Doors']])

# Convert Price from a string such as "$4,000" into a float.
# - astype(str) keeps the .str accessor usable even if the column is already
#   numeric (e.g. when the cell is re-run); missing values become the text
#   "nan", which errors='coerce' turns back into NaN.
# - The pattern is a raw string: inside a character class neither "$" nor ","
#   needs escaping, and "\$" / "\," in a normal string literal are invalid
#   escape sequences that raise a warning on recent Python versions.
price_text = df['Price'].astype(str).str.replace(r'[$,]', "", regex=True)
df['Price'] = pd.to_numeric(price_text, errors='coerce').astype(float)

# Drop rows where Price, Make or Colour is still missing (these columns are
# not imputed above).
df.dropna(subset=['Price', 'Make', 'Colour'], inplace=True)

In [23]:
# Encoding
# Integer codes for each category, computed with a fresh LabelEncoder per column.
make_codes = LabelEncoder().fit_transform(df['Make'])
colour_codes = LabelEncoder().fit_transform(df['Colour'])
df['Make_encoded'] = make_codes
df['Colour_encoded'] = colour_codes

# Replace the original Make/Colour text columns with indicator columns;
# drop_first=True omits the first level of each category.
df = pd.get_dummies(df, columns=['Make', 'Colour'], drop_first=True)



In [24]:
# Min-max scale Odometer and Price; the results are stored in new *_scaled
# columns so the unscaled values stay available.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[['Odometer', 'Price']])
df['Odometer_scaled'] = scaled_values[:, 0]
df['Price_scaled'] = scaled_values[:, 1]


In [25]:
# Feature matrix: label-encoded categories, door count and scaled odometer.
feature_columns = ['Make_encoded', 'Colour_encoded', 'Doors', 'Odometer_scaled']
x = df[feature_columns]

# Target vector: the min-max scaled price.
y = df['Price_scaled']

In [26]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [27]:
# Show the processed data: a sample of the cleaned frame, the shapes of the
# four split pieces, and the first rows of the training features/target.
print("\nProcessed Data Sample:\n", df.head())
print("\nx_train", x_train.shape)
print("\nx_test", x_test.shape)  # label previously misspelled as "x_text"
print("\ny_train", y_train.shape)
print("\ny_test", y_test.shape)
print("\nSample x_train:\n", x_train.head())
print("\nSample y_train:\n", y_train.head())



Processed Data Sample:
         Odometer  Doors    Price  Make_encoded  Colour_encoded  Make_Honda  \
0  150043.000000    4.0   4000.0             3               4       False   
1   87899.000000    4.0   5000.0             1               3        True   
2   92302.666667    3.0   7000.0             3               1       False   
3   11179.000000    5.0  22000.0             0               0       False   
4  213095.000000    4.0   3500.0             2               4       False   

   Make_Nissan  Make_Toyota  Colour_Blue  Colour_Green  Colour_Red  \
0        False         True        False         False       False   
1        False        False        False         False        True   
2        False         True         True         False       False   
3        False        False        False         False       False   
4         True        False        False         False       False   

   Colour_White  Odometer_scaled  Price_scaled  
0          True         0.687732    

In [29]:
# Per-column count of remaining missing values after cleaning.
df.isna().sum()


Unnamed: 0,0
Odometer,0
Doors,0
Price,0
Make_encoded,0
Colour_encoded,0
Make_Honda,0
Make_Nissan,0
Make_Toyota,0
Colour_Blue,0
Colour_Green,0
