In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder


In [47]:
# Load the dataset from 'ford.csv' (a path relative to the working directory) into df.
df = pd.read_csv('ford.csv')

In [48]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [49]:
# Summarize the columns: dtype and non-null count for each.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17965 non-null  object 
 1   year          17965 non-null  int64  
 2   price         17965 non-null  int64  
 3   transmission  17965 non-null  object 
 4   mileage       17965 non-null  int64  
 5   fuelType      17965 non-null  object 
 6   tax           17965 non-null  int64  
 7   mpg           17965 non-null  float64
 8   engineSize    17965 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


In [50]:
# Count the missing values in each column.
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [51]:
# Fill missing values column by column: object (text) columns receive their most
# frequent value, every other column receives its mean. Columns that contain no
# missing values are left untouched.
Columns = list(df.columns.values)
for col in Columns:
    if not df[col].isnull().any():
        continue
    if df[col].dtype == 'object':
        # mode() returns a Series of the most frequent value(s); take the first one.
        fill_value = df[col].mode()[0]
    else:
        fill_value = df[col].mean()
    df[col] = df[col].fillna(fill_value)
# Only the last expression of a cell is displayed, so the per-column null count is the
# single result shown here.
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [52]:
# Preview the first rows again, now that the imputation cell has run.
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [53]:
# Re-check dtypes and non-null counts after the imputation cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17965 non-null  object 
 1   year          17965 non-null  int64  
 2   price         17965 non-null  int64  
 3   transmission  17965 non-null  object 
 4   mileage       17965 non-null  int64  
 5   fuelType      17965 non-null  object 
 6   tax           17965 non-null  int64  
 7   mpg           17965 non-null  float64
 8   engineSize    17965 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


In [54]:
# One LabelEncoder instance, used further down to turn a categorical column into integer
# codes. It is refit each time fit_transform is called on it.
encoder = LabelEncoder()

In [55]:
# Find the object-dtype (text) columns and report how many distinct values each one has.
cat_col = df.select_dtypes(include='object').columns
cardinality = df.loc[:, cat_col].nunique()
print(cardinality)

model           23
transmission     3
fuelType         5
dtype: int64


In [56]:
# Display the index of categorical column names collected in the previous cell.
cat_col

Index(['model', 'transmission', 'fuelType'], dtype='object')

In [57]:
# Encode the categorical (object-dtype) columns of df:
#   * at most ONE_HOT_MAX_CARDINALITY distinct values -> one 0/1 indicator column per
#     value, named "<column>_<value>", which replaces the original column;
#   * more distinct values -> integer codes produced by the shared `encoder`.
# The columns are looked up on the current df instead of the `cat_col` listing built in
# an earlier cell, so re-running this cell once the columns are encoded is a no-op
# rather than a KeyError on a column that has already been replaced.
ONE_HOT_MAX_CARDINALITY = 5

for col in df.select_dtypes(include=['object']).columns:
    cardinality = df[col].nunique()
    if cardinality <= ONE_HOT_MAX_CARDINALITY:
        # One-hot encode; dtype=int yields 0/1 integers instead of booleans.
        dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
        # Swap the original column for its indicator columns (appended on the right).
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    else:
        # `encoder` is refit on every column it is applied to, so afterwards it only
        # holds the class-to-code mapping of the last column encoded this way.
        df[col] = encoder.fit_transform(df[col])

In [58]:
# Inspect the column layout and dtypes produced by the encoding cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17965 entries, 0 to 17964
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   model                   17965 non-null  int64  
 1   year                    17965 non-null  int64  
 2   price                   17965 non-null  int64  
 3   mileage                 17965 non-null  int64  
 4   tax                     17965 non-null  int64  
 5   mpg                     17965 non-null  float64
 6   engineSize              17965 non-null  float64
 7   transmission_Automatic  17965 non-null  int64  
 8   transmission_Manual     17965 non-null  int64  
 9   transmission_Semi-Auto  17965 non-null  int64  
 10  fuelType_Diesel         17965 non-null  int64  
 11  fuelType_Electric       17965 non-null  int64  
 12  fuelType_Hybrid         17965 non-null  int64  
 13  fuelType_Other          17965 non-null  int64  
 14  fuelType_Petrol         17965 non-null

In [59]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,model,year,price,mileage,tax,mpg,engineSize,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,5,2017,12000,15944,150,57.7,1.0,1,0,0,0,0,0,0,1
1,6,2018,14000,9083,150,57.7,1.0,0,1,0,0,0,0,0,1
2,6,2017,13000,12456,150,57.7,1.0,0,1,0,0,0,0,0,1
3,5,2019,17500,10460,145,40.3,1.5,0,1,0,0,0,0,0,1
4,5,2019,16500,1482,145,48.7,1.0,1,0,0,0,0,0,0,1


In [61]:
from datacleaner import autoclean

In [65]:
# Pass the already-encoded frame through datacleaner's autoclean and preview the result.
# NOTE(review): the FutureWarning captured in this cell's output is raised from inside
# datacleaner (its `fillna(..., inplace=True)` call), not from code in this notebook.
df  = autoclean(df)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  input_dataframe[column].fillna(input_dataframe[column].median(), inplace=True)


Unnamed: 0,model,year,price,mileage,tax,mpg,engineSize,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,5,2017,12000,15944,150,57.7,1.0,1,0,0,0,0,0,0,1
1,6,2018,14000,9083,150,57.7,1.0,0,1,0,0,0,0,0,1
2,6,2017,13000,12456,150,57.7,1.0,0,1,0,0,0,0,0,1
3,5,2019,17500,10460,145,40.3,1.5,0,1,0,0,0,0,0,1
4,5,2019,16500,1482,145,48.7,1.0,1,0,0,0,0,0,0,1
