## ML Project: Used Car Price Prediction

#### Life cycle of Machine learning Project

- Understanding the Problem Statement
- Data Collection
- Data Checks to perform
- Exploratory data analysis
- Data Pre-Processing
- Model Training
- Choose best model

#### 1. Objective
The objective of this project is to predict the price of used cars based on various attributes.

#### 2. Data Collection
- Data source: https://www.kaggle.com/competitions/playground-series-s4e9/data
- The data consists of 13 columns and 188533 rows

#### 2.1 Import data and required libraries
Import pandas, NumPy, seaborn and matplotlib

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler,MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import re

: 

Read the train and test CSV files into pandas DataFrames

In [126]:
# Load both competition splits; the paths are relative to the notebook's
# working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)


Show top 5 rows

In [None]:
# Preview the first rows of the training frame.
train.head()

Get shape of the dataframe

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# Count the missing values in each column.
train.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
train.duplicated().sum()

In [None]:
# Print column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Count the distinct values in each column.
train.nunique()

In [None]:
# Summary statistics for the numeric columns.
train.describe()


In [None]:
# Split the column names by dtype: object-dtype columns are treated as
# categorical, everything else as numeric.
column = train.columns
numeric_features, categorical_features = [], []
for feature in column:
    if train[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

print(f'Numeric columns in dataframe : {numeric_features}')
print(f'Categorical columns in dataframe : {categorical_features}')

In [None]:
# For every categorical column, report how many distinct values it has and
# list them.
for col in categorical_features:
    distinct_values = train[col].unique()
    distinct_count = train[col].nunique()
    print(f'Total number of unique values in {col} are {distinct_count}:')
    print(distinct_values, '\n')

In [None]:
# Report, for every fuel type (including the missing-value bucket), how many
# rows carry it and show those rows.
for i in train['fuel_type'].unique():
    # NaN never compares equal to itself, so `== i` would always report zero
    # rows for the missing-value bucket; select it with isna() instead.
    if pd.isnull(i):
        fuel_mask = train['fuel_type'].isna()
    else:
        fuel_mask = train['fuel_type'] == i
    fuel_rows = train[fuel_mask]
    print('Number of samples of {} in dataset are {} and some exaples are the following are \n {}'.format(i,len(fuel_rows),fuel_rows))

In [None]:
# Re-check the per-column missing-value counts.
train.isnull().sum()

In [138]:
# Most frequent value of every categorical column, stored as a string.
# value_counts() sorts by descending frequency, so the first index entry is
# the most common value.
most_common = {
    feature: str(train[feature].value_counts().index[0])
    for feature in categorical_features
}

In [139]:
def _classify_transmission(text):
    """Bucket a lower-cased transmission description into a coarse type.

    Checks are ordered: manual, automatic, CVT, dual, then 'other'.
    Missing values are reported as 'other'.
    """
    if pd.isnull(text):
        return 'other'
    if 'm/t' in text or 'manual' in text or 'mt' in text:
        return 'manual'
    if 'a/t' in text or 'automatic' in text:
        return 'automatic'
    # The text is already lower-cased, so the marker must be lower-case too.
    if 'cvt' in text:
        return 'CVT'
    if 'dual' in text:
        return 'dual'
    return 'other'


def _parse_cylinders(engine):
    """Return the cylinder count found in an engine description, or NaN.

    Looks for "<n> Cylinder" first and falls back to a " V<n>" layout marker.
    """
    if pd.isnull(engine):
        return np.nan
    # \d+ (not \d) so that two-digit counts such as "10 Cylinder" or "V12"
    # are not truncated to a single digit.
    match = re.search(r'(\d+)\sCylinder', engine) or re.search(r'\sV(\d+)', engine)
    return float(match.group(1)) if match else np.nan


def extract(df):
    """Derive transmission and engine features from the raw text columns.

    The frame is modified in place and also returned.

    Columns written:
        transmission       -- lower-cased in place
        transmission_type  -- 'manual', 'automatic', 'CVT', 'dual' or 'other'
        hoursepower        -- decimal number immediately before "HP" in `engine`
        capacity           -- decimal number immediately before "L" / " Liter"
        Cylinder           -- cylinder count parsed from `engine`

    Values that cannot be parsed are left as NaN.

    Parameters:
        df: DataFrame with string columns 'transmission' and 'engine'.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    df['transmission'] = df['transmission'].str.lower()
    df['transmission_type'] = df['transmission'].apply(_classify_transmission)

    # expand=False yields a Series for the single capture group.
    df['hoursepower'] = df['engine'].str.extract(r'(\d+\.\d+)(?=HP)', expand=False).astype(float)
    df['capacity'] = df['engine'].str.extract(r'(\d+\.\d+)(?=L| Liter)', expand=False).astype(float)
    df['Cylinder'] = df['engine'].apply(_parse_cylinders)

    return df

In [140]:
# extract() writes its new columns onto the frame it receives and returns that
# same object, so extracted_train / extracted_test refer to train / test.
extracted_train = extract(train)
extracted_test = extract(test)

In [None]:
# Display the training frame after feature extraction.
extracted_train

In [None]:
# Frequency of each interior colour value.
train['int_col'].value_counts()

In [None]:
# Frequency of each brand.
train['brand'].value_counts()

In [144]:
# Models that occur fewer than 101 times in the training data; these are the
# ones fill_missing_value() later groups under a single label.
model_sample = extracted_train['model'].value_counts()
low_models_samples = model_sample.index[model_sample.values < 101].tolist()

In [168]:
def fill_fuel_type(x):
    """Infer a missing fuel type from the engine description.

    Parameters:
        x: a two-item row (pandas Series, list or tuple) holding the engine
           description first and the fuel type second.

    Returns:
        The existing fuel type when it is present. Otherwise a label chosen
        from keywords in the engine text, or NaN when the engine text is
        missing or contains none of the keywords.
    """
    # Unpack by position instead of x[0] / x[1]: integer keys on a
    # label-indexed Series are treated as labels by newer pandas versions.
    engine, fuel_type, *_ = x

    if not pd.isnull(fuel_type):
        return fuel_type
    if pd.isnull(engine):
        # Nothing to infer from; leave the value missing.
        return np.nan

    engine_text = str.lower(engine)
    # Order matters: plug-in hybrids must be recognised before the generic
    # 'hybrid' and 'electric' keywords.
    if 'gasoline' in engine_text:
        return 'Gasoline'
    if 'flex' in engine_text:
        return 'E85 Flex Fuel'
    if 'plug-in' in engine_text or 'electric/gas' in engine_text:
        return 'Plug-In Hybrid'
    if 'hybrid' in engine_text:
        return 'Hybrid'
    if 'electric' in engine_text:
        return 'electric'
    return np.nan
    
    
def _normalize_color(value, palette):
    """Map a free-text colour onto the first palette colour it mentions.

    The text is lower-cased and split on single spaces; the first entry of
    `palette` (in palette order) that equals one of the tokens is returned.
    Values that mention no palette colour, and missing values, are returned
    unchanged.
    """
    if pd.isnull(value):
        return value
    tokens = str.lower(value).split(' ')
    return next((color for color in palette if color in tokens), value)


def fill_missing_value(df, reference_year=2025):
    """Impute missing values and add the engineered modelling features.

    The frame is modified in place and also returned. The function reads the
    module-level names `fill_fuel_type`, `most_common` and
    `low_models_samples`, which must already be defined.

    Steps:
        * fuel_type: inferred from `engine`, remaining gaps become 'Gasoline'
        * accident: gaps become 'None reported'
        * clean_title: gaps become 'Yes' when no accident is reported for
          that row, otherwise 'No'
        * the '–' placeholder is replaced by each column's most common value
        * int_col / ext_col are collapsed onto a basic colour palette, with
          interior_rare_color / exterior_rare_color flags for the rest
        * is_luxry_brand, age, mile/year and cleaned_model are added
        * id, brand, engine, model_year and transmission are dropped

    Parameters:
        df: DataFrame produced by extract().
        reference_year: year used to turn `model_year` into `age`.

    Returns:
        The same DataFrame object after the edits above.
    """
    # Assign the results back instead of calling fillna(inplace=True) on a
    # column selection, which does not update the frame under copy-on-write.
    df['fuel_type'] = df[['engine', 'fuel_type']].apply(fill_fuel_type, axis=1)
    df['fuel_type'] = df['fuel_type'].fillna('Gasoline')
    df['accident'] = df['accident'].fillna('None reported')

    # `'None reported' in df['accident']` tests the index labels, not the
    # values, so it was always False; decide row by row instead.
    inferred_title = df['accident'].map(
        lambda status: 'Yes' if status == 'None reported' else 'No'
    )
    df['clean_title'] = df['clean_title'].fillna(inferred_title)

    # Iterate most_common itself so the loop does not depend on the mutable
    # global `categorical_features`, which is redefined later in the notebook.
    for column_name, fallback in most_common.items():
        if column_name in df.columns:
            df[column_name] = df[column_name].replace('–', fallback)

    common_color = ['black', 'white', 'gray', 'silver', 'brown', 'red', 'blue', 'green',
        'beige', 'tan', 'orange', 'gold', 'yellow', 'purple', 'pink',
        'charcoal', 'ivory', 'camel', 'chestnut', 'pearl', 'linen', 'graphite',
        'copper', 'slate', 'bronze', 'sand', 'amber','macchiato','ebony','cocoa']
    for color_column, rare_flag in (('int_col', 'interior_rare_color'),
                                    ('ext_col', 'exterior_rare_color')):
        df[color_column] = df[color_column].apply(
            lambda raw: _normalize_color(raw, common_color)
        )
        # 1 when the (normalised) colour is not one of the palette colours.
        df[rare_flag] = (~df[color_column].str.lower().isin(common_color)).astype(int)

    # "Land" is kept next to "Land Rover" on purpose; the comma after it was
    # missing, which silently merged it with "Lexus" into "LandLexus".
    luxury_brands = {
        "Mercedes-Benz", "BMW", "Audi", "Porsche", "Land Rover", "Land",
        "Lexus", "Cadillac", "Tesla", "INFINITI", "Jaguar",
        "Bentley", "Maserati", "Lamborghini", "Genesis", "Rolls-Royce",
        "Ferrari", "McLaren", "Aston Martin", "Lucid", "Lotus",
        "Karma", "Bugatti", "Maybach",
    }
    df['is_luxry_brand'] = df['brand'].isin(luxury_brands).astype(int)

    df['age'] = (reference_year - df['model_year']).astype(int)
    # An age of 0 would give an infinite ratio; leave it missing instead so a
    # downstream imputer can handle it.
    df['mile/year'] = df['milage'] / df['age'].replace(0, np.nan)

    # Set lookup instead of scanning the list once per row.
    rare_models = set(low_models_samples)
    df['cleaned_model'] = df['model'].where(~df['model'].isin(rare_models), 'others')

    # The identifier column is 'id'; the previous label 'i' raised KeyError.
    df.drop(['id', 'brand', 'engine', 'model_year', 'transmission'], axis=1, inplace=True)
    return df

In [None]:
# Scratch check of the colour-normalisation rule used in fill_missing_value().
common_color = ['black', 'white', 'gray', 'silver', 'brown', 'red', 'blue', 'green',
        'beige', 'tan', 'orange', 'gold', 'yellow', 'purple', 'pink',
        'charcoal', 'ivory', 'camel', 'chestnut', 'pearl', 'linen', 'graphite',
        'copper', 'slate', 'bronze', 'sand', 'amber','macchiato','ebony','cocoa']


def normalize_color(x):
    """Return the first palette colour mentioned in `x`, or `x` unchanged.

    The text is lower-cased and split on single spaces before matching.
    """
    tokens = str.lower(x).split(' ')
    matches = [color for color in common_color if color in tokens]
    # The no-match branch used to call x.astype('str'), which a plain str
    # does not have; return the original text instead.
    return matches[0] if matches else x


pd.Series('Metalic Black').apply(normalize_color)

In [156]:
# Work on copies: fill_missing_value() edits the frame it receives in place
# (fills, new columns, column drops), and extracted_train / extracted_test
# should stay untouched.
extract_train = extracted_train.copy()
extract_test = extracted_test.copy()
cleaned_train_data = fill_missing_value(extract_train)
cleaned_test_data = fill_missing_value(extract_test)


In [None]:
# Missing values remaining in each column of the cleaned test frame.
cleaned_test_data.isnull().sum()

In [None]:
# Kernel density estimate of the 'age' column.
sns.kdeplot(x = 'age',data=cleaned_train_data)

In [None]:
# Price against yearly mileage.
sns.scatterplot(x = 'mile/year',y='price',data= cleaned_train_data)

In [None]:
# Distribution of the target column.
sns.histplot(cleaned_train_data['price'])

In [None]:
# Price distribution per brand, brands ordered by descending mean price.
# fill_missing_value() drops 'brand' from cleaned_train_data, so the plot
# reads from extracted_train, which still carries both 'brand' and 'price'.
plt.figure(figsize=(20,10))
brand_order = (
    extracted_train.groupby('brand')['price']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.boxplot(data=extracted_train, x='brand', y='price', order=brand_order)
plt.xticks(rotation= 90)


In [None]:
# Price distribution per transmission description, ordered by descending mean
# price. fill_missing_value() drops 'transmission' from cleaned_train_data, so
# the plot reads from extracted_train, which still carries the column.
plt.figure(figsize=(20,10))
transmission_order = (
    extracted_train.groupby('transmission')['price']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.boxplot(data=extracted_train, x='transmission', y='price', order=transmission_order)
plt.xticks(rotation= 90)

In [None]:
# Column names of the cleaned training frame.
cleaned_train_data.columns

In [None]:
# Heatmap of the pairwise correlations between the numeric features and price.
sns.heatmap(cleaned_train_data[['age','milage', 'hoursepower','capacity','Cylinder','price']].corr())

In [None]:
# Histogram of the target column.
target_prices = cleaned_train_data['price']
sns.histplot(target_prices)

In [None]:
# Dtypes and non-null counts before the price filter is applied.
cleaned_train_data.info()

In [None]:
# Keep only listings priced below 300000, then re-print the frame summary.
below_price_cap = cleaned_train_data['price'] < 300000
cleaned_train_data = cleaned_train_data.loc[below_price_cap]
cleaned_train_data.info()

In [173]:
# Separate the target column from the feature columns.
y = cleaned_train_data['price']
X = cleaned_train_data.drop(columns=['price'])

In [174]:
# First split: 70% of the rows go to training, 30% to a hold-out pool.
xtrain,xcom,ytrain,ycom  =  train_test_split(X,y,test_size=0.3,random_state=32)
# Second split of the hold-out pool: train_test_split returns the larger
# "train" part first, so xtest receives 70% of the pool (about 21% overall)
# and xval the remaining 30% (about 9% overall).
xtest,xval,ytest,yval = train_test_split(xcom,ycom,test_size=0.3,random_state=32)

In [None]:
# Frame summary after the price filter.
cleaned_train_data.info()

In [None]:
# Column names available for the feature lists below.
cleaned_train_data.columns

In [158]:
# Columns routed to the numeric and categorical branches of the preprocessing
# ColumnTransformer. These assignments replace the dtype-derived lists of the
# same names built earlier; fill_missing_value() reads `categorical_features`
# at call time, so re-running it after this cell uses this list instead.
numeric_features = ['milage','hoursepower', 'capacity',
       'Cylinder', 'interior_rare_color', 'exterior_rare_color',
       'is_luxry_brand', 'age', 'mile/year']
# NOTE(review): both 'cleaned_model' and the raw 'model' column are listed;
# confirm that one-hot encoding the raw column as well is intended.
categorical_features = ['cleaned_model','fuel_type','transmission_type','accident','clean_title','int_col','ext_col','model']

In [None]:
# Display the training feature frame.
xtrain

In [None]:
# Numeric branch: fill gaps with the column median, then scale each column by
# its maximum absolute value.
numeric_features_pipeline = Pipeline(
    steps=[
        ('fillna', SimpleImputer(strategy='median')),
        ('Scale', MaxAbsScaler()),
    ]
)

# Categorical branch: sparse one-hot encoding (first level dropped, unseen
# levels ignored at transform time) followed by the same scaler.
categorical_features_pipeline = Pipeline(
    steps=[
        (
            'ohe',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=True),
        ),
        ('sclae', MaxAbsScaler()),
    ]
)

# Route each column list to its branch.
transformer = ColumnTransformer(
    transformers=[
        ('numerical', numeric_features_pipeline, numeric_features),
        ('categorical', categorical_features_pipeline, categorical_features),
    ]
)

# Fit on the training split only, then apply the fitted transformer to the
# other two splits; .toarray() densifies the sparse result.
xtrain_transfromed_array = transformer.fit_transform(xtrain).toarray()
xtest_transfromed_array, xval_transformed_array = (
    transformer.transform(split_frame).toarray()
    for split_frame in (xtest, xval)
)

In [176]:
# Persist the transformed feature matrices. The files are written in pickle
# format even though the names end in .csv.
for _target_path, _feature_matrix in (
    ('xtrain_transfromed.csv', xtrain_transfromed_array),
    ('xtest_transfromed.csv', xtest_transfromed_array),
    ('xtval_transfromed.csv', xval_transformed_array),
):
    pd.DataFrame(_feature_matrix).to_pickle(_target_path)

In [177]:
# Persist the target column of each split as a pickled single-column frame.
for _target_path, _target_values in (
    ('ytrain.pkl', ytrain),
    ('ytest.pkl', ytest),
    ('yval.pkl', yval),
):
    pd.DataFrame(_target_values).to_pickle(_target_path)

In [None]:
# Transform the full training features and the cleaned competition test set
# with the already-fitted transformer and pickle the dense results (the file
# names end in .csv but the content is pickle).
for _source_frame, _target_path in (
    (X, 'X.csv'),
    (cleaned_test_data, 'clean_test.csv'),
):
    _dense_matrix = transformer.transform(_source_frame).toarray()
    pd.DataFrame(_dense_matrix).to_pickle(_target_path)


In [179]:
# Pickle the full target column as a single-column frame (the file name ends
# in .csv but the content is pickle).
target_frame = pd.DataFrame(y)
target_frame.to_pickle('y.csv')