<a href="https://colab.research.google.com/github/qedir051/ML-Projects/blob/main/SimpleRegressionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv

--2024-08-08 07:41:24--  https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30921 (30K) [text/plain]
Saving to: ‘car-sales-extended-missing-data.csv’


2024-08-08 07:41:24 (2.87 MB/s) - ‘car-sales-extended-missing-data.csv’ saved [30921/30921]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
car_sales_missing = pd.read_csv("/content/car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [4]:
car_sales_missing .isna().sum()

Unnamed: 0,0
Make,49
Colour,50
Odometer (KM),50
Doors,50
Price,50


In [5]:
car_sales_missing.dropna(subset = ['Price'], inplace = True)
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [6]:
car_sales_missing.isna().sum()

Unnamed: 0,0
Make,47
Colour,46
Odometer (KM),48
Doors,47
Price,0


In [7]:
X = car_sales_missing.drop('Price', axis = 1)
y = car_sales_missing['Price'].copy()

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
cat_features = X_train.select_dtypes(exclude = [np.number]).columns
num_features = X_train.select_dtypes(include = [np.number]).columns
cat_features, num_features

(Index(['Make', 'Colour'], dtype='object'),
 Index(['Odometer (KM)', 'Doors'], dtype='object'))

In [10]:
from sklearn.impute import SimpleImputer     # for filling null data
from sklearn.pipeline import Pipeline        # step by step processes
from sklearn.preprocessing import StandardScaler          # normalize data , also there's MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

cat_pipeline = Pipeline([
    ['impute', SimpleImputer(strategy = 'constant', fill_value = 'Missing')],
    ['one_hot', OneHotEncoder(handle_unknown='ignore')]
])

num_pipeline = Pipeline([
    ['impute', SimpleImputer(strategy = 'median')],
    ['scaler', StandardScaler()]
])

transformer = ColumnTransformer([
    ['cat_pipeline', cat_pipeline, cat_features],
    ['num_pipeline', num_pipeline, num_features]
])

In [11]:
X_train_transformed = transformer.fit_transform(X_train)
feature_names = transformer.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns = feature_names)
X_train_transformed_df

Unnamed: 0,cat_pipeline__Make_BMW,cat_pipeline__Make_Honda,cat_pipeline__Make_Missing,cat_pipeline__Make_Nissan,cat_pipeline__Make_Toyota,cat_pipeline__Colour_Black,cat_pipeline__Colour_Blue,cat_pipeline__Colour_Green,cat_pipeline__Colour_Missing,cat_pipeline__Colour_Red,cat_pipeline__Colour_White,num_pipeline__Odometer (KM),num_pipeline__Doors
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.869223,-0.010424
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.482335,-0.010424
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.302557,-0.010424
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.976357,-0.010424
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.324752,-0.010424
...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.318585,-0.010424
756,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.715185,2.630265
757,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.982256,-0.010424
758,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.042179,-0.010424


In [12]:
X_test_transformed = transformer.transform(X_test)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns = feature_names)
X_test_transformed_df

Unnamed: 0,cat_pipeline__Make_BMW,cat_pipeline__Make_Honda,cat_pipeline__Make_Missing,cat_pipeline__Make_Nissan,cat_pipeline__Make_Toyota,cat_pipeline__Colour_Black,cat_pipeline__Colour_Blue,cat_pipeline__Colour_Green,cat_pipeline__Colour_Missing,cat_pipeline__Colour_Red,cat_pipeline__Colour_White,num_pipeline__Odometer (KM),num_pipeline__Doors
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.454703,-0.010424
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.673014,-0.010424
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.003692,-0.010424
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.568635,-0.010424
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.510092,-0.010424
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.518623,-0.010424
186,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.346615,-2.651112
187,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.323496,-0.010424
188,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.056216,-2.651112


In [16]:
np.random.seed(42)
rnd_reg = RandomForestRegressor()
rnd_reg.fit(X_train_transformed, y_train)

In [17]:
rnd_reg.score(X_train_transformed, y_train)

0.8804841577261856

In [18]:
rnd_reg.score(X_test_transformed, y_test)

0.21440214110981048