In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
car_missing=pd.read_csv("car-sales-extended-missing-data.csv")
car_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
car_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
car_missing.dropna(subset=["Price"],inplace=True)
car_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [5]:
x=car_missing.drop("Price",axis=1)
y=car_missing["Price"]

In [6]:
x

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
995,Toyota,Black,35820.0,4.0
996,,White,155144.0,3.0
997,Nissan,Blue,66604.0,4.0
998,Honda,White,215883.0,4.0


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [8]:
#here we will fill the empty values using sklearn

In [9]:
cat_imputer=SimpleImputer(strategy="constant",fill_value="missing")
door_imputer=SimpleImputer(strategy="most_frequent")
num_imputer=SimpleImputer(strategy="mean")

In [10]:
cat_features=["Make","Colour"]
door_feature=["Doors"]
num_features=["Odometer (KM)"]

In [11]:
imputer=ColumnTransformer([
    ("cat_imputer",cat_imputer,cat_features),
    ("num_imputer",num_imputer,num_features),
    ("door_imputer",door_imputer,door_feature)
])

In [12]:
filled_x=imputer.fit_transform(x)

In [13]:
filled_x

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0],
       ['Honda', 'White', 215883.0, 4.0],
       ['Toyota', 'Blue', 248360.0, 4.0]], dtype=object)

In [14]:
car_sales_filled=pd.DataFrame(filled_x,columns=["Make","Colour","Doors","Odometer (KM)"])
car_sales_filled

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
945,Toyota,Black,35820.0,4.0
946,missing,White,155144.0,3.0
947,Nissan,Blue,66604.0,4.0
948,Honda,White,215883.0,4.0


In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
cat_features=["Make","Colour","Doors"]

In [17]:
onehot=OneHotEncoder()

In [18]:
transformer=ColumnTransformer([("One_hot",onehot,cat_features)],remainder="passthrough")

In [19]:
transformed_x=transformer.fit_transform(car_sales_filled)
transformed_x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3800 stored elements and shape (950, 913)>

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [21]:
model=RandomForestRegressor()

In [22]:
x_train,x_test,y_train,y_test=train_test_split(transformed_x,y,test_size=0.1)

In [23]:
model.fit(x_train,y_train)

In [24]:
model.score(x_test,y_test)

-0.162889241828037

In [25]:
len(car_sales_filled)

950

In [26]:
x_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3420 stored elements and shape (855, 913)>

In [27]:
from sklearn.datasets import fetch_california_housing as dataset

In [28]:
housing=dataset()

In [29]:
housing["data"]

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [30]:
housing_df=pd.DataFrame(housing["data"],columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [31]:
housing_df["target"]=housing["target"]
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [32]:
x=housing_df.drop("target",axis=1)

In [33]:
y=housing_df["target"]

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [35]:

model=RandomForestRegressor()

In [36]:
model.fit(x_train,y_train)

In [37]:
model.score(x_test,y_test)

0.808599671594818

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
model2=LinearRegression()

In [40]:
model2.fit(x_train,y_train)

In [41]:
model2.score(x_test,y_test)

0.596798010199534