# Обработка признаков

В этом задании решаем задачу предсказания стоимости автомобилей по их различным характеристикам.

In [None]:
import pandas as pd

# Seed intended for the randomized steps of the notebook, so that reruns give the same results.
RANDOM_STATE = 42

In [None]:
# Load the car-prices dataset directly from the course repository.
DATA_URL = "https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/cars_prices.csv"
df = pd.read_csv(DATA_URL, decimal='.')

In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### Описание некоторых признаков

`symboling` - rating corresponds to the degree to which the auto is more risky than its price indicates (+3 more risk and -3 is pretty safe)  
`make` - car types (i.e. car brand)  
`fuel-type` - types of fuel (gas or diesel)  
`aspiration` - engine aspiration (standard or turbo)  
`num-of-doors` - numbers of doors (two or four)  
`body-style` - car body style (e.g. sedan or hatchback)  
`drive-wheels` - type of drive wheels (front-wheel drive - fwd, rear-wheel drive - rwd, four-wheel drive - 4wd)  
`engine-location` - engine mounted location (front or back)  
`wheel-base` - расстояние между осями передних и задних колес  
`length` - car length  
`curb-weight` - car weight  
`width` - car width  
`height` - car height  

In [None]:
# (rows, columns) of the raw dataset.
df.shape

(205, 26)

In [None]:
# Column dtypes and non-null counts of the raw dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

## Заполнение пропусков

Пропуски в этом датасете обозначены как `?`

In [None]:
# Missing values are stored as the string '?', so count the rows that hold
# this marker in every column.
for c in df.columns:
    is_marker = df[c] == '?'
    print(c, int(is_marker.sum()))

symboling 0
normalized-losses 41
make 0
fuel-type 0
aspiration 0
num-of-doors 2
body-style 0
drive-wheels 0
engine-location 0
wheel-base 0
length 0
width 0
height 0
curb-weight 0
engine-type 0
num-of-cylinders 0
engine-size 0
fuel-system 0
bore 4
stroke 4
compression-ratio 0
horsepower 2
peak-rpm 2
city-mpg 0
highway-mpg 0
price 4


Удалим строки, для которых неизвестно значение price, так как это целевая переменная.

Сколько строк осталось в данных:

In [None]:
# Rows without a known price cannot be used for supervised learning, so drop
# them. The explicit .copy() makes `df` an independent frame rather than a
# slice of the original one, so that modifying it in later cells does not
# trigger SettingWithCopyWarning.
df = df[df['price'] != '?'].copy()
df.shape

(201, 26)

Заполним средним значением пропуски в столбцах для числовых признаков и самым популярным значением для категориальных признаков
* `num-of-doors`
* `bore`
* `stroke`
* `horsepower`
* `peak-rpm`

In [None]:
import numpy as np

cats = ['num-of-doors']
nums = ['bore', 'stroke', 'horsepower', 'peak-rpm']

# Turn the '?' placeholders into real NaN values. The result is reassigned
# instead of using inplace=True: in-place modification of a filtered frame
# raises SettingWithCopyWarning and is not reliable.
df = df.replace('?', np.nan)

# Categorical columns: fill the gaps with the most frequent category.
# Assigning the result back (rather than calling fillna(inplace=True) on the
# selected column) avoids chained assignment.
for c in cats:
    mode_value = df[c].mode().iloc[0]
    df[c] = df[c].fillna(mode_value)
    print(f'{c} : {df[c].mode()}')

# Numeric columns: they were read as text because of the '?' markers, so
# convert them to numbers first and then fill the gaps with the column mean.
for n in nums:
    df[n] = pd.to_numeric(df[n], errors='coerce')
    mean_value = df[n].mean()
    df[n] = df[n].fillna(mean_value)
    print(f'{n} : {df[n].mean()}')


num-of-doors : 0    four
Name: num-of-doors, dtype: object
bore : 3.330710659898477
stroke : 3.2569035532994914
horsepower : 103.39698492462313
peak-rpm : 5117.587939698492


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace('?', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c].fillna(mode_value, inplace=True)


In [None]:
# Check the imputation: number of NaN values left in every column.
nan_counts = df.isna().sum()
for c in df.columns:
    count_nan = nan_counts[c]
    print(c, count_nan)

symboling 0
normalized-losses 37
make 0
fuel-type 0
aspiration 0
num-of-doors 0
body-style 0
drive-wheels 0
engine-location 0
wheel-base 0
length 0
width 0
height 0
curb-weight 0
engine-type 0
num-of-cylinders 0
engine-size 0
fuel-system 0
bore 0
stroke 0
compression-ratio 0
horsepower 0
peak-rpm 0
city-mpg 0
highway-mpg 0
price 0


In [None]:
# Column dtypes and non-null counts after the simple imputation above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non-null

In [None]:
# Preview the first 15 rows after the simple imputation.
df.head(15)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450
5,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710
7,1,,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,18920
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875
10,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101.0,5800.0,23,29,16430


Среднее значение `peak-rpm` до заполнения пропусков:


Пропуски в столбце `normalized-losses` предскажем при помощи линейной регрессии по признакам
`symboling`, `wheel-base`, `length`, `width`, `height`, `curb-weight`, `engine-size`, `compression-ratio`, `city-mpg`, `highway-mpg` и заполним их предсказаниями

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors used to estimate the missing `normalized-losses` values.
loss_features = [
    'symboling', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg',
]

# The column is stored as text; convert it to numbers (anything that cannot
# be parsed becomes NaN).
df['normalized-losses'] = pd.to_numeric(df['normalized-losses'], errors='coerce')

# Rows with a known value are the training data; the rest are to be filled.
losses_missing = df['normalized-losses'].isnull()
df_no_nan = df[~losses_missing]
df_nan = df[losses_missing]

X = df_no_nan[loss_features]
y = df_no_nan['normalized-losses']

# Fit a linear regression on the rows where the value is known.
model = LinearRegression()
model.fit(X, y)

# Predict the missing values and write them back into the frame.
X_nan = df_nan[loss_features]
predictions = model.predict(X_nan)
df.loc[losses_missing, 'normalized-losses'] = predictions

# Show the rows that have just been filled in.
filled_data = df[df.index.isin(df_nan.index)]
filled_data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,168.072493,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,168.072493,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,134.001799,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
5,2,150.033477,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250
7,1,124.364599,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,18920
14,1,136.541277,bmw,gas,std,four,sedan,rwd,front,103.5,...,164,mpfi,3.31,3.19,9.0,121.0,4250.0,20,25,24565
15,0,127.287711,bmw,gas,std,four,sedan,rwd,front,103.5,...,209,mpfi,3.62,3.39,8.0,182.0,5400.0,16,22,30760
16,0,138.090392,bmw,gas,std,two,sedan,rwd,front,103.5,...,209,mpfi,3.62,3.39,8.0,182.0,5400.0,16,22,41315
17,0,130.513069,bmw,gas,std,four,sedan,rwd,front,110.0,...,209,mpfi,3.62,3.39,8.0,182.0,5400.0,15,20,36880
43,0,113.647804,isuzu,gas,std,four,sedan,rwd,front,94.3,...,111,2bbl,3.31,3.23,8.5,78.0,4800.0,24,29,6785


## 2. Кодирование категориальных признаков

1. Закодируем бинарные признаки `fuel-type`, `aspiration`, `num-of-doors`, `engine-location` каждый отдельной колонкой, состоящей из 0 и 1.
Единицей кодируем самую частую категорию.

In [None]:
# Encode every two-category feature as a 0/1 column in which 1 marks the
# most frequent category.
binary_features = ['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']

for feature in binary_features:
    most_frequent = df[feature].mode().iloc[0]
    df[feature] = df[feature].eq(most_frequent).astype(int)

2. Вынесем в переменную `y` целевую переменную `price`, а все остальные колонки - в матрицу `X`.

Закодируем признаки `make`, `body-style`, `engine-type`, `fuel-system` при помощи LeaveOneOutEncoder.

In [None]:
# Notebook shell command: install the category_encoders package (-q keeps pip quiet).
!pip install category_encoders -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/81.9 kB[0m [31m641.5 kB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m61.4/81.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m806.4 kB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# Separate the target from the features. `price` is still stored as text at
# this point, so cast it explicitly instead of relying on the encoder to
# convert the target internally.
y = df['price'].astype(float)
X = df.drop('price', axis=1)

cat_feat = ['make', 'body-style', 'engine-type', 'fuel-system']

# Leave-one-out target encoding of the multi-category features.
# handle_unknown='value' is the current spelling of the "fill unknown
# categories with the target mean" policy; the legacy 'impute' value is not
# among the options documented for category_encoders 2.x.
# NOTE(review): the encoder is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook.
encoder = LeaveOneOutEncoder(cols=cat_feat, handle_unknown='value')
X_encoded = encoder.fit_transform(X, y)

# Mean of the encoded `body-style` column.
mean_body_style = X_encoded['body-style'].mean()

X_encoded

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,168.072493,16500.0,1,1,0,23569.600000,rwd,1,88.6,...,four,130,17650.307692,3.47,2.68,9.0,111.0,5000.0,21,27
1,3,168.072493,14997.5,1,1,0,22968.600000,rwd,1,88.6,...,four,130,17617.285714,3.47,2.68,9.0,111.0,5000.0,21,27
2,1,134.001799,14997.5,1,1,0,9859.791045,rwd,1,94.5,...,six,152,17617.285714,2.68,3.47,9.0,154.0,5000.0,19,26
3,2,164.000000,18641.0,1,1,1,14465.236559,fwd,1,99.8,...,four,109,17645.307692,3.19,3.40,10.0,102.0,5500.0,24,30
4,2,164.000000,17941.0,1,1,1,14427.602151,4wd,1,99.4,...,five,136,17606.846154,3.19,3.40,8.0,115.0,5500.0,18,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.000000,18185.0,1,1,1,14434.107527,rwd,1,109.1,...,four,141,17613.494505,3.78,3.15,9.5,114.0,5400.0,23,28
201,-1,95.000000,17965.0,1,0,1,14410.451613,rwd,1,109.1,...,four,141,17589.318681,3.78,3.15,8.7,160.0,5300.0,19,25
202,-1,95.000000,17721.0,1,1,1,14384.215054,rwd,1,109.1,...,six,173,17562.505495,3.58,2.87,8.8,134.0,5500.0,18,23
203,-1,95.000000,17622.5,0,0,1,14373.623656,rwd,1,109.1,...,six,145,15489.105263,3.01,3.40,23.0,106.0,4800.0,26,27


3. Закодируем признак `drive-wheels` при помощи OHE из библиотеки category_encoders.

In [None]:
from category_encoders.one_hot import OneHotEncoder

# One-hot encode `drive-wheels` with the category_encoders OneHotEncoder, as
# the task requires (previously pandas.get_dummies was used and the imported
# encoder was left unused). The column is replaced by one indicator column
# per category; use_cat_names=True names the new columns after the category
# values instead of numbering them.
ohe = OneHotEncoder(cols=['drive-wheels'], use_cat_names=True)
X = ohe.fit_transform(X_encoded)

4. В столбце `num-of-cylinders` категории упорядочены по смыслу. Закодируем их подряд идущими числами, начиная с 1, согласно смыслу.


In [None]:
# List the distinct categories of `num-of-cylinders` before encoding them.
df['num-of-cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [None]:
# Ordinal encoding of `num-of-cylinders`: the categories are listed in their
# natural order and numbered consecutively starting from 1, as the task
# requires (previously the raw cylinder counts 2..12 were used).
cylinder_order = ['two', 'three', 'four', 'five', 'six', 'eight', 'twelve']
cylinder_mapping = {
    name: rank for rank, name in enumerate(cylinder_order, start=1)
}

encoded_cylinders = X['num-of-cylinders'].map(cylinder_mapping)

# Series.map() silently yields NaN for values missing from the mapping;
# fail loudly instead of passing NaN on to the model.
if encoded_cylinders.isna().any():
    unmapped = X.loc[encoded_cylinders.isna(), 'num-of-cylinders'].unique()
    raise ValueError(f'Unexpected num-of-cylinders values: {list(unmapped)}')

X['num-of-cylinders'] = encoded_cylinders

In [None]:
# (rows, columns) of the feature matrix after all encodings.
X.shape

(201, 26)

In [None]:
# Make sure the imputed numeric features and the target are stored as floats.
float_columns = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']
for column in float_columns:
    X[column] = X[column].astype(float)

y = y.astype(float)

Разобьём данные на тренировочную и тестовую часть в пропорции 3 к 1.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing (3:1 split). The seed comes from
# the RANDOM_STATE constant defined at the top of the notebook instead of a
# repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

Масштабируем данные при помощи MinMaxScaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters on the training part only, then apply the
# same transformation to both the training and the test part.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Train a fresh linear regression for the price. Creating the estimator here
# (instead of reusing the `model` object left over from the
# `normalized-losses` imputation cell) keeps this cell independent of
# notebook state created for a different purpose.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# R^2 on the held-out test data.
r2 = r2_score(y_test, y_pred)

r2

0.9097057418100629