<a href="https://colab.research.google.com/github/strzelnat/machine_learning_cor/blob/main/model_prep_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Standaryzacja danych numerycznych**

Standaryzacja to technika przekształcania danych numerycznych tak, aby każda cecha miała wspólną skalę: średnią 0 i odchylenie standardowe 1, czyli $z = (x - \mu) / \sigma$. Jest szczególnie użyteczna w algorytmach opartych na odległościach (np. k-NN, SVM) oraz w modelach uczonych metodami gradientowymi lub z regularyzacją (np. regresja logistyczna).


In [2]:
import pandas as pd
import numpy as np


In [3]:
# Small hand-written sample: each keyword is a column name and each list
# holds that column's five row values (three text features, two numeric
# features and a yes/no `bought` column).
data = dict(
    size=['XL', 'L', 'M', 'L', 'M'],
    color=['red', 'green', 'blue', 'green', 'red'],
    gender=['female', 'male', 'male', 'female', 'female'],
    price=[199.0, 89.0, 99.0, 129.0, 79.0],
    weight=[500, 450, 300, 380, 410],
    bought=['yes', 'no', 'yes', 'no', 'yes'],
)

data

{'size': ['XL', 'L', 'M', 'L', 'M'],
 'color': ['red', 'green', 'blue', 'green', 'red'],
 'gender': ['female', 'male', 'male', 'female', 'female'],
 'price': [199.0, 89.0, 99.0, 129.0, 79.0],
 'weight': [500, 450, 300, 380, 410],
 'bought': ['yes', 'no', 'yes', 'no', 'yes']}

In [6]:
# Build the raw frame from the column dict; a later cell copies it instead of
# modifying it, so df_raw keeps the original values.
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [7]:
# Work on an independent (deep) copy so changes to df do not touch df_raw,
# then print the column dtypes and non-null counts.
df = df_raw.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 372.0+ bytes


In [9]:
# Declare the target dtypes in a single mapping: the four text columns become
# pandas categoricals and the integer `weight` column becomes float.
dtype_map = {name: 'category' for name in ('size', 'color', 'gender', 'bought')}
dtype_map['weight'] = 'float'
df = df.astype(dtype_map)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 744.0 bytes


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; the quartiles are the describe() defaults, spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [11]:
# Categorical data: count, number of unique values, most frequent value and
# its frequency, transposed so each original column is one row.
df.describe(include=['category']).transpose()

Unnamed: 0,count,unique,top,freq
size,5,3,L,2
color,5,3,green,2
gender,5,2,female,3
bought,5,2,yes,3


In [14]:
from sklearn.preprocessing import scale, StandardScaler

# Preparation for the model: one-hot encode the categorical columns, dropping
# the first level of each (drop_first=True).
# `dtype=int` makes only the generated dummy columns 0/1 integers. Casting the
# whole frame with `.astype(int)` would also convert the continuous `price`
# and `weight` columns to integers, truncating any fractional values.
df = pd.get_dummies(df, drop_first=True, dtype=int)
df




Unnamed: 0,price,weight,size_M,size_XL,color_green,color_red,gender_male,bought_yes
0,199,500,0,1,0,1,0,1
1,89,450,0,0,1,0,1,0
2,99,300,1,0,0,0,1,1
3,129,380,0,0,1,0,0,0
4,79,410,1,0,0,1,0,1


In [15]:
# Standardize the two continuous columns: the scaler is fitted on these
# columns and then applied to the same rows; all other columns stay as is.
numeric_cols = ['weight', 'price']
scaler = StandardScaler()
scaler.fit(df[numeric_cols])

df[numeric_cols] = scaler.transform(df[numeric_cols])
df

Unnamed: 0,price,weight,size_M,size_XL,color_green,color_red,gender_male,bought_yes
0,1.845062,1.366002,0,1,0,1,0,1
1,-0.691898,0.62361,0,0,1,0,1,0
2,-0.461266,-1.603567,1,0,0,0,1,1
3,0.230633,-0.41574,0,0,1,0,0,0
4,-0.922531,0.029696,1,0,0,1,0,1
