# Data Preprocessing Tools

## Importing the libraries

In [10]:
import numpy as np # array
import matplotlib.pyplot as plt # charts
import pandas as pd # preprocess 

## Importing the dataset

In [11]:
dataset = pd.read_csv("Data.csv")

# 0 1 2 컬럼
# iloc[row_index_range, col_index_range]
# iloc[:,:-1]: all the rows & except last col
# -1: last column
# A:B include lowerbound (A) except upperbound (B)

X = dataset.iloc[:, :-1].values

# dependent variables vector
# last col
# 모든 행, 마지막 열
y = dataset.iloc[:, -1].values


In [12]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [13]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [14]:
# 1% remove data

# replace missing data to avg value
# SimpleImputer 
# Univariate imputer for completing missing values with simple strategies.
# Replace missing values using a descriptive statistic (e.g. mean, median, or most frequent) along each column, or using a constant value.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3]) # Age, Salary # Select all the numberic columns
X[:, 1:3] = imputer.transform(X[:, 1:3]) # Return updated dataset 


In [15]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

avoid mis interpretation

one hot Encoding

country column -> 3 colums (3 different classes) / 5 cols (5 diff classes)

France  - 1 0 0 
Spain   - 0 1 0 
Germany - 0 0 1

Avoid numeric order

Purchased - 1 0 - Binary outcome

### Encoding the Independent Variable

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [17]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [19]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


Coding Exercise 3: Encoding Categorical Data for Machine Learning
1: Import required libraries - Pandas, Numpy, and required classes for this task - ColumnTransformer, OneHotEncoder, LabelEncoder.

2: Start by loading the Titanic dataset into a pandas data frame. This can be done using the pd.read_csv function. The dataset's name is 'titanic.csv'.

3: Identify the categorical features in your dataset that need to be encoded. You can store these feature names in a list for easy access later.

4: To apply OneHotEncoding to these categorical features, create an instance of the ColumnTransformer class. Make sure to pass the OneHotEncoder() as an argument along with the list of categorical features.

5: Use the fit_transform method on the instance of ColumnTransformer to apply the OneHotEncoding.

6: The output of the fit_transform method should be converted into a NumPy array for further use.

7: The 'Survived' column in your dataset is the dependent variable. This is a binary categorical variable that should be encoded using LabelEncoder.

8: Print the updated matrix of features and the dependent variable vector

In [20]:
# # Importing the necessary libraries
# import pandas as pd
# import numpy as np
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# # Load the dataset
# df = pd.read_csv('titanic.csv')

# # Identify the categorical data
# categorical_features = ['Sex', 'Embarked', 'Pclass']

# # Implement an instance of the ColumnTransformer class
# ct = ColumnTransformer(
#     transformers=[
#         ('encoder', OneHotEncoder(), categorical_features)
#     ], remainder='passthrough')

# # Apply the fit_transform method on the instance of ColumnTransformer
# X = ct.fit_transform(df)

# # Convert the output into a NumPy array
# X = np.array(X)

# # Use LabelEncoder to encode binary categorical data
# le = LabelEncoder()
# y = le.fit_transform(df['Survived'])

# # Print the updated matrix of features and the dependent variable vector
# print("Updated matrix of features: \n", X)
# print("Updated dependent variable vector: \n", y)

## Splitting the dataset into the Training set and Test set

훈련세트와 테스트세트 분리는 훈련 세트만 사용하여 모델링을 하고 모델을 테스트 세트를 사용하여 평가하는 것이다.

반면 스케일링은 의존 변수(features) 끼리 독립 변수에 미치는 영향력을 균등하게 하기 위해 사이즈를 조정하는 것이다.

만약, 스케일링은 한 후, 스플릿팅을 하는 경우, 테스트 세트의 정보가 스케일링에 반영되어, 테스트 세트가 순수하게 분리되지 않게 된다.

그렇기 때문에 반드시, 훈련세트와 테스트세트를 분리한 후, 스케일링을 해야한되다.

In [21]:
# from "package.module" import "method"
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X: Independent
# y: Dependent
# train: 80%
# test: 20%

In [22]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [23]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [24]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [25]:
print(y_test)

[0 1]


In [26]:
# # Import necessary libraries
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# # Load the Iris dataset using pd.read_csv
# iris_df = pd.read_csv('iris.csv')

# # Separate features and target
# X = iris_df.drop('target', axis=1)  # Assuming 'target' is the column name       for the target variable
# y = iris_df['target']

# # Split the dataset into an 80-20 training-test set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Apply StandardScaler to scale the feature variables
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Print scaled training and test sets
# print("Scaled Training Set:")
# print(X_train)
# print("\nScaled Test Set:")
# print(X_test)

## Feature Scaling

모든 특성이 같은 크기를 같도록 하는 것
한 특성으로 인해 다른 특성이 무시되는 것을 방지

Standardisation
- (X - mean(x)) / standard deviation(x)
- (X - mu) / sigma
- between -3 and +3
- Always work

Normalisation
- (X - min(x)) / (max(x) - min(x))
- between 0 and 1
- When most of features follow normalisation distribution

In [29]:
from sklearn.preprocessing import StandardScaler
# create instance of class
sc = StandardScaler()

# 더미 값을 스케일 해야하는 가? NO
# 스케일링을 하는 경우 인코딩의 의미를 잃어버린다. 
# "숫자형" 컬럼에만 스케일링을 적용 (3, 4 컬럼)

X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

# 테스트 세트는 트레이닝 세트를 기반으로 스케일링이 되어야 한다
# 트레이닝 세트에서 사용된 스케일러를 그대로 사용
X_test[:, 3:] = sc.fit_transform(X_test[:, 3:])


In [30]:
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [31]:
print(X_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]
