# Section 1: Data preprocessing process

### Import libraries

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection  import train_test_split

### Load dataset

In [None]:
# Read the raw data from a CSV file into a pandas DataFrame.
# The path is kept in its own variable so it is easy to point at another file.
data_path = 'Data.csv'
dataset = pd.read_csv(filepath_or_buffer=data_path)

### Create matrix of features and dependent variable

In [None]:
# Separate predictors from the target: every column except the last goes into
# the matrix of features 'X' (independent variables); the last column alone is
# the dependent variable 'Y'. Both are extracted from the DataFrame via `.values`.
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

### Solve missing data problem with mean values

In [None]:
# Fill missing entries (NaN) with the mean of their column.
# Only the slice 1:3 (columns 1 and 2) is imputed — presumably the numeric columns.
# NOTE(review): the imputer is fitted on the whole of X, before the train/test
# split performed further down, so test rows contribute to the means — confirm
# this is acceptable.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit(X[:, 1:3])  # learn the per-column means
X[:, 1:3] = imputer.transform(X[:, 1:3])  # write the filled values back in place

### Encode categorical variables

In [None]:
# Encode categorical variables (non-numerical values such as city, country, yes/no).
# Column 0 is one-hot encoded into dummy variables so that every category is
# treated with the same importance; the remaining columns pass through unchanged.
# sparse_threshold=0 forces a dense array: with the default (0.3), a column with
# many categories makes the transformer return a sparse matrix, which the
# mean-centering StandardScaler used later in this file cannot handle.
# NOTE(review): the transformer is labelled "Country", which presumably matches
# column 0 of Data.csv — confirm.
ct = ColumnTransformer(
    [("Country", OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)
# Encode the categorical dependent variable Y as integer class labels.
Y = LabelEncoder().fit_transform(Y)

### Split dataset into training and test sets

In [None]:
# Hold out 20% of the samples as a test set and keep the rest for training.
# Fixing random_state makes the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=42,
)

### Feature Scaling 

In [38]:
# Standardise the features so that columns with large numeric ranges do not
# dominate models that are sensitive to feature scale (e.g. distance-based ones).
# The scaler is fitted on the training set only, and that same fitted
# transformation is then applied to both the training and the test set.
# NOTE(review): every column of X is scaled, including any one-hot dummy
# columns — confirm this is intended.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

# Section 1: Data preprocessing template

In [None]:
### Required
# Load the dataset from disk.
data_path = 'Data.csv'
dataset = pd.read_csv(data_path)
# Create the matrix of features (every column except the last) and the
# dependent variable (the last column).
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values
# Split the dataset into training and test sets (20% held out for testing);
# the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Optional
# Feature scaling — uncomment the lines below when it is needed. The code is
# kept as comments rather than inside a bare string literal, so nothing is
# evaluated (or echoed as the cell's output) while it is disabled.
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)