In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
df = pd.read_csv(r'C:\Users\Stelios_Ntanavaras\Documents\Python Scripts\Machine-Learning-A-Z-AI-Python-R-ChatGPT-Prize-2024-\Machine Learning\Data.csv')
df.head()

# Split my data to X (independent variables) and y (dependent variable)
# .iloc : locate indexes, take the indexes of the columns we want to extract from the dataset
X = df.iloc[:, :-1].values # -1 in Python is the index of the last column (exluded)
y = df.iloc[:, -1].values

print('My independent variables are:', X)
print('My depdentent variabel is:', y)

My independent variables are: [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
My depdentent variabel is: ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [2]:
# Taking care of Missing Data (avg method)

from sklearn.impute import SimpleImputer

# Replace the missing value by the avg of values of the column the missing value belongs to
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X[:, 1:3]) # The '.fit' method looks for only numerical missing values, that is why we exclude the first column (categorical)
X[:, 1:3] = imputer.transform(X[:, 1:3]) # The '.transform' method does the transformation from missing data to the avg of all values

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [3]:
# Encoding Categorical Data

# We could just tag each categorical data into a number, like Germany = 0, France = 1, and Spain = 2.
# But in this way our model will understand that there is a numerical order between these three countries.
# But we want to avoid the model to understand this correlation because it is not true, we want the numbers just to be labels.
# So, we are going to do OneHotEncoding (where there is no numerical order).

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough') # 'remainder' arg is to keep the columns that the transformation is not applied (Age and Salary cols)
X = np.array(ct.fit_transform(X))

print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [4]:
# Encoding the Dependent Variable y

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y) # doesn't need to be a numpy array

print(y)


[0 1 0 0 1 1 0 1 0 1]


We apply Feature Scaling !!! AFTER !!! splitting the dataset into training and testing set. That's because the test set suppose to be a brand new set where we are going to evaluate our model. If we do feature scaling on test set before the split we will have a test set leakage.

In [5]:
# Split the dataset into Training and Testing set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

Do we need to apply Standardization on feature columns with dummy variables? !!! NO !!! i.e. if we apply this on dummy vectors like OneHot vectors, we will lose the interpretation of which country belongs to which vector.

In [6]:
# Feature Scaling (purpose: prevent some features dominate other features)
# Feature Scaling is always applying to COLUMNS, not ROWS.

from sklearn.preprocessing import StandardScaler

# First Technique: STANDARDIZATION (almost all the values inside the columns will be mapped in a range of [-3,3])
# stand = x - mean(x) / std(x)

sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:]) # '.fit' will find the mean and std of all values, and '.transform' will apply the formula
X_test[:, 3:] = sc.fit_transform(X_test[:, 3:])

print(X_train)
print(X_test)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]
[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]


In [8]:
# Second Technique: NORMALIZATION (almost all the values inside the columns will be mapped into a range of [0,1])
# norm = x - min(x) / max(x) - min(x)

