# Data preprocessing template

## Step 1 - Import libraries

In [62]:
import pandas as pd

# Since the scikit-learn library is big, we only import what we need

# train_test_split : function that splits the dataset (DataFrame) into two distinct subsets : one for training, one for testing
from sklearn.model_selection import train_test_split

# SimpleImputer : class which allows us to fill missing values (null, NaN) with a specific strategy (mean, median, most frequent, constant)
from sklearn.impute import SimpleImputer

# StandardScaler : class which allows us to standardize (put on the same scale) numerical features (using the z-score)
# OneHotEncoder : class which allows us to encode categorical features (creates one column per category, containing only 0 and 1)
# LabelEncoder : class which allows us to encode the target labels as integers (from 0 to n_classes - 1)
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# ColumnTransformer : class which allows us to apply scaler and encoder objects to specific columns of our DataFrames, returning numpy arrays
from sklearn.compose import ColumnTransformer

## Step 2 - Import dataset 

In [63]:
# Location of the CSV file. This is an absolute path on the original author's
# machine, so it has to be adapted before running the notebook elsewhere.
csv_path = "/Users/qxzjy/vscworkspace/dse-ft-100/ml_module/data/Data.csv"

# Load the file into a DataFrame and preview its first rows
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [64]:
# Report the dimensions of the dataset as a (#rows, #columns) tuple
dataset_shape = df.shape
print(f"Rows , Columuns : {dataset_shape}")
print()

# Summary statistics of the dataset.
# include="all" makes describe() cover every column, not only the numerical ones:
# the "count" row reveals missing values (null / NaN) and the remaining rows hint
# at the kind of data each column holds (numerical, categorical, labels, etc.)
dataset_summary = df.describe(include="all")
print(dataset_summary)

Rows , Columuns : (10, 4)

       Country        Age        Salary Purchased
count       10   9.000000      9.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  63777.777778       NaN
std        NaN   7.693793  12265.579662       NaN
min        NaN  27.000000  48000.000000       NaN
25%        NaN  35.000000  54000.000000       NaN
50%        NaN  38.000000  61000.000000       NaN
75%        NaN  44.000000  72000.000000       NaN
max        NaN  50.000000  83000.000000       NaN


## Step 3 - Separate Target from feature variables

In [None]:
print("Separating labels from features...")

# Names of the columns used as features (explanatory variables)
features_list = ["Country", "Age", "Salary"]
# Name of the column we want to predict (target variable)
target_name = "Purchased"

# X is a DataFrame holding the features, y is a Series holding the target
X = df.loc[:, features_list]
y = df.loc[:, target_name]

print("...Done.")
print()

# Preview the first rows of both objects, separated by an empty line
print(X.head(), y.head(), sep="\n\n")

Separating labels from features...
...Done.

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN

0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object


## Step 4 - Train / Test split 

In [66]:
print("Splitting dataset into train set and test set...")

# train_test_split returns four objects: the features and the target, each cut into
# a training part (used to fit the model) and a test part (used to evaluate it)
# X : DataFrame of features
# y : Series holding the target variable
# test_size : proportion of the dataset kept for the TEST set => 0.2 (20%)
# random_state : seed of the random shuffling; fixing it makes the split reproducible
# stratify : optional argument (not passed here) that keeps the same proportion of
#            each category in the train and test sets
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

print("...Done.")
print()

# Show the training features and the training target, separated by an empty line
print(X_train, y_train, sep="\n\n")

Splitting dataset into train set and test set...
...Done.

   Country   Age   Salary
4  Germany  40.0      NaN
9   France  37.0  67000.0
1    Spain  27.0  48000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
3    Spain  38.0  61000.0
0   France  44.0  72000.0
5   France  35.0  58000.0

4    Yes
9    Yes
1    Yes
6     No
7    Yes
3     No
0     No
5    Yes
Name: Purchased, dtype: object


## Step 5 - Training

In [67]:
# Banner marking the start of the training part of the notebook
section_title = "--- Training pipeline ---"
print(section_title)

--- Training pipeline ---


### Imputing missing values

In [None]:
print("Imputing missing values...")
print()

# SimpleImputer with strategy="mean" replaces each missing value with the mean of
# its column; the means are learned from the data passed to fit / fit_transform
imputer = SimpleImputer(strategy="mean")

# Work on an explicit copy so that the assignment below does not write into
# a slice of another DataFrame
X_train = X_train.copy()
print(X_train)
print()

# Numerical columns that contain missing values. They are selected by label rather
# than by position, so the cell keeps working if the order of the columns changes.
numeric_columns = ["Age", "Salary"]

# Fit the imputer on the training set and fill the missing values in one step
X_train.loc[:, numeric_columns] = imputer.fit_transform(X_train[numeric_columns])

print("...Done!")
print()

print(X_train)

Imputing missing values...

   Country   Age   Salary
4  Germany  40.0      NaN
9   France  37.0  67000.0
1    Spain  27.0  48000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
3    Spain  38.0  61000.0
0   France  44.0  72000.0
5   France  35.0  58000.0

...Done!

   Country        Age        Salary
4  Germany  40.000000  62428.571429
9   France  37.000000  67000.000000
1    Spain  27.000000  48000.000000
6    Spain  38.428571  52000.000000
7   France  48.000000  79000.000000
3    Spain  38.000000  61000.000000
0   France  44.000000  72000.000000
5   France  35.000000  58000.000000


### Standardizing (scaling) and encoding

In [None]:
print("Encoding categorical features and standardizing numerical features...")
print()

# Positions of the categorical columns, which will be one-hot encoded.
# drop="first" removes the column of the first category of each feature.
categorical_features = [0]
categorical_transformer = OneHotEncoder(drop="first")

# Positions of the numerical columns, which will be standardized
numeric_features = [1, 2]
numeric_transformer = StandardScaler()

# Each entry is (name, transformer, columns the transformer is applied to)
transformer_steps = [
    ("cat", categorical_transformer, categorical_features),
    ("num", numeric_transformer, numeric_features),
]

# ColumnTransformer applies every transformer to its own columns and
# concatenates the results (here : encoded columns first, then scaled columns)
featureencoder = ColumnTransformer(transformers=transformer_steps)

# Fit the transformers on the training features and transform them
X_train = featureencoder.fit_transform(X_train)
print("...Done.")
print()
print(X_train[:5])  # first 5 rows (plain slicing : X_train is now a numpy array, not a DataFrame)
print()

# Encoding labels
print("Encoding labels...")
print(y_train)
print()

# NOTE : the encoded labels are stored in Y_train (capital Y);
# y_train still holds the original string labels
labelencoder = LabelEncoder()
Y_train = labelencoder.fit_transform(y_train)

print("...Done.")
print()
print(Y_train[:5])  # first 5 encoded labels (plain slicing : Y_train is a numpy array)
print()

Encoding categorical features and standardizing numerical features...

...Done.

[[ 0.          1.          0.          0.27063731  0.        ]
 [ 1.          0.          0.         -0.24603392  0.47997   ]
 [ 0.          0.          1.         -1.96827133 -1.51490532]
 [ 0.          0.          1.          0.         -1.09493157]
 [ 1.          0.          0.          1.64842723  1.73989126]]

Encoding labels...
4    Yes
9    Yes
1    Yes
6     No
7    Yes
3     No
0     No
5    Yes
Name: Purchased, dtype: object

...Done.

[1 1 1 0 1]



In [70]:
# Placeholder : the model training itself is not covered at this stage
training_placeholder = "*** HERE WILL BE THE TRAINING STEP (NOT IN THE SCOPE AT THIS STAGE OF THE LECTURE) ***"
print(training_placeholder)

*** HERE WILL BE THE TRAINING STEP (NOT IN THE SCOPE AT THIS STAGE OF THE LECTURE) ***


## Step 6 - Testing

In [71]:
print("--- Test pipeline ---")

# Missing values
print("Imputing missing values...")
print()
print(X_test)
print()
# Work on an explicit copy so that the assignment below does not write into
# a slice of another DataFrame
X_test = X_test.copy()

# Numerical columns to impute, selected by label rather than by position so the
# cell keeps working if the order of the columns changes
numeric_columns = ["Age", "Salary"]

# /!\ We only call transform on the test set : the imputer is not re-fitted here /!\
X_test.loc[:, numeric_columns] = imputer.transform(X_test[numeric_columns])
print("...Done.")
print()
print(X_test)
print()

# Encoding categorical features and standardizing numeric features
print("Encoding categorical features and standardizing numerical features...")
print()

# /!\ We only call transform on the test set : the encoder / scaler are not re-fitted here /!\
X_test = featureencoder.transform(X_test)
print("...Done.")
print()
print(X_test)
print()

# Encoding labels
print("Encoding labels...")
print()
print(y_test)
print()

# /!\ We only call transform on the test set : the label encoder is not re-fitted here /!\
y_test = labelencoder.transform(y_test)
print("...Done.")
print()
print(y_test)
print()


--- Test pipeline ---
Imputing missing values...

   Country   Age   Salary
2  Germany  30.0  54000.0
8  Germany  50.0  83000.0

...Done.

   Country   Age   Salary
2  Germany  30.0  54000.0
8  Germany  50.0  83000.0

Encoding categorical features and standardizing numerical features...

...Done.

[[ 0.          1.          0.         -1.4516001  -0.88494469]
 [ 0.          1.          0.          1.99287472  2.15986501]]

Encoding labels...

2    No
8    No
Name: Purchased, dtype: object

...Done.

[0 0]



## Step 7 - Predict and evaluate

In [72]:
# Placeholders for the steps that are not implemented yet :
# each message is printed followed by an empty line
placeholders = (
    "*** HERE WILL BE THE PREDICTION STEP ***",
    "*** HERE WILL BE THE ASSESSMENT OF PERFORMANCES ***",
)
for placeholder in placeholders:
    print(placeholder)
    print()

*** HERE WILL BE THE PREDICTION STEP ***

*** HERE WILL BE THE ASSESSMENT OF PERFORMANCES ***

