In [9]:
# Data Preprocessing Template

# Created by Sofie Aspeslagh 

## ------- How to use ------- ##

# Any steps that need to be done within the code will be annotated with:
# ----->>

# Follow the steps below to adapt this template to your own data:
# Step 1 - Remove lst variable and import your own data as df
# Step 2 - Check that the dependent variable is the last column. If not change the values for setting X and y
# Step 3 - Check for missing data
# Step 4 - If there is missing data, either delete rows, replace by mean or consider using other ML techniques
# Step 5 - Encode any Categorical features. If binary use LabelEncoder, if multi-class use OneHotEncoder
# Step 6 - Split data into train and test sets
# Step 7 - Feature scale any numerical data (might also include encoded data)
# Step 8 - Delete/comment any sections you haven't used
# Step 9 - Move libraries to the top

# TODO - add in print heads instead of all data
# add in text pre processing
# finish off prints
# watch a cloud guru for more stuff that might need to be added in 


## ------- Import libraries and data ------- ##

import pandas as pd
# ----->> put additional libraries here

# Small example data set: one row per customer, in the column order
# Country, Age, Salary, Purchased. Note the None entries (one Age, one
# Salary): they become NaN once the rows are loaded into the DataFrame.
lst = [
    ['France', 44, 72000, 'No'],
    ['Spain', 27, 48000, 'Yes'],
    ['Germany', 30, 54000, 'No'],
    ['Spain', 38, 61000, 'No'],
    ['Germany', 40, None, 'Yes'],
    ['France', 35, 58000, 'Yes'],
    ['Spain', None, 52000, 'No'],
    ['France', 48, 79000, 'Yes'],
    ['Germany', 50, 83000, 'No'],
    ['France', 37, 67000, 'Yes'],
]

# ----->> remove lst and import data as df i.e df = pd.read_csv('filename.csv')
df = pd.DataFrame(data=lst, columns=['Country', 'Age', 'Salary', 'Purchased'])

# Split positionally: every column except the last one holds the independent
# variables (X); the last column is the dependent variable (y).
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

print('IMPORTING DATA \n')
print('X: \n')
print(X)
print('\ny: \n')
print(y)
print('\n','_'*7,'\n')
# ----->> check that X and y are correct


## ------- Detecting missing values ------- ##

# Flag every NaN/None cell, then total the flags per column.
# Keep in mind that a data set may represent missing values in other ways
# as well, and those will not show up in this count.
null_mask = df.isnull()
missing_data = null_mask.sum()

print('MISSING DATA \n')
print('Missing values in each feature: \n')
print(missing_data)
print('\n','_'*7,'\n')



## ------- Replace missing values by the Mean ------- ##

import numpy as np
from sklearn.impute import SimpleImputer

print('REPLACING MISSING DATA \n')
print('Check X before imputer: \n\n', X, '\n\n')

# Mean imputation: each NaN is replaced by the mean of its own column.
# For more info use help(SimpleImputer).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only the columns that contain missing data (indices 1 and 2) are handed to
# the imputer. fit_transform learns the column means and fills the gaps in a
# single call; `imputer` remains the fitted object afterwards.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print('Check X after imputer: \n\n', X)
print('\n','_'*7,'\n')



## ------- Categorical encoding ------- ##

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

print('CATEGOTICAL ENCODING \n')
print('X before encoding: \n\n', X, '\n\n')

# Create Column Transformer object for multi-class categorical features:
# one-hot encode column 0 and pass the remaining columns through unchanged.
columntransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')

# take a copy for mapping purposes (X itself is overwritten by the encoding below)
X_names = X.copy()

# fit to data and replace X with the encoded array
X = columntransformer.fit_transform(X)

# A second transformer with remainder='drop' is fitted on the copy purely to
# recover the names of the one-hot encoded columns.
columntransformer_names = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='drop')
X_names = columntransformer_names.fit_transform(X_names)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and only fall back to the legacy
# method on older installations.
if hasattr(columntransformer_names, 'get_feature_names_out'):
    X_mapping_vals = columntransformer_names.get_feature_names_out()
else:
    X_mapping_vals = columntransformer_names.get_feature_names()

print('X after encoding: \n\n', X, '\n\n')
print('X mapping (col names): \n\n', X_mapping_vals, '\n\n')


# show the dependent variable (y) before it is encoded
print('y before encoding: \n\n', y, '\n\n')

# label encoder object for binary categorical features
labelencoder_y = LabelEncoder()

# fit object to the data and replace categorical data with integer codes
y = labelencoder_y.fit_transform(y)

# get mapping of original class label -> encoded integer
y_mapping_vals = dict(zip(labelencoder_y.classes_, labelencoder_y.transform(labelencoder_y.classes_)))

print('y after encoding: \n\n', y, '\n\n')
print('y mapping: \n\n', y_mapping_vals, '\n')
print('\n','_'*7,'\n')

## ------- Splitting into Training and Test set ------- ##

from sklearn.model_selection import train_test_split

# Hold back 20% of the rows as the test set. Fixing random_state gives the
# same split on every run; the call works fine without it, just not repeatably.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

print('SPLITTING DATA \n')


## ------- Feature Scaling ------- ##
from sklearn.preprocessing import StandardScaler

# Standardization: the scaler is fitted on the training data only, so the
# test set is scaled with exactly the same fit as the training set.
sc_X = StandardScaler().fit(X_train)

# Apply that single fit to both sets
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)


# Scaling y only needs to be done for regression, not classification.
# StandardScaler expects a 2-D array, so a 1-D y has to be reshaped first:
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train.reshape(-1, 1))

IMPORTING DATA 

X: 

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

y: 

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']

 _______ 

MISSING DATA 

Missing values in each feature: 

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

 _______ 

REPLACING MISSING DATA 

Check X before imputer: 

 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] 


Check X after imputer: 

 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]