## import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## import dataset

In [2]:
# Load the raw table; the path is resolved relative to the working directory.
dataset = pd.read_csv("Data.csv")

# Feature matrix: every column except the last one.
# Target vector: the last column only.
# `to_numpy()` is the accessor pandas recommends over `.values`.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy()

###### Another approach: select the columns by name (gives a DataFrame and a Series rather than NumPy arrays)
    features = ['Country', 'Age', 'Salary']
    x = dataset[features]
    y = dataset['Purchased']

In [3]:
# Inspect the feature matrix as loaded, before any preprocessing.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Inspect the target vector as loaded, before any encoding.
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## handling missing values

In [5]:
from sklearn.impute import SimpleImputer

In [6]:
# Mean imputation: every NaN cell is replaced by the average of the
# non-missing values found in its own column.
i = SimpleImputer(strategy='mean', missing_values=np.nan)

In [7]:
# Fit the imputer on columns 1 and 2 only; column 0 is left out of the slice.
# NOTE(review): the fit uses every row of X, and the train/test split happens
# later in the notebook, so rows that end up in the test set also influence
# the fill values.
i.fit(X[:,1:3])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [8]:
# Overwrite columns 1 and 2 in place with the imputer's output, i.e. the same
# values with the missing cells filled in.
X[:,1:3]=i.transform(X[:,1:3])

In [9]:
# Check that the NaN cells have been filled.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# categorical data

### One-hot encoding for non-numerical independent variables

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [11]:
# One-hot encode column 0 only; remainder='passthrough' carries every other
# column of X through to the output unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [12]:
# fit_transform() fits the transformer and encodes the data in a single call,
# so no separate fit() and transform() steps are written here (unlike the
# missing-value handling earlier in the notebook).
# NOTE(review): this cell reassigns X, so running it a second time would feed
# the already-encoded matrix back into the encoder.
X = np.array(ct.fit_transform(X))
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [13]:
from sklearn.preprocessing import LabelEncoder
# Label encoding keeps a single column and swaps each class label for an
# integer code; here 'No'/'Yes' become 0/1.
le = LabelEncoder()
Y = le.fit_transform(Y)
# fit_transform() fits and encodes in one call, so no separate fit() and
# transform() steps are needed.
# Y is a single column, so the result is used directly without an np.array() wrapper.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# split the dataset

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [16]:
# Training features and labels, followed by a blank separator line.
print(x_train, y_train, "\n")
# Test features and labels.
print(x_test, y_test)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]] [0 1 0 0 1 1 0 1] 

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]] [0 1]


# feature scaling

In [17]:
from sklearn.preprocessing import StandardScaler
# Scaler that standardises each column: subtract the column mean, then divide
# by the column standard deviation.
sc = StandardScaler()

In [18]:
# Only columns 3 onward (age and salary) are scaled; the one-hot columns 0-2
# are left as they are.
# fit_transform() on the training set: learn each column's mean and standard
# deviation, then standardise the training values with them.
x_train[:,3:] = sc.fit_transform(x_train[:,3:])
# transform() on the test set: reuse the statistics learned from the training
# set instead of recomputing them, so the test rows are scaled with the same
# formula and do not influence the scaler.
x_test[:,3:] = sc.transform(x_test[:,3:]) 

In [19]:
# Training matrix after scaling, followed by a blank separator line.
print(x_train,'\n')
# Test matrix after scaling.
print(x_test)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]] 

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
