In [36]:
# Step 1: Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [37]:
# Step 2: Load the data set from the CSV file into a pandas DataFrame
dataset = pd.read_csv('Data.csv')

In [38]:
# iloc selects by position: iloc[rows, columns].
#   x -> all rows, every column except the last one (the features)
#   y -> all rows, the last column only (the target)
# .values turns each selection into a NumPy array.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [39]:
# Inspect the feature matrix; at this point it still contains nan entries.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [40]:
# Inspect the target vector (string labels).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Step 3: Taking care of Missing data

In [41]:
from sklearn.impute import SimpleImputer

In [42]:
# Mean imputation: every missing value (nan) is replaced with the mean of the
# observed values in the same column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Column 0 holds strings, so only the numeric columns (index 1 and 2) are
# handed to the imputer. fit_transform learns the column means and fills the
# gaps in one call; the result is written back into the same slice of x.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [43]:
# Inspect x again to confirm that the nan entries have been filled in.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [44]:
# y is shown again for reference; the imputation step above only touched x.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Step 4: Encoding Categorical Data
To encode independent variable

One-hot encoding -> replace a categorical column with one binary (0/1) column per category,
so that the algorithm does not mistake an arbitrary ordering of the categories for a feature.
So instead of treating the order of the countries as meaningful, each country gets its own column.

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [46]:
# Build a transformer that one-hot encodes column 0 and leaves the rest alone.
ct = ColumnTransformer(
    transformers=[
        # (step name, transformer to apply, indices of the columns to transform)
        ('encoder', OneHotEncoder(), [0]),
    ],
    # Keep the columns that are not listed above instead of dropping them.
    remainder='passthrough',
)

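transformers --> a list of (name, transformer, columns) tuples:
1. Name of the step - 'encoder' (a label for this transformation)
2. The transformer to apply - OneHotEncoder (one-hot encoding)
3. Index of the column you want to encode - [0], the country column
remainder='passthrough' --> keeps the columns that are not encoded; otherwise the output would have only the 3 one-hot columns.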

In [47]:
# Fit the column transformer and apply it in a single call, then make sure the
# result is held as a NumPy array.
# NOTE: x is rebound to the encoded data, so this cell is not idempotent --
# running it a second time would try to encode column 0 of the already-encoded
# array. Re-run from the data-loading cell instead.
encoded = ct.fit_transform(x)
x = np.array(encoded)

fit_transform --> fit and transform in a single call
The result of fit_transform is wrapped in np.array(...) so that x is a NumPy array again

In [48]:
# Inspect x after encoding: the first three columns are now the 0/1 indicator
# columns that replaced the original string column.
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

To encode the dependent variable
Label encoder --> converts the labels 'No' and 'Yes' to 0 and 1 respectively

In [49]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct labels in y, then replace each label with its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [50]:
# Inspect the target after label encoding (now integers instead of strings).
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

 Step 5: Splitting the data set into training and test sets
 Split as 80% training and 20% test

In [51]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; fixing random_state makes the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [52]:
# Inspect the training features.
x_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [53]:
# Inspect the training labels (same row order as x_train).
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [54]:
# Inspect the held-out test features.
x_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [55]:
# Inspect the held-out test labels (same row order as x_test).
y_test

array([0, 1])