### Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [2]:
# Read the raw table from Data.csv (a path relative to the working directory).
dataset = pd.read_csv("Data.csv")
# A bare expression on the last line of a cell renders the DataFrame as a table.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Split the table positionally: every column except the last forms the
# feature matrix X, and the last column alone is the label vector Y.
# `.values` hands back plain NumPy arrays rather than pandas objects.
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [4]:
# Show the feature matrix and the labels, separated by one blank line.
print(X, Y, sep="\n\n")

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### Taking care of missing data

In [5]:
from sklearn.preprocessing import Imputer

In [6]:
imputer = Imputer(missing_values="NaN",strategy = 'mean',axis = 0)



In [7]:
#other possible strategies are 'median' and
#'most_frequent' (i.e. the mode)

In [8]:
# Fit the imputer on columns 1 and 2 of X (Age and Salary) only; column 0
# holds the country names and is not part of the slice.
imputer = imputer.fit(X[:,1:3])

In [9]:
# Write the imputed values back into the same two columns of X, then print
# the matrix to check that the nan entries have been filled in.
X[:,1:3] = imputer.transform(X[:,1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [10]:
#note that 38.77 and 63777.77 are
#the means of the respective columns


Handling categorical data: column 0 of the feature matrix and the labelled results

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Encoder for the country column of X; it is fitted in the next cell.
L_encdr_X = LabelEncoder()

In [13]:
# Replace the country names in column 0 with integer codes
# (in the output below: France -> 0, Germany -> 1, Spain -> 2).
X[:,0] = L_encdr_X.fit_transform(X[:,0])
# Wrap in a DataFrame only to get a tabular rendering of the array.
pd.DataFrame(X)

Unnamed: 0,0,1,2
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.8
5,0,35.0,58000.0
6,2,38.7778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


 NOTE: the first column is now encoded, but a new problem arises: the integer codes imply an ordering, e.g. that Spain (2) is somehow closer to Germany (1) than to France (0). This is not the case, as these are categorical variables; we only want to encode them, not assign them numerical values.

 Hence we will use dummy variables instead: the number of columns equals the largest label code (2 here) + 1, i.e. 3 columns, and each of them is binary (0/1) coded.

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
oneHtEncoder = OneHotEncoder(categorical_features= [0])

In [16]:
X = oneHtEncoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


 Note that the categorical data is now encoded using 3 binary columns.

 doing the same for Y(the labels) now 

In [18]:
# Separate encoder for the labels, kept apart from the one used on X.
L_encdr_Y = LabelEncoder()

In [19]:
# Encode the string labels as integers (in the output below: 'No' -> 0, 'Yes' -> 1).
Y = L_encdr_Y.fit_transform(Y)

In [21]:
# Tabular view of the encoded labels.
pd.DataFrame(Y)

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


 Note that there is no need to select columns here, as all of Y is categorical.

### Train-Test Set splitting

In [25]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for testing (2 of the 10 rows here, as the outputs
# below show); random_state pins the seed so the split is reproducible.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state =0)

###### random_state is the seed

A good train:test split ratio is around 0.7:0.3; the training share should generally not be smaller than that.

In [34]:
# Tabular view of the training features (before scaling).
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,40.0,63777.777778
1,1.0,0.0,0.0,37.0,67000.0
2,0.0,0.0,1.0,27.0,48000.0
3,0.0,0.0,1.0,38.777778,52000.0
4,1.0,0.0,0.0,48.0,79000.0
5,0.0,0.0,1.0,38.0,61000.0
6,1.0,0.0,0.0,44.0,72000.0
7,1.0,0.0,0.0,35.0,58000.0


In [31]:
# Tabular view of the held-out test features (before scaling).
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,30.0,54000.0
1,0.0,1.0,0.0,50.0,83000.0


In [32]:
# Tabular view of the training labels.
pd.DataFrame(Y_train)

Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
5,0
6,0
7,1


In [33]:
# Tabular view of the held-out test labels.
pd.DataFrame(Y_test)

Unnamed: 0,0
0,0
1,0


### Feature scaling

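Feature scaling is necessary because some machine learning models use Euclidean distance to gauge the "distance" between samples, so a feature with a large numeric range (e.g. Salary) would dominate one with a small range (e.g. Age). To avoid this, we transform the features onto the same scale.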

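Note that even when an algorithm doesn't use Euclidean distance, scaling can still help: gradient-based methods typically converge faster on a scaled feature matrix. (Tree-based methods such as decision trees split on one feature at a time, so their results do not depend on the scale of the features.)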

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
# One scaler for the feature matrix; it is fitted on the training rows in the next cell.
sc_X = StandardScaler()

In [39]:
# Fit the scaler on the training rows and standardise them in one step.
X_train = sc_X.fit_transform(X_train)

In [40]:
# Scale the test rows with the mean/std already learned from the training set.
# Re-fitting here (fit_transform) would standardise the test rows with their
# own statistics, putting them on a different scale from X_train and letting
# test-set information shape the preprocessing.
X_test = sc_X.transform(X_test)

In [41]:
# Tabular view of the training features after standardisation.
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4
0,-1.0,2.645751,-0.774597,0.263068,0.123815
1,1.0,-0.377964,-0.774597,-0.253501,0.461756
2,-1.0,-0.377964,1.290994,-1.975398,-1.530933
3,-1.0,-0.377964,1.290994,0.052614,-1.11142
4,1.0,-0.377964,-0.774597,1.640585,1.720297
5,-1.0,-0.377964,1.290994,-0.081312,-0.167514
6,1.0,-0.377964,-0.774597,0.951826,0.986148
7,1.0,-0.377964,-0.774597,-0.597881,-0.482149


Note that even the categorical (dummy) columns have been scaled here; whether to scale them depends on the context and on the kind of features being used.

google about scaling of categorical data for more info