In [4]:
#Step 1: Loading the dataset
import pandas as pd
dataset = pd.read_excel('age_salary.xlsx')
dataset

Unnamed: 0,age,salary
0,25.0,35000.0
1,27.0,40000.0
2,50.0,54000.0
3,35.0,
4,40.0,60000.0
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,


In [24]:
#S2: Classifying the dependent and Independent Variables
#X : independent variable set
X = dataset.iloc[:,:-1].values #Takes all rows of all columns except the last column
#Y : dependent variable set
Y = dataset.iloc[:,-1].values # Takes all rows of the last column
Y

array([35000., 40000., 54000.,    nan, 60000., 58000., 52000., 79000.,
       83000.,    nan, 24000., 60000., 70000.])

In [25]:
#S3: Dealing with Missing Data
from sklearn.impute import SimpleImputer
import numpy as np
#use the default strategy for filling missing values which is the mean
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
#The fit_transform() method will fit the imputer object and then transforms the arrays.
X = imp.fit_transform(X)
#The imputer can not be applied on 1D arrays and since Y is a 1D array, it needs to be converted to a compatible shape.
Y = Y.reshape(-1,1)
Y = imp.fit_transform(Y)
Y = Y.reshape(-1)
X

array([[25.        ],
       [27.        ],
       [50.        ],
       [35.        ],
       [40.        ],
       [35.        ],
       [39.18181818],
       [48.        ],
       [50.        ],
       [37.        ],
       [21.        ],
       [39.18181818],
       [63.        ]])

In [26]:
#S4: Dealing with Categorical Data
dataset = pd.read_excel("dataset.xlsx")
dataset

Unnamed: 0,nation,purchased_item,age,salary
0,India,No,25.0,35000.0
1,Russia,Yes,27.0,40000.0
2,Germany,No,50.0,54000.0
3,Russia,No,35.0,
4,Germany,Yes,40.0,60000.0
5,India,Yes,35.0,58000.0
6,Russia,No,,52000.0
7,India,Yes,48.0,79000.0
8,Germany,No,50.0,83000.0
9,India,Yes,37.0,


In [31]:
X = dataset.iloc[:,[0,2,3]].values
Y = dataset.iloc[:,1].values
X

array([['India', 25.0, 35000.0],
       ['Russia', 27.0, 40000.0],
       ['Germany', 50.0, 54000.0],
       ['Russia', 35.0, nan],
       ['Germany', 40.0, 60000.0],
       ['India', 35.0, 58000.0],
       ['Russia', nan, 52000.0],
       ['India', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['India', 37.0, nan],
       ['Germany', 21.0, 24000.0],
       ['India', nan, 60000.0],
       ['Russia', 63.0, 70000.0]], dtype=object)

In [44]:
#This dataset has 3 countries listed. Dummy variables are the solution. Using one hot encoding we will create a dummy variable 
#for each of the category in the column. And uses binary encoding for each dummy variable.
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
le_X = LabelEncoder()
X[:,0] = le_X.fit_transform(X[:,0])
ohe_X = OneHotEncoder()
X = ohe_X.fit_transform(X).toarray()
X

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [0., 1., 1., ..., 1., 1., 0.],
       ...,
       [0., 1., 1., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [33]:
Y = le_X.fit_transform(Y)
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [48]:
#S5: Splitting the Dataset into Training and Testing sets
from sklearn.model_selection import train_test_split
'''
test_size: the desired size of the test_set. 0.3 denotes 30%.
random_state:  This is used to preserve the uniqueness. The split will happen uniquely for a random_state.
'''
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)
Y_train

array([0, 0, 1, 1, 1, 0, 0, 1, 0])

In [50]:
#S6: Scaling the features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Y_train = Y_train.reshape((len(Y_train), 1))
Y_train = sc.fit_transform(Y_train)
Y_train = Y_train.ravel()
X_train

array([[-1.87082869,  1.87082869,  1.87082869, ...,  0.53452248,
         0.53452248, -0.53452248],
       [-1.87082869,  1.87082869,  1.87082869, ...,  0.53452248,
         0.53452248, -0.53452248],
       [ 0.53452248, -0.53452248, -0.53452248, ...,  0.53452248,
         0.53452248, -0.53452248],
       ...,
       [ 0.53452248, -0.53452248, -0.53452248, ...,  0.53452248,
         0.53452248, -0.53452248],
       [ 0.53452248, -0.53452248, -0.53452248, ...,  0.53452248,
         0.53452248, -0.53452248],
       [ 0.53452248, -0.53452248, -0.53452248, ...,  0.53452248,
         0.53452248, -0.53452248]])