## Data Processing Tool

In [1]:
# import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [7]:
# import the dataset
dataset = pd.read_csv('Data.csv')
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [12]:
# print except for the last column(Purchased)
x = dataset.iloc[:,:-1].values
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [10]:
# print the last column value only(Purchased)
y = dataset.iloc[:,-1].values
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### take care missing data with scikit-learn

In [20]:
# take care missing data (mean = avg)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')


In [21]:
# calculate the training data with imputer(defined above)
imputer.fit(x[:,1:3])

SimpleImputer()

In [22]:
# update the training data from fit()
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [23]:
# replace the NaN value to mean
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encode the categorical data(name, label data to numerical data) 
- One-Hot encoding for Country column 
- Label encoding for Salary column(label data)
> Label data = Target data (the one we want to predict!!!)


- Do Calculation&Update with fit_transform()
> fit_transform() : only applicable to training data (test data X)

In [42]:
# encode the independent variable(Country column)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
column_transformer = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')


> remainder='passthrough' 

this parameter is used in ColumnTransformer and it keeps the untransformed columns after finishing transformation 
In this case : column[0] is transformed but column[1:-1] are not changede (remained same data format)  

In [43]:
# the values changed to one-hot encoding by OneHotEncoder()
x = np.array(column_transformer.fit_transform(x))
print(x)

[[1.0 0.0 1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


In [44]:
# encode the dependent variable(Salary column)
# label data(yes or no) to numerical data
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()


In [45]:
# yes = 1, no = 0
y = label_encoder.fit_transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Split the dataset into the Training set and Test set

In [48]:
# test size = 0.2 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state =1)


In [47]:
print(x_train)

[[0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 35.0 58000.0]]


In [52]:
print(x_test)

[[0.0 1.0 0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


In [53]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [54]:
print(y_test)

[0 1]


### Feature Scaling
- do Stardardization which has normalization(0 to 1)

In [58]:
from sklearn.preprocessing import StandardScaler
standard_scalar = StandardScaler()


In [59]:
x_train = standard_scalar.fit_transform(x_train)
print(x_train)

[[-0.77459667  0.77459667 -0.77459667  0.77459667 -0.77459667 -0.57735027
   1.29099445 -0.19159184 -1.07812594]
 [-0.77459667  0.77459667 -0.77459667  0.77459667 -0.77459667  1.73205081
  -0.77459667 -0.01411729 -0.07013168]
 [ 1.29099445 -1.29099445  1.29099445 -1.29099445  1.29099445 -0.57735027
  -0.77459667  0.56670851  0.63356243]
 [-0.77459667  0.77459667 -0.77459667  0.77459667 -0.77459667 -0.57735027
   1.29099445 -0.30453019 -0.30786617]
 [-0.77459667  0.77459667 -0.77459667  0.77459667 -0.77459667 -0.57735027
   1.29099445 -1.90180114 -1.42046362]
 [ 1.29099445 -1.29099445  1.29099445 -1.29099445  1.29099445 -0.57735027
  -0.77459667  1.14753431  1.23265336]
 [-0.77459667  0.77459667 -0.77459667  0.77459667 -0.77459667  1.73205081
  -0.77459667  1.43794721  1.57499104]
 [ 1.29099445 -1.29099445  1.29099445 -1.29099445  1.29099445 -0.57735027
  -0.77459667 -0.74014954 -0.56461943]]


In [60]:
x_test = standard_scalar.transform(x_test)
print(x_test)

[[ 2.77555756e-17  1.00000000e+00  2.77555756e-17  1.00000000e+00
   2.77555756e-17  1.00000000e+00  0.00000000e+00  3.00000000e+01
   5.40000000e+04]
 [ 1.00000000e+00 -2.77555756e-17  1.00000000e+00 -2.77555756e-17
   1.00000000e+00  4.16333634e-17  0.00000000e+00  3.70000000e+01
   6.70000000e+04]]
