### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

### Importing the dataset

In [2]:
dataset = pd.read_csv("Data.csv")

In [3]:
X = dataset.iloc[:, :-1].values
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [4]:
y = dataset.iloc[:, -1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Taking care of missing data

In [7]:
    help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(_BaseImputer)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, default='mean'
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along each column. Can be used wi

In [8]:

from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Feature Scaling

### Encoding Categorical Data

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
print(X)
print(X.shape)
print("\n")

onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
print(X)
print(X.shape)
print("\n")

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
print(y)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]
(10, 3)




TypeError: __init__() got an unexpected keyword argument 'categorical_features'

### Splitting the dataset into Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print(X_train)
print(X_train.shape)
print("\n")
print(X_test)
print(X_test.shape)
print("\n")
print(y_train)
print(y_train.shape)
print("\n")
print(y_test)
print(y_test.shape)

[[1 40.0 63777.77777777778]
 [0 37.0 67000.0]
 [2 27.0 48000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [2 38.0 61000.0]
 [0 44.0 72000.0]
 [0 35.0 58000.0]]
(8, 3)


[[1 30.0 54000.0]
 [1 50.0 83000.0]]
(2, 3)


['Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes']
(8,)


['No' 'No']
(2,)


In [12]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
print(X_train)
print(X_train.shape)
print("\n")
print(X_test)
print(X_test.shape)

[[ 0.13483997  0.26306757  0.12381479]
 [-0.94387981 -0.25350148  0.46175632]
 [ 1.21355975 -1.97539832 -1.53093341]
 [ 1.21355975  0.05261351 -1.11141978]
 [-0.94387981  1.64058505  1.7202972 ]
 [ 1.21355975 -0.0813118  -0.16751412]
 [-0.94387981  0.95182631  0.98614835]
 [-0.94387981 -0.59788085 -0.48214934]]
(8, 3)


[[ 0.13483997 -1.45882927 -0.90166297]
 [ 0.13483997  1.98496442  2.13981082]]
(2, 3)
