## Importing the libraries

In [1]:


import pandas as pd 

import numpy as np 

import matplotlib.pyplot as plt


## Importing the dataset (note that -1 indexes the last column)


In [2]:

# Load the raw table from disk.
dataset = pd.read_csv('Data.csv')

# Split the frame column-wise: everything except the last column goes into x,
# the last column (index -1) goes into y.
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]

# Downstream cells work on plain arrays rather than pandas objects.
x = feature_frame.values
y = target_column.values



In [3]:
# Inspect the feature matrix taken from the CSV (every column except the last).
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Inspect the target vector (the last CSV column).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 are the numeric columns that contain NaN entries; column 0
# holds strings and is left alone.
numeric_cols = slice(1, 3)

# Replace each NaN with the mean of its column.
# NOTE(review): the means are learned from every row of x, including rows that
# later end up in the test split -- consider imputing after the split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, numeric_cols] = imputer.fit_transform(x[:, numeric_cols])

print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding Categorical Data

## Encoding the independent variable

In [6]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing  import OneHotEncoder

# One-hot encode the categorical column 0 and pass the remaining columns
# through unchanged. The encoded columns are placed first in the result.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder emits a sparse matrix,
# and with the default threshold (0.3) ColumnTransformer returns a sparse matrix
# whenever the stacked output is sparse enough (e.g. many categories). np.array()
# would then wrap that matrix in a 0-d object array instead of a 2-D table.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

x = np.array(ct.fit_transform(x))

print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the dependent variable

In [7]:
from sklearn.preprocessing import LabelEncoder

# Map the string class labels in y to integer codes ('No' -> 0, 'Yes' -> 1 in
# the output below). The fitted encoder stays available as `le`.
le = LabelEncoder()

y = le.fit_transform(y)

# Show the encoded target vector.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into training and test sets

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test split. random_state is fixed so the same
# rows are selected on every run.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [9]:
# Inspect the training-split features (encoded, not yet scaled).
print(train_x)

[[0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [10]:
# Inspect the test-split features (encoded, not yet scaled).
print(test_x)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 50.0 83000.0]]


In [11]:
# Inspect the training-split labels.
print(train_y)

[1 1 1 0 1 0 0 1]


In [12]:
# Inspect the test-split labels.
print(test_y)

[0 0]


## Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Only columns 3 onward are standardized; columns 0-2 (the one-hot indicator
# columns) are left as they are.
# The scaler is fitted on the training split only, and the statistics learned
# there are then applied to the test split.
scaled_train = sc.fit_transform(train_x[:, 3:])
train_x[:, 3:] = scaled_train

scaled_test = sc.transform(test_x[:, 3:])
test_x[:, 3:] = scaled_test


In [14]:
# Inspect the training features after standardizing columns 3 onward.
print(train_x)

[[0.0 1.0 0.0 0.2630675731713538 0.1238147854838185]
 [1.0 0.0 0.0 -0.25350147960148617 0.4617563176278856]
 [0.0 0.0 1.0 -1.9753983221776195 -1.5309334063940294]
 [0.0 0.0 1.0 0.05261351463427101 -1.1114197802841526]
 [1.0 0.0 0.0 1.6405850472322605 1.7202971959575162]
 [0.0 0.0 1.0 -0.08131179534387283 -0.16751412153692966]
 [1.0 0.0 0.0 0.9518263102018072 0.9861483502652316]
 [1.0 0.0 0.0 -0.5978808481167128 -0.48214934111933727]]


In [15]:
# Inspect the test features after applying the training-fitted scaler to columns 3 onward.
print(test_x)

[[0.0 1.0 0.0 -1.4588292694047795 -0.9016629672292141]
 [0.0 1.0 0.0 1.984964415747487 2.139810822067393]]
