In [1]:
# libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset

In [2]:
# `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22; `SimpleImputer` is its
# replacement. Prefer it (under the name the rest of the notebook uses) and fall back to the
# legacy class only on older installations.
try:
    from sklearn.impute import SimpleImputer as Imputer # used for handling missing data
except ImportError:
    from sklearn.preprocessing import Imputer # used for handling missing data

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data

In [4]:
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling


In [5]:
# Import the dataset from the CSV file (path relative to the working directory) into a
# DataFrame, then preview its first rows.
dataset = pd.read_csv('DataPreprocessing.csv') # to import the dataset into a variable
dataset.head()

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes


In [6]:
# Summarise column dtypes and non-null counts; columns with fewer non-null values than rows contain missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Region            10 non-null object
Age               9 non-null float64
Income            9 non-null float64
Online Shopper    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [7]:
# Split the columns into independent attributes (features) and the dependent attribute (class).
# `.values` returns NumPy arrays; X comes out as an object array here because it mixes the
# Region strings with the numeric columns.
X = dataset.iloc[:, :-1].values # every column except the last: attributes used to determine the class
Y = dataset.iloc[:, -1].values # last column only: dependent variable / class

In [8]:
# Mean imputation: entries equal to NaN will be replaced by the mean of their column.
imputer = Imputer(strategy='mean', missing_values=np.nan)

In [9]:
# Learn the column means from the numeric columns only (column 0 holds the Region strings).
# NOTE(review): the means are learned from every row, before the train/test split further
# down, so test rows influence the imputed values - confirm this is acceptable.
imputer = imputer.fit(X[:, 1:])

In [10]:
# Inspect the feature matrix before imputation (missing entries still show as nan).
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, nan],
       ['India', 40.0, 69600.0],
       ['Brazil', nan, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [11]:
# Inspect the target values (still the raw 'Yes'/'No' strings at this point).
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [12]:
# Overwrite the numeric columns in place, replacing each NaN with the mean learned by `imputer.fit`.
X[:, 1:] = imputer.transform(X[:, 1:])

In [13]:
# Missing-data handling: the NaN entries in the numeric columns were replaced above with the mean of the other values in the same column.

In [14]:
# Inspect the feature matrix after imputation (the former nan entries now hold column means).
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, 76533.33333333333],
       ['India', 40.0, 69600.0],
       ['Brazil', 43.77777777777778, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [15]:
# Encoder used to map the Region strings in column 0 of X to integer codes.
labelencoder_X = LabelEncoder()

In [16]:
from sklearn.compose import ColumnTransformer # takes over column selection from OneHotEncoder

# Integer-code the Region strings in column 0 (kept so that `labelencoder_X` stays fitted).
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# One-hot encode column 0 only. `OneHotEncoder(categorical_features=[0])` was deprecated in
# scikit-learn 0.20 and removed in 0.22; choosing which columns to encode is now done with a
# ColumnTransformer:
#   - remainder='passthrough' keeps the other (numeric) columns, appended after the dummy
#     columns, which is the same column order the old call produced
#   - sparse_threshold=0 always returns a dense array, standing in for the former `.toarray()`
onehotencoder = ColumnTransformer(
    transformers=[('region', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# X is an object array, so the stacked result is cast to float explicitly.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

In [17]:
# Inspect the feature matrix after encoding: one-hot Region columns first, then the numeric columns.
X

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.90000000e+01,
        8.64000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.20000000e+01,
        5.76000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.50000000e+01,
        6.48000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.30000000e+01,
        7.32000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 4.50000000e+01,
        7.65333333e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.96000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.37777778e+01,
        6.24000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.30000000e+01,
        9.48000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 5.50000000e+01,
        9.96000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.20000000e+01,
        8.04000000e+04]])

In [18]:
# Inspect the label-encoded target.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [19]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [20]:
# Inspect the training features produced by the split.
X_train

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 4.50000000e+01,
        7.65333333e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.20000000e+01,
        8.04000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.20000000e+01,
        5.76000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.37777778e+01,
        6.24000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.30000000e+01,
        9.48000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.30000000e+01,
        7.32000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.90000000e+01,
        8.64000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.96000000e+04]])

In [21]:
# Feature scaling: standardize every column using statistics taken from the training rows
# only, then apply those same statistics to the test rows.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [22]:
# Inspect the standardized training features.
X_train

array([[-0.77459667, -1.        ,  2.64575131,  0.26306757,  0.12381479],
       [-0.77459667,  1.        , -0.37796447, -0.25350148,  0.46175632],
       [ 1.29099445, -1.        , -0.37796447, -1.97539832, -1.53093341],
       [ 1.29099445, -1.        , -0.37796447,  0.05261351, -1.11141978],
       [-0.77459667,  1.        , -0.37796447,  1.64058505,  1.7202972 ],
       [ 1.29099445, -1.        , -0.37796447, -0.0813118 , -0.16751412],
       [-0.77459667,  1.        , -0.37796447,  0.95182631,  0.98614835],
       [-0.77459667,  1.        , -0.37796447, -0.59788085, -0.48214934]])