In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset from a CSV file given by relative path.
df = pd.read_csv('data_linereg.csv')

In [3]:
"""
> read data 
> basic data exploration 
> split dependent and independent variables 
> impute nan values using a strategy 
> categorical encoding using onehot, label, dummy encoding 
> train-test splitting 
> feature scaling - standard, normalize 
> MODELLING
"""

'\n> read data \n> basic data exploration \n> split dependent and independent variables \n> impute nan values using a strategy \n> categorical encoding using onehot, label, dummy encoding \n> train-test splitting \n> feature scaling - standard, normalize \n> MODELLING\n'

In [4]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# Preview the first three rows of the features and of the target.
X.head(3), y.head(3)

(   Country   Age   Salary
 0   France  44.0  72000.0
 1    Spain  27.0  48000.0
 2  Germany  30.0  54000.0,
 0     No
 1    Yes
 2     No
 Name: Purchased, dtype: object)

In [6]:
# Count the missing values in each feature column.
X.isna().sum()

Country    0
Age        1
Salary     1
dtype: int64

In [7]:
# Count the missing values in the target.
y.isna().sum()

0

In [8]:
# Re-extract features and target as NumPy arrays, replacing the DataFrame and
# Series previously bound to X and y, so that later cells can use positional
# slicing and in-place slice assignment.
# `to_numpy()` is used instead of `.values` because it always returns a NumPy
# ndarray, whereas `.values` may return a pandas extension array for some dtypes.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Replace NaNs in every column after the first with that column's mean.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later — confirm that this is intended.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:] = imputer.fit_transform(X[:, 1:])

In [11]:
# Show the first five rows after imputation.
X[:5]

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778]], dtype=object)

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
transformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = transformer.fit_transform(X)

In [14]:
# Show the first five rows after one-hot encoding.
X[:5]

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778]], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [16]:
# Show the first five encoded labels.
y[:5]

array([0, 1, 0, 0, 1])

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
from sklearn.preprocessing import StandardScaler

# Standardize the last two columns: fit on the training rows only, then apply
# the same fitted scaling to both the training and the test rows.
scaler = StandardScaler()
X_train[:, -2:] = scaler.fit_transform(X_train[:, -2:])
X_test[:, -2:] = scaler.transform(X_test[:, -2:])

In [19]:
# Display the scaled training and test feature arrays.
X_train, X_test

(array([[0.0, 1.0, 0.0, 0.2630675731713538, 0.1238147854838185],
        [1.0, 0.0, 0.0, -0.25350147960148617, 0.4617563176278856],
        [0.0, 0.0, 1.0, -1.9753983221776195, -1.5309334063940294],
        [0.0, 0.0, 1.0, 0.05261351463427101, -1.1114197802841526],
        [1.0, 0.0, 0.0, 1.6405850472322605, 1.7202971959575162],
        [0.0, 0.0, 1.0, -0.08131179534387283, -0.16751412153692966],
        [1.0, 0.0, 0.0, 0.9518263102018072, 0.9861483502652316],
        [1.0, 0.0, 0.0, -0.5978808481167128, -0.48214934111933727]],
       dtype=object),
 array([[0.0, 1.0, 0.0, -1.4588292694047795, -0.9016629672292141],
        [0.0, 1.0, 0.0, 1.984964415747487, 2.139810822067393]],
       dtype=object))

In [20]:
# Display the scaled test features on their own.
X_test

array([[0.0, 1.0, 0.0, -1.4588292694047795, -0.9016629672292141],
       [0.0, 1.0, 0.0, 1.984964415747487, 2.139810822067393]],
      dtype=object)