### importing libraries

In [252]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### importing the dataset

In [253]:
# Read the raw data from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

# Preview the first five rows (5 is head()'s default) to sanity-check the load.
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


### choosing dependent and independent variables

In [254]:
# Feature matrix X: the three predictor columns, picked explicitly by name
# and converted from a DataFrame to its underlying array of values.
X = dataset.loc[:, ['Country', 'Age', 'Salary']].values

# Target y: the values of the single 'Purchased' column.
y = dataset.loc[:, 'Purchased'].values

In [255]:
# Positional alternative to selecting columns by name:
#   - X takes all rows and every column except the last one (:-1)
#   - y takes all rows and only the last column (-1)
# Running this cell overwrites any X and y assigned earlier.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [256]:
# Show the feature matrix followed by the target vector, one after the other.
print(X, y, sep='\n')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


#### Taking care of missing data

In [257]:
# Display the whole DataFrame so the missing (NaN) entries can be located.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Here we have missing data in the row with index 4 (the Salary value) and in the row with index 6 (the Age value).

To handle missing data we can either:

- Delete the rows that contain missing values.

- Replace each NaN value with the mean or median of its column (preferred).

In [258]:
# Option 1 (left disabled): delete every row that contains a missing value.
# It stays commented out because the next cell imputes the values instead.

# print(dataset.isnull().sum())
# dataset = dataset.dropna()

# dataset

In [259]:
from sklearn.impute import SimpleImputer

# Imputer that replaces every NaN with the mean of the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The slice 1:3 selects columns 1 and 2 of X (the numeric columns) in all rows;
# column 0 is left untouched. Learning the column means and filling the gaps
# is done in a single fit_transform call, and the result is written back
# into the same columns of X.
# NOTE(review): the means are learned from every row of X, and X is only split
# into training and test sets in a later cell, so the rows that end up in the
# test set contribute to the imputed values.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encoding categorical data

Encoding categorical data is a fundamental preprocessing step in machine learning: it converts categorical values (such as the country names here) into a numeric format suitable for training and evaluating models, so that the model can learn accurately and effectively from the categorical information.

##### Encoding the Independent variable

In [260]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Build a ColumnTransformer with a single step:
#   - 'encoder' is the name given to the step
#   - OneHotEncoder() turns each distinct category into its own 0/1 column
#   - [0] is the index of the column in X that gets encoded
# remainder='passthrough' keeps every other column unchanged; the passed-through
# columns are placed after the encoded ones in the output.
# sparse_threshold=0 forces a dense array as output. With the default (0.3) the
# result becomes a SciPy sparse matrix whenever its density drops below the
# threshold (e.g. a column with many categories), and the later cells that
# slice-assign into X_train / X_test expect a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the encoder on X and replace X with the encoded matrix.
# NOTE: this overwrites X, so re-running this cell alone would encode the
# already-encoded matrix; re-run from the cell that builds X instead.
X = ct.fit_transform(X)

print(X)


[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


##### Encoding the dependent variable

In [261]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct class labels present in y ...
le = LabelEncoder().fit(y)

# ... then replace each label in y with its integer code.
y = le.transform(y)

print(y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting the dataset into training set and test set

The main reason to apply feature scaling after splitting the dataset into training and testing sets is to **prevent data leakage** from the test set into the training process.

**Data Leakage**: 

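**You should scale your features after splitting the dataset into training and testing sets to avoid your testing data influencing how your model learns from your training data.** This separation ensures that your model evaluates its performance on data it hasn't seen before, which gives you a more realistic measure of how well it will work on new, unseen data in the future. The same reasoning applies to every preprocessing step that learns values from the data, such as the mean imputation performed earlier in this notebook: strictly speaking it should also be fitted on the training set only.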

In [262]:
from sklearn.model_selection import train_test_split

# Split features and target together so their rows stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=1,   # fixed seed: the same split on every run
)

In [263]:
# Training features produced by the split (still unscaled at this point)
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [264]:
# Test features produced by the split (still unscaled at this point)
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [265]:
# Encoded target values that belong to the training rows
print(y_train)

[0 1 0 0 1 1 0 1]


In [266]:
# Encoded target values that belong to the test rows
print(y_test)

[0 1]


### Feature Scaling

In [267]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the scaling parameters from the training rows only, using the columns
# from index 3 onwards. Columns 0-2 are left as they are.
sc.fit(X_train[:, 3:])

# Apply those training-set parameters to both splits, writing the scaled
# values back in place. The test set is only transformed, never fitted on.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])


In [268]:
# Training features after scaling the columns from index 3 onwards
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [269]:
# Test features after applying the scaler that was fitted on the training set
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
