In [17]:
import pandas as pd
import numpy as np

In [None]:
# Use-case (pre-process_datasample.csv)
# Shopping mall owner
# You need to create a model that can predict whether a customer will purchase the product from my website based on
# his/her location, salary and age
# Features --- Input to your ML Model
# Label ----- Predicted output generated by Model
# Features ---> Country , Age, Salary
# Label ------> Purchased

In [2]:
# Ensure your dataset is in compliance with the ML algo requirement (Features)
# 1. Your data must be complete
# 2. Your data must be strictly numeric
# 3. Your data must be in the form of numpy array

In [11]:
# Read the sample CSV (path is relative to the working directory) into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer='pre-process_datasample.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Handle the missing data scenario
# Missing data is handled using the following techniques
# 1. Delete the specific records that contain missing values
# 2. Check the datatype of the column with missing data (imputation is possible only when the data is numeric)
#     if coldata is non-numeric, delete that record
#     if coldata is numeric, perform Imputation

# Imputation is the process of replacing missing values with a statistical mean, median or mode, chosen according to
# the domain perspective.

In [12]:
# Check whether any data is missing: info() prints the non-null count per column, so a count lower than the
# number of entries marks a column with missing values
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [6]:
# From the info output I can infer that Age has one missing value and Salary has one missing value
# Perform imputation on the Age column and the Salary column
# Type of data Age - continuous numeric ---- mean
# Type of data Salary - continuous numeric --- mean

In [8]:
# Mean of the Age column (missing entries are skipped by default)
data['Age'].mean()

38.77777777777778

In [13]:
# Series.fillna(value) returns a copy in which every NaN is replaced by `value`.
# The result is assigned back to the column instead of calling
# data['Age'].fillna(..., inplace=True): the in-place form acts on an intermediate
# object and does not update `data` when pandas Copy-on-Write is active.
# int() truncates the mean (38.78 -> 38) so the imputed age is a whole number.
data['Age'] = data['Age'].fillna(int(data['Age'].mean()))
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [14]:
# Impute the missing Salary with the column mean.
# The filled Series is assigned back to the column rather than using inplace=True
# on data['Salary'], which acts on an intermediate object and does not update
# `data` when pandas Copy-on-Write is active.
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
# The columns given below are non-numeric
# Let's handle them
# Country & Purchased
# Type of Data --- Country ----  Categorical
# The process to handle Categorical Features is called Encoding ---- Dummy Variable Creation !!!!

In [21]:
# One-hot encode Country: get_dummies builds one indicator column per distinct
# country, and concat(axis=1) appends those columns to the right of the original ones.
# dtype=int pins the indicators to 0/1 integers; newer pandas releases default to
# booleans, which would turn a mixed float/indicator feature array into object dtype.
dataFinal = pd.concat([data, pd.get_dummies(data['Country'], dtype=int)], axis=1)
dataFinal

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,1,0,0
1,Spain,27.0,48000.0,Yes,0,0,1
2,Germany,30.0,54000.0,No,0,1,0
3,Spain,38.0,61000.0,No,0,0,1
4,Germany,40.0,63777.777778,Yes,0,1,0
5,France,35.0,58000.0,Yes,1,0,0
6,Spain,38.0,52000.0,No,0,0,1
7,France,48.0,79000.0,Yes,1,0,0
8,Germany,50.0,83000.0,No,0,1,0
9,France,37.0,67000.0,Yes,1,0,0


In [22]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned back; otherwise the text column Country stays in
# dataFinal and ends up in anything built from it.
# errors='ignore' keeps this cell safe to re-run once Country is already gone.
dataFinal = dataFinal.drop(['Country'], axis=1, errors='ignore')
dataFinal

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,No,1,0,0
1,27.0,48000.0,Yes,0,0,1
2,30.0,54000.0,No,0,1,0
3,38.0,61000.0,No,0,0,1
4,40.0,63777.777778,Yes,0,1,0
5,35.0,58000.0,Yes,1,0,0
6,38.0,52000.0,No,0,0,1
7,48.0,79000.0,Yes,1,0,0
8,50.0,83000.0,No,0,1,0
9,37.0,67000.0,Yes,1,0,0


In [24]:
# Select the feature columns by name rather than by position. Positional indices
# [0,1,3,4,5] silently pick the wrong columns whenever the layout differs from what
# was assumed (with Country still present they select Country and Purchased and
# skip Salary and Spain, leaking the label into the features).
# Selecting by name raises a KeyError instead if an expected column is missing.
features = dataFinal[['Age', 'Salary', 'France', 'Germany', 'Spain']].values  # .values converts the DataFrame to a numpy array
type(features)

numpy.ndarray

In [29]:
# Label ----> Result / Response variable / Dependent variable
# If label is categorical, just replace values with 0,1,2,.... (whole numbers)
# If label is numerical, no processing is needed
# If label has NaN, simply delete that record
# If label is a string with no sense, then it is not a label !!! ;) Technically you may be dealing with
# Unsupervised Learning
#
# Encoding chosen by the ML engineer:
#   Yes == 1
#   No  == 0
# A single map() with an explicit dictionary produces an integer Series directly, instead of relying on
# replace() to silently downcast the object column after the strings are swapped out.
# Any value other than 'Yes' / 'No' becomes NaN.
label = dataFinal['Purchased'].map({'Yes': 1, 'No': 0})
label = label.values
label

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [30]:
# Distinct values of the original Purchased column (dataFinal itself was not modified by the label encoding)
dataFinal['Purchased'].unique()

array(['No', 'Yes'], dtype=object)