@Author: Bhupinder Singh

@Date: 03-01-2023

@Last Modified by: Bhupinder Singh

@Last Modified date: 04-01-2023

@Title: Data Preprocessing

In [4]:
import pandas as pd
from data_log import get_logger
from csv import reader
from math import sqrt

# Logger shared by the cells below; get_logger is imported from data_log.
# NOTE(review): every message in the recorded outputs appears twice —
# presumably a handler was attached more than once (e.g. this cell was
# re-run in the same kernel); confirm against get_logger's implementation.
lg = get_logger(name="(dataPreprocessing)", file_name="df_log.log")

### 1. Handling Missing Data

In [5]:
# Load the raw CSV and show it before any imputation.
df = pd.read_csv('data_preprocessing.csv')
lg.info("df File before handling missing df")
display(df)

# Mean-impute the numeric columns: each NaN is replaced by its column's mean.
for numeric_col in ("Age", "Salary"):
    col_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(col_mean)

lg.info("df File After handling missing df")
display(df)

(dataPreprocessing) - 2023-01-13 13:10:06,854 - INFO - df File before handling missing df
(dataPreprocessing) - 2023-01-13 13:10:06,854 - INFO - df File before handling missing df


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


(dataPreprocessing) - 2023-01-13 13:10:06,865 - INFO - df File After handling missing df
(dataPreprocessing) - 2023-01-13 13:10:06,865 - INFO - df File After handling missing df


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### 2. Handling Categorical Data

One Hot Encoding

In [6]:
# One-hot encode 'Country': the column is replaced by one indicator column
# per distinct value (Country_France, Country_Germany, Country_Spain).
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# emits booleans (True/False) instead.
# Note: this cell consumes the 'Country' column, so re-running it without
# first re-running the load cell fails because the column no longer exists.
df = pd.get_dummies(df, columns=['Country'], dtype=int)
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,No,1,0,0
1,27.0,48000.0,Yes,0,0,1
2,30.0,54000.0,No,0,1,0
3,38.0,61000.0,No,0,0,1
4,40.0,63777.777778,Yes,0,1,0
5,35.0,58000.0,Yes,1,0,0
6,38.777778,52000.0,No,0,0,1
7,48.0,79000.0,Yes,1,0,0
8,50.0,83000.0,No,0,1,0
9,37.0,67000.0,Yes,1,0,0


Binary Encoding

In [7]:
def binary_encode(df_frame, col_name):
    """Encode a Yes/No column as a list of 1/0 integers.

    Args:
        df_frame: Mapping-like container (e.g. a DataFrame) indexable by
            ``col_name``.
        col_name: Key of the column whose values are encoded.

    Returns:
        A list with one entry per value in ``df_frame[col_name]``: 1 where
        the value equals the exact string 'Yes', 0 for anything else
        (including missing values and differently-cased strings).
    """
    return [1 if value == 'Yes' else 0 for value in df_frame[col_name]]

In [8]:
# Encode while the text column still exists, then swap it for the 0/1 version
# at column position 2.
# Caution: binary_encode maps everything except 'Yes' to 0, so running this
# cell a second time on the already-encoded column turns every label into 0.
purchased_list = binary_encode(df, 'Purchased')
del df['Purchased']
df.insert(2, "Purchased", purchased_list)

In [9]:
# Display the frame to confirm 'Purchased' now holds 0/1 values.
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


### 3. Feature Scaling

In [10]:
# Standardise the listed features to zero mean and unit variance (z-score).
# NOTE(review): the mean and standard deviation are computed on the whole
# frame, before the train/test split performed further down, so the test rows
# influence the scaling — confirm this is acceptable.
columns = ['Age', 'Salary']
for col in columns:
    # Announce a column only when it is really scaled, so that skipped
    # (non-numeric) columns are never reported as scaled.
    if pd.api.types.is_numeric_dtype(df[col]):
        print("Scaling the ", col)
        # Series.std() defaults to the sample standard deviation (ddof=1).
        df[col] = (df[col] - df[col].mean()) / df[col].std()
    else:
        print("Skipping non-numeric column ", col)
print("Train df")
df.head(5)

Scaling the  Age
Scaling the  Salary
Train df


Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,0.719931,0.7110128,0,1,0,0
1,-1.623675,-1.364376,1,0,0,1
2,-1.210098,-0.8455287,0,0,1,0
3,-0.107224,-0.240207,0,0,0,1
4,0.168495,6.29185e-16,1,0,1,0


### 4. Split the dataset into training set and test set

In [11]:
# Fraction of rows assigned to the training set; the rest form the test set.
ratio = 0.80

total_rows = len(df)
train_size = int(ratio * total_rows)

# Ordered (unshuffled) split: the leading rows train, the trailing rows test.
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Log and show the training rows.
lg.info("Train dfframe")
display(train)

# Log and show the test rows.
lg.info("Test dfframe")
display(test)

train.shape, test.shape

(dataPreprocessing) - 2023-01-13 13:10:07,079 - INFO - Train dfframe
(dataPreprocessing) - 2023-01-13 13:10:07,079 - INFO - Train dfframe


Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,0.719931,0.7110128,0,1,0,0
1,-1.623675,-1.364376,1,0,0,1
2,-1.210098,-0.8455287,0,0,1,0
3,-0.107224,-0.240207,0,0,0,1
4,0.168495,6.29185e-16,1,0,1,0
5,-0.520801,-0.4996306,1,1,0,0
6,0.0,-1.018478,0,0,0,1
7,1.271368,1.316334,1,1,0,0


(dataPreprocessing) - 2023-01-13 13:10:07,085 - INFO - Test dfframe
(dataPreprocessing) - 2023-01-13 13:10:07,085 - INFO - Test dfframe


Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
8,1.547087,1.662233,0,0,1,0
9,-0.245083,0.27864,1,1,0,0


((8, 6), (2, 6))