In [44]:
# Loading the data

import pandas as pd

fulldata_df = pd.read_csv('online_store_customer_data.csv')

In [45]:
# Getting Shape

file_shape = fulldata_df.shape

file_shape

(2512, 11)

In [46]:
# Describing Dataframe

file_description = fulldata_df.describe()

file_description

Unnamed: 0,Transaction_ID,Age,Referal,Amount_spent
count,2512.0,2470.0,2357.0,2270.0
mean,152443.931131,46.637652,0.6521,1418.422577
std,724.580482,18.186277,0.476405,878.507451
min,151200.0,15.0,0.0,2.09
25%,151815.75,32.0,0.0,678.1925
50%,152443.5,47.0,1.0,1341.435
75%,153071.25,62.0,1.0,2038.1025
max,153699.0,78.0,1.0,2999.98


In [47]:
# Finding out columns, their data types and number of NON NULL values in each column

file_info = fulldata_df.info()

file_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction_date  2512 non-null   object 
 1   Transaction_ID    2512 non-null   int64  
 2   Gender            2484 non-null   object 
 3   Age               2470 non-null   float64
 4   Marital_status    2512 non-null   object 
 5   State_names       2512 non-null   object 
 6   Segment           2512 non-null   object 
 7   Employees_status  2486 non-null   object 
 8   Payment_method    2512 non-null   object 
 9   Referal           2357 non-null   float64
 10  Amount_spent      2270 non-null   float64
dtypes: float64(3), int64(1), object(7)
memory usage: 216.0+ KB


In [68]:
# Finding out unique values and their frequency in each column

file_unique_values = fulldata_df[['Gender','Marital_status','Payment_method']].value_counts()

file_unique_values

Gender  Marital_status  Payment_method
Female  Married         PayPal            359
Male    Married         PayPal            303
Female  Single          PayPal            255
        Married         Card              243
Male    Single          PayPal            239
        Married         Card              193
Female  Married         Other             186
        Single          Card              186
Male    Married         Other             172
Female  Single          Other             127
Male    Single          Card              117
                        Other             104
Name: count, dtype: int64

In [49]:
# Finding out missing values in data set

#Using isna()

file_missing_values = fulldata_df.isna().sum()

file_missing_values

Transaction_date      0
Transaction_ID        0
Gender               28
Age                  42
Marital_status        0
State_names           0
Segment               0
Employees_status     26
Payment_method        0
Referal             155
Amount_spent        242
dtype: int64

In [50]:
# Using isnull()

file_missing_values = fulldata_df.isnull().sum()

file_missing_values

Transaction_date      0
Transaction_ID        0
Gender               28
Age                  42
Marital_status        0
State_names           0
Segment               0
Employees_status     26
Payment_method        0
Referal             155
Amount_spent        242
dtype: int64

Handling missing values in the common task in the data pre-processing part. For many reasons most
of the time we will encounter missing values. Without dealing with this we can't do the proper
model building. You have already find out the missing value count in Task 2. Now, we decided how to
handle them. We can handle this by removing affected columns or rows or replacing appropriate
values there.

In [51]:
# Dropping amount_spent column and rows modification

fulldata_df.drop(columns=['Amount_spent'],inplace=True)
fulldata_df


Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal
0,1/1/2019,151200,Female,19.0,Single,Kansas,Basic,Unemployment,Other,1.0
1,1/1/2019,151201,Male,49.0,Single,Illinois,Basic,self-employed,Card,0.0
2,1/1/2019,151202,Male,63.0,Married,New Mexico,Basic,workers,PayPal,1.0
3,1/1/2019,151203,,18.0,Single,Virginia,Platinum,workers,Card,1.0
4,1/1/2019,151204,Male,27.0,Single,Connecticut,Basic,self-employed,Card,0.0
...,...,...,...,...,...,...,...,...,...,...
2507,5/1/2021,153695,Female,57.0,Single,South Carolina,Platinum,self-employed,Card,0.0
2508,5/1/2021,153696,Female,36.0,Married,Hawaii,Silver,self-employed,PayPal,1.0
2509,5/1/2021,153697,Male,22.0,Single,South Carolina,Basic,workers,PayPal,1.0
2510,5/1/2021,153698,,44.0,Single,New York,Basic,Employees,PayPal,0.0


In [52]:
# Dropping Rows

fulldata_df.dropna(subset=['Employees_status'],inplace=True)
fulldata_df


Unnamed: 0,Transaction_date,Transaction_ID,Gender,Age,Marital_status,State_names,Segment,Employees_status,Payment_method,Referal
0,1/1/2019,151200,Female,19.0,Single,Kansas,Basic,Unemployment,Other,1.0
1,1/1/2019,151201,Male,49.0,Single,Illinois,Basic,self-employed,Card,0.0
2,1/1/2019,151202,Male,63.0,Married,New Mexico,Basic,workers,PayPal,1.0
3,1/1/2019,151203,,18.0,Single,Virginia,Platinum,workers,Card,1.0
4,1/1/2019,151204,Male,27.0,Single,Connecticut,Basic,self-employed,Card,0.0
...,...,...,...,...,...,...,...,...,...,...
2507,5/1/2021,153695,Female,57.0,Single,South Carolina,Platinum,self-employed,Card,0.0
2508,5/1/2021,153696,Female,36.0,Married,Hawaii,Silver,self-employed,PayPal,1.0
2509,5/1/2021,153697,Male,22.0,Single,South Carolina,Basic,workers,PayPal,1.0
2510,5/1/2021,153698,,44.0,Single,New York,Basic,Employees,PayPal,0.0


In [57]:
# Resetting the data

fulldata_df = pd.read_csv('online_store_customer_data.csv')

Impute/Replace Missing Values: Most of the time, we can’t afford to delete rows or columns. It’s
always better to replace missing values rather than deleting data. We will learn how to replace
missing values for both numeric and categorical features.

In [60]:
# Numeric Replace with 0 or Mean Value

mean_amount_spent = fulldata_df['Amount_spent'].mean()
mean_amount_spent

1418.4225770925111

In [61]:
# Replacing Null data with mean value

temp_df = fulldata_df['Amount_spent'].fillna(mean_amount_spent)
temp_df

0       2051.360000
1        544.040000
2       1572.600000
3       1199.790000
4       1418.422577
           ...     
2507     150.100000
2508     708.880000
2509    2030.070000
2510    1909.770000
2511    1073.150000
Name: Amount_spent, Length: 2512, dtype: float64

In [62]:
mean_age = fulldata_df['Age'].mean()
mean_age

46.63765182186235

In [63]:
temp_df = fulldata_df['Age'].fillna(mean_age)
temp_df

0       19.0
1       49.0
2       63.0
3       18.0
4       27.0
        ... 
2507    57.0
2508    36.0
2509    22.0
2510    44.0
2511    48.0
Name: Age, Length: 2512, dtype: float64

In [65]:
# Categorical Features

mode_emp = fulldata_df['Employees_status'].mode().iloc[0]
mode_emp

'Employees'

In [69]:
temp_df = fulldata_df['Employees_status'].fillna(mode_emp)
temp_df

0        Unemployment
1       self-employed
2             workers
3             workers
4       self-employed
            ...      
2507    self-employed
2508    self-employed
2509          workers
2510        Employees
2511          workers
Name: Employees_status, Length: 2512, dtype: object

In [70]:
gender_mode = fulldata_df['Gender'].mode().iloc[0]
gender_mode

'Female'

In [71]:
temp_df = fulldata_df['Gender'].fillna(gender_mode)
temp_df

0       Female
1         Male
2         Male
3       Female
4         Male
         ...  
2507    Female
2508    Female
2509      Male
2510    Female
2511      Male
Name: Gender, Length: 2512, dtype: object