### Customer Personality Analysis - Dataset Cleaning

Importing Libraries

In [7]:
import pandas as pd 
import numpy as np 

Load the Dataset

In [8]:
# The file carries a .csv extension, but its fields are tab-separated.
df = pd.read_csv('marketing_campaign.csv', delimiter='\t')

# Show the first five rows as a quick sanity check of the parse.
df.head(5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


Explore Basic Info

In [11]:
# Report the dimensions, then the per-column dtype / non-null summary.
print("Dataset Shape: ", df.shape)
print("\nDataset Info: ")
df.info()

# Tally the missing entries in each column (isna is the canonical name of isnull).
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

Dataset Shape:  (2240, 29)

Dataset Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  


Handle Missing Values

In [12]:
# Keep only the rows that have an 'Income' value; rows where it is missing are discarded.
has_income = df['Income'].notna()
df = df.loc[has_income]

Remove Duplicate Rows

In [14]:
# Flag every row that exactly repeats an earlier row (first occurrence is kept).
# NOTE(review): the comparison spans all columns, including ID; if IDs are unique
# per row this check can never flag anything -- confirm whether duplicates should
# be judged on the non-ID columns instead.
duplicate_mask = df.duplicated()
print("Duplicate Rows:", duplicate_mask.sum())

# Retain only the rows that were not flagged.
df = df.loc[~duplicate_mask]

Duplicate Rows: 0


Standardize Text Columns

In [15]:
# Standardize 'Education' and 'Marital_Status'
#
# Both label columns are trimmed of surrounding whitespace and title-cased so
# that spelling variants collapse into a single category.
#
# str.title() capitalizes the first letter of each word and lowercases the
# rest, which rewrites the abbreviation "PhD" as "Phd". The mapping below
# restores such labels after title-casing; extend it if further abbreviations
# turn up in the data.
TITLE_CASE_FIXES = {'Phd': 'PhD'}

# assign() builds a new frame instead of writing into `df` column by column,
# so this cell cannot trip SettingWithCopyWarning on the filtered frame.
df = df.assign(
    Education=df['Education'].str.strip().str.title().replace(TITLE_CASE_FIXES),
    Marital_Status=df['Marital_Status'].str.strip().str.title(),
)

Convert Date Columns

In [16]:
# Parse the day-month-year strings (e.g. 04-09-2012) into datetime values.
parsed_dates = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')
df['Dt_Customer'] = parsed_dates

Rename Columns 

In [17]:
# Normalize every column label: trim whitespace, lowercase, spaces -> underscores.
# Cells that refer to the original capitalized labels must run before this one.
normalized_names = [name.strip().lower().replace(" ", "_") for name in df.columns]
df.columns = normalized_names

Verify Data Types

In [21]:
# Print the dtype of each column so the effect of the cleaning steps can be checked.
column_types = df.dtypes
print("\nUpdated Data Types:")
print(column_types)


Updated Data Types:
id                              int64
year_birth                      int64
education                      object
marital_status                 object
income                        float64
kidhome                         int64
teenhome                        int64
dt_customer            datetime64[ns]
recency                         int64
mntwines                        int64
mntfruits                       int64
mntmeatproducts                 int64
mntfishproducts                 int64
mntsweetproducts                int64
mntgoldprods                    int64
numdealspurchases               int64
numwebpurchases                 int64
numcatalogpurchases             int64
numstorepurchases               int64
numwebvisitsmonth               int64
acceptedcmp3                    int64
acceptedcmp4                    int64
acceptedcmp5                    int64
acceptedcmp1                    int64
acceptedcmp2                    int64
complain                     

Save Cleaned Data

In [20]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
output_path = 'marketing_campaign_cleaned.csv'
df.to_csv(output_path, index=False)

print("Cleaned data saved successfully!")

Cleaned data saved successfully!
