# Data Cleaning and Preprocessing

In [22]:
import pandas as pd

# Location of the raw campaign export. Inside a raw string every backslash is
# literal, so each separator is written once; the earlier r"C:\\Users\\..."
# form put doubled separators into the actual path.
# NOTE(review): this is a machine-specific absolute path - consider a path
# relative to the notebook so the load also works on other machines.
DATA_PATH = r"C:\Users\saksh\Desktop\marketing_campaign.csv"

# Load dataset (tab-separated)
df = pd.read_csv(DATA_PATH, sep="\t")
print(df)

         ID  Year_Birth   Education Marital_Status   Income  Kidhome  \
0      5524        1957  Graduation         Single  58138.0        0   
1      2174        1954  Graduation         Single  46344.0        1   
2      4141        1965  Graduation       Together  71613.0        0   
3      6182        1984  Graduation       Together  26646.0        1   
4      5324        1981         PhD        Married  58293.0        1   
...     ...         ...         ...            ...      ...      ...   
2235  10870        1967  Graduation        Married  61223.0        0   
2236   4001        1946         PhD       Together  64014.0        2   
2237   7270        1981  Graduation       Divorced  56981.0        0   
2238   8235        1956      Master       Together  69245.0        0   
2239   9405        1954         PhD        Married  52869.0        1   

      Teenhome Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  \
0            0  04-09-2012       58       635  ...            

# Display initial info

In [23]:
# Report the frame's dimensions, then how many null entries each column holds.
missing_per_column = df.isnull().sum()
print("Initial Shape:", df.shape)
print("\nMissing values before cleaning:\n", missing_per_column)

Initial Shape: (2240, 29)

Missing values before cleaning:
 ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64


# Drop rows with missing values

In [24]:
# Keep only the rows in which every column has a value.
df = df.dropna(axis=0, how="any")
print("\nShape after dropping missing values:", df.shape)



Shape after dropping missing values: (2216, 29)


# Remove duplicates

In [9]:
# Discard rows that repeat an earlier row in every column; the first
# occurrence of each row is the one that stays.
df = df.drop_duplicates(keep="first")
print("\nShape after dropping duplicates:", df.shape)


Shape after dropping duplicates: (2240, 1)


# Standardize text columns

In [15]:
# Normalise the two categorical text columns: Education becomes Title Case,
# Marital_Status becomes UPPER CASE, and both lose surrounding whitespace.
df = df.assign(
    Education=df["Education"].str.title().str.strip(),
    Marital_Status=df["Marital_Status"].str.upper().str.strip(),
)

# Convert date columns to datetime format

In [16]:
# Parse Dt_Customer strings such as "04-09-2012" as day-month-year.
# An explicit format is strict: a value that does not match raises instead of
# being silently re-read month-first, which the non-strict `dayfirst=True`
# preference allows on older pandas versions.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format="%d-%m-%Y")

# Clean column names

In [17]:
# Normalise column labels to lower snake_case.
# Strip first: if spaces were replaced before stripping, leading/trailing
# padding would already have become "_" and would survive the strip.
# regex=False makes the space a literal match on every pandas version.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)


# Convert data types

In [18]:
# Columns that must hold numbers; any value that cannot be parsed as a number
# is turned into NaN (errors='coerce') rather than raising.
numeric_columns = ['income', 'kidhome', 'teenhome']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Drop rows again if type conversion introduced NaNs

In [19]:
# Remove any row that still contains a missing value in any column.
df = df.dropna(axis="index", how="any")

# Final shape and summary

In [20]:
# Summarise the cleaned frame: its dimensions and the dtype of each column.
final_dtypes = df.dtypes
print("\nFinal shape:", df.shape)
print("\nFinal data types:\n", final_dtypes)


Final shape: (2216, 29)

Final data types:
 id                              int64
year_birth                      int64
education                      object
marital_status                 object
income                        float64
kidhome                         int64
teenhome                        int64
dt_customer            datetime64[ns]
recency                         int64
mntwines                        int64
mntfruits                       int64
mntmeatproducts                 int64
mntfishproducts                 int64
mntsweetproducts                int64
mntgoldprods                    int64
numdealspurchases               int64
numwebpurchases                 int64
numcatalogpurchases             int64
numstorepurchases               int64
numwebvisitsmonth               int64
acceptedcmp3                    int64
acceptedcmp4                    int64
acceptedcmp5                    int64
acceptedcmp1                    int64
acceptedcmp2                    int64
compl

# Save cleaned dataset

In [21]:
# Write the cleaned frame to the current working directory; index=False keeps
# the row index out of the file.
df.to_csv(path_or_buf="marketing_campaign_cleaned.csv", index=False)
print("\nCleaned data saved to 'marketing_campaign_cleaned.csv'")


Cleaned data saved to 'marketing_campaign_cleaned.csv'
