In [4]:
import pandas as pd
import numpy as np

df=pd.read_csv("marketing_campaign_data_messy.csv").head()

df


Unnamed: 0,Campaign_ID,Campaign_Name,Start_Date,End_Date,Channel,Impressions,Clicks,Spend,Conversions,Active,Clicks.1,Campaign_Tag
0,CMP-00001,Q4_Summer_CMP-00001,2023-11-24 00:00:00,2023-12-13,TikTok,16795,197,$102.82,20.0,Y,,TI
1,CMP-00002,Q1_Launch_CMP-00002,2023-05-06 00:00:00,2023-05-12,Facebook,1860,30,24.33,1.0,0,,FA
2,CMP-00003,Q3_Winter_CMP-00003,2023-12-13 00:00:00,2023-12-20,Email,77820,843,1323.39,51.0,No,,EM
3,CMP-00004,Q1_BlackFriday_CMP-00004,2023-10-30,2023-11-03,TikTok,55886,2019,2180.38,135.0,True,,TI
4,CMP-00005,Q2_Winter_CMP-00005,2023-04-22 00:00:00,2023-04-23,Facebook,7265,169,252.44,30.0,Yes,,FA


In [5]:
#-------------------------------------------
# Step 1 : Cleaning Headers and Column names
#-------------------------------------------

print(df.columns.to_list())

df.columns=df.columns.str.strip().str.lower().str.replace(' ','_')

print("Fix")

df.columns=df.columns.to_list()

print(df.columns.to_list())


[' Campaign_ID ', 'Campaign_Name', 'Start_Date', 'End_Date', 'Channel', 'Impressions', 'Clicks ', 'Spend', 'Conversions', 'Active', 'Clicks', 'Campaign_Tag']
Fix
['campaign_id', 'campaign_name', 'start_date', 'end_date', 'channel', 'impressions', 'clicks', 'spend', 'conversions', 'active', 'clicks', 'campaign_tag']


In [6]:
df.head()


Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,impressions,clicks,spend,conversions,active,clicks.1,campaign_tag
0,CMP-00001,Q4_Summer_CMP-00001,2023-11-24 00:00:00,2023-12-13,TikTok,16795,197,$102.82,20.0,Y,,TI
1,CMP-00002,Q1_Launch_CMP-00002,2023-05-06 00:00:00,2023-05-12,Facebook,1860,30,24.33,1.0,0,,FA
2,CMP-00003,Q3_Winter_CMP-00003,2023-12-13 00:00:00,2023-12-20,Email,77820,843,1323.39,51.0,No,,EM
3,CMP-00004,Q1_BlackFriday_CMP-00004,2023-10-30,2023-11-03,TikTok,55886,2019,2180.38,135.0,True,,TI
4,CMP-00005,Q2_Winter_CMP-00005,2023-04-22 00:00:00,2023-04-23,Facebook,7265,169,252.44,30.0,Yes,,FA


In [7]:
#-----------------------------------------------
#Type conversion and currency cleaning
#-----------------------------------------------


# Step 1: Convert to string
df['spend'] = df['spend'].astype(str)

# Step 2: Remove dollar sign and commas manually
df['spend'] = df['spend'].str.replace('$', '').str.replace(',', '')

# Step 3: Convert back to numbers
df['spend'] = pd.to_numeric(df['spend'])

print("after cleaning spend column :   ")
df['spend']

after cleaning spend column :   


0     102.82
1      24.33
2    1323.39
3    2180.38
4     252.44
Name: spend, dtype: float64

In [8]:
#-------------------------------------------
#step 3 :categorical Typos
#-------------------------------------------

print(df['channel'].unique())  # hence here it is not working due to data set 
#this block of code is available in demo.ipynb with the data set demo.csv

['TikTok' 'Facebook' 'Email']


In [9]:
#-------------------------------------------
# Step 4: Handling missed booleans
#-------------------------------------------

df['active'].unique()

clean_boolean={
    "yes":'True',
    "no":'False',
    "Y":'True',
    "N":'False',
    "N/A":'np.nan',
    '0':'False',
    '1':'True',
    'Yes':'True',
    'No':'False'
}

df['active']=df['active'].replace(clean_boolean)

df['active'].unique()
print("full data set after cleaning boolean : ")
df

full data set after cleaning boolean : 


Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,impressions,clicks,spend,conversions,active,clicks.1,campaign_tag
0,CMP-00001,Q4_Summer_CMP-00001,2023-11-24 00:00:00,2023-12-13,TikTok,16795,197,102.82,20.0,True,,TI
1,CMP-00002,Q1_Launch_CMP-00002,2023-05-06 00:00:00,2023-05-12,Facebook,1860,30,24.33,1.0,False,,FA
2,CMP-00003,Q3_Winter_CMP-00003,2023-12-13 00:00:00,2023-12-20,Email,77820,843,1323.39,51.0,False,,EM
3,CMP-00004,Q1_BlackFriday_CMP-00004,2023-10-30,2023-11-03,TikTok,55886,2019,2180.38,135.0,True,,TI
4,CMP-00005,Q2_Winter_CMP-00005,2023-04-22 00:00:00,2023-04-23,Facebook,7265,169,252.44,30.0,True,,FA


In [10]:
#-------------------------------------------
# Step 5 : Date parsing
#-------------------------------------------

df['start_date'].dtype # initially the data type is object
df['start_date'] = pd.to_datetime(df['start_date'], format="%Y/%m/%d", errors='coerce')
df['start_date'].dtype # after conversion the data type is datetime64[ns]


dtype('<M8[ns]')

In [15]:
#-------------------------------------------
# Step 6 : feature extraction for the season without using if else condition
#-------------------------------------------

def get_season(campaign_name):
    if 'spring' in campaign_name.lower():
        return 'spring'
    elif 'summer' in campaign_name.lower():
        return 'summer'
    elif 'fall' in campaign_name.lower() or 'autumn' in campaign_name.lower():
        return 'fall'
    elif 'winter' in campaign_name.lower():
        return 'winter'
    else:
        return 'unknown'

df['season']=df['campaign_name'].apply(get_season)
df

Unnamed: 0,campaign_id,campaign_name,start_date,end_date,channel,impressions,clicks,spend,conversions,active,clicks.1,campaign_tag,season
0,CMP-00001,Q4_Summer_CMP-00001,NaT,2023-12-13,TikTok,16795,197,102.82,20.0,True,,TI,summer
1,CMP-00002,Q1_Launch_CMP-00002,NaT,2023-05-12,Facebook,1860,30,24.33,1.0,False,,FA,unknown
2,CMP-00003,Q3_Winter_CMP-00003,NaT,2023-12-20,Email,77820,843,1323.39,51.0,False,,EM,winter
3,CMP-00004,Q1_BlackFriday_CMP-00004,NaT,2023-11-03,TikTok,55886,2019,2180.38,135.0,True,,TI,unknown
4,CMP-00005,Q2_Winter_CMP-00005,NaT,2023-04-23,Facebook,7265,169,252.44,30.0,True,,FA,winter
