In [101]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [102]:
# Data overview: load the marketing campaign file (tab-separated) and preview it.
csv_path = "/kaggle/input/customer-personality-analysis/marketing_campaign.csv"
data = pd.read_csv(csv_path, sep="\t")
data.head()

In [103]:
# First, let's view a summary of all the columns in the loaded frame.
data.info()

In [104]:
# Let's check if there are any NaN values.
# The per-column counts are printed explicitly: a bare expression in the middle
# of a cell is evaluated but never displayed, so the check would otherwise be invisible.
print(data.isnull().sum())

# Looks like the only column with NaN values is Income. To fix this, we take the
# average income and replace the missing values with it.
averageIncome = data['Income'].mean()
print('The average income is', '$' + str(averageIncome))

# Since the data seems to round the income to the nearest dollar, we do the same
# with the average income.
roundedAverageIncome = round(averageIncome, 0)
print('Rounded: ' + str(roundedAverageIncome))

# Now fill the missing values with the rounded average income.
data['Income'] = data['Income'].fillna(roundedAverageIncome)

In [105]:
# Replace the birth-year column with an Age column.
# Ages are measured against a fixed reference year so the result does not
# change depending on when the notebook is run.
REFERENCE_YEAR = 2021
data = data.rename(columns={"Year_Birth": "Age"})
data['Age'] = REFERENCE_YEAR - data['Age']

# Merge Kidhome and Teenhome into a single "number of kids" column.
# The MntKids name is kept because later cells group and filter on it.
data['MntKids'] = data['Kidhome'] + data['Teenhome']

# Drop the two columns that were just merged, plus the campaign/response
# columns that this analysis does not use.
# `columns=` is used instead of drop(label, 1): passing the axis positionally
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
data = data.drop(columns=[
    'Kidhome', 'Teenhome',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'Response',
])

# Preview the new Age column (placed last so the cell actually displays it).
data['Age'].head()

In [106]:
# Checking for more unneeded columns.
# The unique values are printed explicitly; as bare mid-cell expressions they
# would be computed and then silently discarded.
print(data['Z_CostContact'].unique())
print(data['Z_Revenue'].unique())

# It seems like these two columns hold one value only, so they carry no
# information and are dropped.
# `columns=` replaces the positional axis argument (drop(label, 1)), which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = data.drop(columns=['Z_CostContact', 'Z_Revenue'])
data.columns

In [107]:
# Data cleaning is finished; from here on we analyze the data.
import matplotlib.pyplot as plt

# Count shoppers in three age brackets: below 30, 30 to 50 (both inclusive),
# and above 50. Boolean masks are summed instead of looping over every row.
ages = data['Age']
below30 = int((ages < 30).sum())
btw30and50 = int(((ages >= 30) & (ages <= 50)).sum())
above50 = int((ages > 50).sum())

plt.style.use('ggplot')
x = ['Below 30', 'Between 30 and 50', 'Above 50']
y = [below30, btw30and50, above50]

# One bar per bracket, positioned at 0, 1, 2.
x_pos = list(range(len(x)))
plt.bar(x_pos, y, color='green', width=0.4)
plt.title("Shoppers based on age group")
plt.xlabel("Age Group")
plt.ylabel("# of Shoppers")
plt.xticks(x_pos, x)

# Write each count just above its bar.
for pos, count in zip(x_pos, y):
    plt.text(pos - 0.05, count, str(count))
plt.show()
# As the chart shows, most shoppers are 30 and above, with very few shoppers below 30.

In [108]:
# Per-customer total across the six Mnt* product columns.
data['totalMntProducts'] = (
    data['MntWines']
    + data['MntFruits']
    + data['MntMeatProducts']
    + data['MntFishProducts']
    + data['MntSweetProducts']
    + data['MntGoldProds']
)

# Mean total per number of kids, rounded, as a two-column frame.
new_df = (
    data.groupby(['MntKids'])['totalMntProducts']
    .mean()
    .round()
    .reset_index()
)
xList = list(new_df['MntKids'])
yList = list(new_df['totalMntProducts'])

# Now that we have the values, plot one bar per kid count.
plt.style.use('ggplot')
x = xList
y = yList
x_pos = list(range(len(x)))
plt.bar(x_pos, y, color='red', width=0.4)
plt.title("Products purchased based on Children")
plt.xlabel("Number of Kids")
plt.ylabel("Avg products purchased")
plt.xticks(x_pos, x)

# Label each bar with its value, slightly above the bar top.
for pos, value in zip(x_pos, y):
    plt.text(pos - 0.15, value + 15, str(value))
plt.show()

# To my surprise, the fewer children a person has, the more products they usually purchase.
# I would have presumed that more children = more products that you need to buy.



In [109]:
# Purchases based on age group.
# Each of the three lists below holds one total per product category, in this order:
# [0] wine, [1] fruits, [2] meat, [3] fish, [4] sweets, [5] gold
product_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]

# Explicit, non-overlapping age brackets. The middle bracket includes age 30,
# matching the "between 30 and 50" definition used for the shopper counts.
# (Computing it as "<= 50 minus <= 30" silently dropped everyone aged exactly 30,
# and the Sweets bracket additionally subtracted the Wine total by mistake.)
age = data['Age']
age_masks = [
    age < 30,
    (age >= 30) & (age <= 50),
    age > 50,
]

# For every bracket, sum each product column over the customers in that bracket.
below30, btw30and50, above50 = (
    [data.loc[mask, column].sum() for column in product_columns]
    for mask in age_masks
)

# Now we have all the data to start making our graphs: one pie per category.
labels = 'Below 30', 'Between 30 and 50', 'Above 50'
titles = ['Wine', 'Fruits', 'Meat', 'Fish', 'Sweets', 'Gold']
alternate_colors = ['red', 'teal', 'yellow']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 4))
fig.suptitle('Spending by Age Group')

for idx, (ax, title) in enumerate(zip(axes.flat, titles)):
    sizes = [below30[idx], btw30and50[idx], above50[idx]]
    # Every second pie uses the red/teal/yellow palette; the others keep
    # matplotlib's default colors (colors=None).
    colors = alternate_colors if idx % 2 else None
    ax.set_title(title)
    ax.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%',
           shadow=True, startangle=90)
plt.tight_layout()

In [110]:
# Gold purchases based on income.
# My guess is that people with more income would be inclined to spend more on gold
# than those with lower incomes. Let's see if this is correct.
# Income is grouped into 4 non-overlapping bands: 0-25k, 25k-50k, 50k-100k, and 100k+.
income = data['Income']
income_bands = [
    income < 25000,
    (income >= 25000) & (income < 50000),
    (income >= 50000) & (income < 100000),
    income >= 100000,  # >= so that an income of exactly 100000 is not dropped
]

# Total gold spend per band. Each band is selected directly with its own mask,
# which avoids subtracting cumulative sums (the third band previously referenced
# an undefined name and would not have removed the first band's total).
y = [data.loc[band, 'MntGoldProds'].sum() for band in income_bands]
first, second, third, fourth = y

# x positions for the four bands; 150000 is only a plotting position for the
# open-ended 100k+ band.
x = [25000, 50000, 100000, 150000]
plt.xlabel('Income')
plt.ylabel('Amount Spent on Gold')
plt.plot(x, y, marker='o')
plt.show()

# As you can see, my hypothesis was somewhat correct: the more income a person makes,
# the more they are likely to spend on gold. However, this is not true for people
# with incomes above 100k.
# Note: these are totals per band, not per-customer averages, so the number of
# customers in each band also affects the height of each point.

In [111]:
# Are younger customers more likely to purchase items online when compared to older customers?

# Explicit, non-overlapping age brackets. The middle bracket includes age 30 so
# that it matches the '30-50' label; deriving it as "<= 50 minus <= 30" left
# customers aged exactly 30 out of every bracket.
age = data['Age']
age_masks = [
    age < 30,
    (age >= 30) & (age <= 50),
    age > 50,
]

# Per bracket: [0] total web purchases, [1] total store purchases.
purchase_columns = ('NumWebPurchases', 'NumStorePurchases')
below30, btw30and50, above50 = (
    [data.loc[mask, column].sum() for column in purchase_columns]
    for mask in age_masks
)

labels = ['Below 30', '30-50', 'Above 50']
webPurchases = [below30[0], btw30and50[0], above50[0]]
storePurchases = [below30[1], btw30and50[1], above50[1]]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

# Grouped bars: web on the left and store on the right of each tick.
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, webPurchases, width, label='Web Purchases')
rects2 = ax.bar(x + width/2, storePurchases, width, label='Store Purchases')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Items Purchased')
ax.set_xlabel('Age Group')
ax.set_title('Method of Purchasing')
ax.legend()
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)
plt.xticks(x, labels)

fig.tight_layout()
plt.show()

# From this graph, we can see that even though store purchases are dominant, web purchases
# are still very prominent across all age groups.
# No age group is more likely to purchase through the web than in store.

In [112]:
# Does having more kids equate to buying more sweet products?

# Total of MntSweetProducts for customers with 0, 1, 2 and 3 kids, in that order.
sweetProducts = [
    sum(data.loc[data['MntKids'] == kid_count, 'MntSweetProducts'])
    for kid_count in range(4)
]
print(sweetProducts)

# Line chart of the four totals against the number of kids.
xvalues = list(range(4))
plt.title('Sweets Purchased based on Kids')
plt.xlabel('Number of Kids')
plt.ylabel('Amount Spent on Sweets')
plt.xticks(xvalues)
plt.plot(xvalues, sweetProducts, marker='o', color='blue')
plt.show()

# My initial idea was wrong. It seems like the more kids a customer tends to have, the less likely they are to purchase sweets.