In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

### General Data Overview
- Check first few rows of data
- Check the data types of each column
- Check for any missing values in the dataset

In [None]:
# Load the already-cleaned campaign export and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='clean_marketing_campaign.csv')
df.head(5)

In [None]:
# Show column dtypes and non-null counts, then report missing values based on
# the data itself rather than a hard-coded statement.
df.info()
missing_total = int(df.isna().sum().sum())
if missing_total == 0:
    print('No null values observed in (cleaned) dataset.')
else:
    print(f'{missing_total} null values observed in (cleaned) dataset.')

### Descriptive Statistics
- Compute summary statistics for numerical columns
- Explore the distribution of numerical values using histograms or box plots

In [None]:
# Summary statistics for the dataframe's columns; as the last expression in the
# cell, the resulting table is rendered as the cell output.
df.describe()

- Some possible anomalies observed in Year_Birth and Age (e.g., a birth year of 1893 is not plausible).
- The Z_CostContact and Z_Revenue columns each appear to contain a single repeated value, so they are likely not relevant for analysis.

In [None]:
# Box plots for the numeric columns.
# Each entry pairs the plot(s) to draw -- as (column, title) tuples -- with the
# observation printed once that group of plots has been shown. Plot order and
# observation order match the original hand-written sequence.
boxplot_groups = [
    ([('Year_Birth', 'Birth Year Box Plot')],
     'Median birth year appears to be around 1972, with some outliers around 1900 (likely errors and not relevant to data analysis).'),
    ([('Income', 'Income Box Plot')],
     'Most incomes fall between 40-70k, with a few outliers around the 150k mark (not concerning), but there is one outlier over 600k, which may be erroneous.'),
    ([('Recency', 'Recency Box Plot')],
     'Median recency is around 50; no outliers observed.'),
    ([('MntWines', 'Wine Sold Box Plot'),
      ('MntFruits', 'Fruit Sold Box Plot'),
      ('MntMeatProducts', 'Meat Products Sold Box Plot'),
      ('MntFishProducts', 'Fish Products Sold Box Plot'),
      ('MntSweetProducts', 'Sweet Products Sold Box Plot'),
      ('MntGoldProds', 'Gold Products Sold Box Plot')],
     'Majority of orders for all products had a multitude of outliers well above the expected range.'),
    ([('NumDealsPurchases', 'Deals Purchased Box Plot')],
     'Median deal acceptance per customer was 2, but several outliers observed ranging from 7 to 15 deals purchased!'),
    ([('NumWebPurchases', 'Web Purchases Box Plot'),
      ('NumCatalogPurchases', 'Catalog Purchases Box Plot'),
      ('NumStorePurchases', 'Store Purchases Box Plot')],
     'Average store purchases greatly outweighed both catalog and web purchases.'),
    ([('NumWebVisitsMonth', 'Monthly Web Visits Box Plot')],
     'Majority of customers visited the website an average of 6 times monthly, though some customers visited more frequently, as much as 20 times monthly.'),
    ([('Z_CostContact', 'Contact Cost Box Plot')],
     'Irrelevant, as all values were the same.'),
    ([('Z_Revenue', 'Revenue Box Plot')],
     'Irrelevant, as all values were the same.'),
    ([('Age', 'Age Box Plot')],
     'Majority of customers appear to be between 50 and 65 years of age. A handful of outliers were observed, all above age 120.'),
]

for plots, observation in boxplot_groups:
    for column, title in plots:
        sns.boxplot(y=df[column])
        plt.title(title)
        plt.show()
    # The observation refers to every plot in the group, so print it last.
    print(observation)

### Univariate Analysis
- Explore the distribution of each numerical variable using histograms or kernel density plots
- Explore the distribution of each categorical variable using bar plots or pie charts
- Identify outliers in numerical variables using box plots or scatter plots

In [None]:
# Kernel density plots for the numeric columns.
# Each entry pairs the plot(s) to draw -- as (column, title) tuples -- with the
# observations printed after that group. The original cell drew the
# NumDealsPurchases KDE twice (once more after the web-visits plot); it is
# drawn once here and both of its observations are kept together.
kde_groups = [
    ([('Year_Birth', 'Birth Year Kernel Density')],
     ['Fairly normal distribution observed, with highest density observed around 1975.']),
    ([('Income', 'Income Kernel Density')],
     ['Fairly normal distribution observed.']),
    ([('Recency', 'Recency Kernel Density')],
     ['Wide range of values observed, but all fairly evenly distributed.']),
    ([('MntWines', 'Wine Sales Kernel Density'),
      ('MntFruits', 'Fruit Sales Kernel Density'),
      ('MntMeatProducts', 'Meat Product Sales Kernel Density'),
      ('MntFishProducts', 'Fish Product Sales Kernel Density'),
      ('MntSweetProducts', 'Sweet Product Sales Kernel Density'),
      ('MntGoldProds', 'Gold Product Sales Kernel Density')],
     ['All products show a fairly even distribution for smaller order quantities, and distribution tapers off as order quantity increases.']),
    ([('NumDealsPurchases', 'Deal Purchases Kernel Density')],
     ['Highest distribution observed around 2 deals purchased, but several instances above that, tapering off to about 10.',
      'Highest density observed at 2 deals purchased, with distribution tapering off approaching 8, and then nearly flat approaching 15.']),
    ([('NumWebPurchases', 'Web Purchases Kernel Density'),
      ('NumCatalogPurchases', 'Catalog Purchases Kernel Density'),
      ('NumStorePurchases', 'Store Purchases Kernel Density')],
     ['Sales through different channels showed a somewhat skewed distribution, though Store purchases definitely showed a broader and higher range of marks.']),
    ([('NumWebVisitsMonth', 'Monthly Web Visits Kernel Density')],
     []),
    ([('Z_CostContact', 'Contact Cost Kernel Density')],
     ['Irrelevant.']),
    ([('Z_Revenue', 'Revenue Kernel Density')],
     ['Irrelevant.']),
    ([('Age', 'Age Kernel Density')],
     ['Slightly skewed, but near normal distribution of ages observed.']),
]

for plots, observations in kde_groups:
    for column, title in plots:
        if df[column].nunique() <= 1:
            # A density estimate is undefined for a column with a single
            # distinct value, so report that instead of drawing an empty figure.
            print(f'{title}: skipped, {column} has a single unique value.')
            continue
        sns.kdeplot(data=df, x=column, fill=True)
        plt.title(title)
        plt.show()
    for observation in observations:
        print(observation)

In [None]:
# Customer counts per category: pivot on the categorical column and count IDs,
# which yields a one-row frame with one column per category; that wide frame
# is handed to seaborn directly to draw one bar per column.
education = pd.pivot_table(df, values='ID', columns='Education', aggfunc='count')
sns.barplot(data=education)
plt.title('Education Counts')
plt.show()
marital = pd.pivot_table(df, values='ID', columns='Marital_Status', aggfunc='count')
sns.barplot(data=marital)
plt.title('Marital Status Counts')
plt.show()
print('The vast majority of customers are couples with a college degree.')

In [None]:
# Scatter plots of each numeric column, used to eyeball outliers.
# Each entry pairs the plot(s) to draw -- as (column, title) tuples -- with the
# observation printed after that group of plots.
scatter_groups = (
    ((('Year_Birth', 'Birth Year Scatter Plot'),),
     'Healthy spread of birth years with an expected distribution, with only 3 exceptions.'),
    ((('Income', 'Income Scatter Plot'),),
     'Vast majority of incomes fell within a normal range, with only a few slightly outside of it and 1 outlier.'),
    ((('Recency', 'Recency Scatter Plot'),),
     'Wide range of values observed.'),
    ((('MntWines', 'Wine Sold Scatter Plot'),
      ('MntFruits', 'Fruit Sold Scatter Plot'),
      ('MntMeatProducts', 'Meat Products Sold Scatter Plot'),
      ('MntFishProducts', 'Fish Products Sold Scatter Plot'),
      ('MntSweetProducts', 'Sweet Products Sold Scatter Plot'),
      ('MntGoldProds', 'Gold Products Sold Scatter Plot')),
     'All products showed a similar distribution of quantities sold, with wine and meat products outperforming all other categories.'),
    ((('NumDealsPurchases', 'Deals Purchased Scatter Plot'),),
     'Majority of customers accepted 6 or fewer deals, with a healthy amount of customers accepting even more.'),
    ((('NumWebPurchases', 'Web Purchases Scatter Plot'),
      ('NumCatalogPurchases', 'Catalog Purchases Scatter Plot'),
      ('NumStorePurchases', 'Store Purchases Scatter Plot')),
     'Catalog and web purchases were about the same, while store purchases varied more and also showed more popularity.'),
    ((('NumWebVisitsMonth', 'Monthly Web Visits Scatter Plot'),),
     'Majority of customers had 9 or fewer web visits monthly, though several customers did visit a little more frequently.'),
    ((('Z_CostContact', 'Contact Cost Scatter Plot'),),
     'Only one value observed: 3.'),
    ((('Z_Revenue', 'Revenue Scatter Plot'),),
     'Only one value observed: 11.'),
    ((('Age', 'Age Scatter Plot'),),
     'Healthy spread of ages with an expected distribution, with only 3 exceptions.'),
)

for group_plots, group_note in scatter_groups:
    for col_name, plot_title in group_plots:
        # The Series is passed positionally, exactly as in the hand-written calls.
        sns.scatterplot(df[col_name])
        plt.title(plot_title)
        plt.show()
    print(group_note)

### Bivariate Analysis
- Explore the relationship between numerical variables and the target variable (Response) using scatter plots or correlation matrices
- Explore the relationship between categorical variables and the target variable using bar plots or chi-square tests
- Explore the relationship between numerical and categorical variables using box plots or violin plots

In [None]:
# Pairwise correlation matrix; numeric_only=True leaves non-numeric columns out
# of the computation. The Response row/column is the one of interest for this section.
df.corr(numeric_only=True)

#### No relevant correlation observed between any numerical category and Response

In [None]:
# Bar plot of Response split by each categorical attribute in turn.
for hue_column, title_suffix in (('Marital_Status', 'Marital Status'),
                                 ('Education', 'Education')):
    sns.barplot(data=df, y='Response', hue=hue_column)
    plt.title(f'Response by {title_suffix}')
    plt.show()

In [None]:
# Contingency table: rows are marital-status categories, columns are Response values.
cont_table = pd.crosstab(index=df['Marital_Status'], columns=df['Response'])

print(cont_table)

In [None]:
# Chi-square test of independence between Marital Status and Response.
# Null hypothesis: the two variables are independent.
chi2, p, dof, expected = chi2_contingency(cont_table)
print("Chi-square statistic:", chi2)
print("P-value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies table:\n", expected)

# Common guideline: the chi-square approximation is unreliable when expected
# cell counts are small, so surface that next to the result.
if (expected < 5).any():
    print('Warning: some expected frequencies are below 5; the chi-square result may be unreliable.')

alpha = 0.05
if p <= alpha:
    # Rejecting independence means the data show an association.
    print('Reject the null hypothesis.\nMarital Status appears to have impact on Response.')
else:
    # Failing to reject means no association was detected at this alpha.
    print('Fail to reject the null hypothesis.\nNo Correlation between Marital Status and Response.')

In [None]:
# Contingency table: rows are education levels, columns are Response values.
cont_table2 = pd.crosstab(index=df['Education'], columns=df['Response'])

print(cont_table2)

In [None]:
# Chi-square test of independence between Education and Response.
# Null hypothesis: the two variables are independent.
chi2, p, dof, expected = chi2_contingency(cont_table2)
print("Chi-square statistic:", chi2)
print("P-value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies table:\n", expected)

# Common guideline: the chi-square approximation is unreliable when expected
# cell counts are small, so surface that next to the result.
if (expected < 5).any():
    print('Warning: some expected frequencies are below 5; the chi-square result may be unreliable.')

alpha = 0.05
if p <= alpha:
    # Rejecting independence means the data show an association.
    print('Reject the null hypothesis.\nEducation appears to have impact on Response.')
else:
    # Failing to reject means no association was detected at this alpha.
    print('Fail to reject the null hypothesis.\nNo Correlation between Education and Response.')

In [None]:
# Income distribution split by each categorical attribute, with the
# observation for each plot printed right after it.
sns.violinplot(data=df, y='Income', hue='Education')
plt.title('Income by Education')
plt.show()
print('No significant difference in average income based on Education, except for those with a basic education, which had a lower average income.')
sns.violinplot(data=df, y='Income', hue='Marital_Status')
plt.title('Income by Marital Status')
plt.show()
print('No significant difference in average income based on Marital Status. Though the few that felt marital status is "absurd" appear to have a slightly higher average income.')

In [None]:
# Same schema listing as in the overview section, repeated here as a quick
# reference for the column names used by the plots that follow.
df.info()

In [None]:
# Distribution of children / teenagers at home, split by marital status.
for household_column, household_label in (('Kidhome', 'Kids'), ('Teenhome', 'Teens')):
    sns.violinplot(data=df, y=household_column, hue='Marital_Status')
    plt.title(f'{household_label} in Home by Marital Status')
    plt.show()

- Appears that most customers do not have kids in their home, and those that do mostly had only 1, with just a few reporting 2 in the home.
- Appears as though more customers had teens in the home, which supports a predominantly middle-aged demographic.
- Marital status does not appear to have a significant impact on whether or not customers have kids or teens in the home.

In [None]:
# Distribution of children / teenagers at home, split by education level.
for household_column, household_label in (('Kidhome', 'Kids'), ('Teenhome', 'Teens')):
    sns.violinplot(data=df, y=household_column, hue='Education')
    plt.title(f'{household_label} in Home by Education')
    plt.show()

- Education level does not appear to have an impact on whether or not customers have children in the home.
- Customers with a college education appear to be more likely to have teens in the home.

In [None]:
# For each of the five campaigns: violin plot of the acceptance flag split by
# marital status, followed by a table of accepted offers summed per status.
for campaign_number in range(1, 6):
    accepted_column = f'AcceptedCmp{campaign_number}'
    sns.violinplot(data=df, y=accepted_column, hue='Marital_Status')
    plt.title(f'Campaign {campaign_number} Success by Marital Status')
    plt.show()
    campaign_pivot = pd.pivot_table(df, values=accepted_column, columns='Marital_Status', aggfunc='sum')
    display(campaign_pivot)

Campaigns performed roughly in proportion to group size across Marital Status categories.

In [None]:
# For each of the five campaigns: violin plot of the acceptance flag split by
# education level, followed by a table of accepted offers summed per level.
for campaign_number in range(1, 6):
    accepted_column = f'AcceptedCmp{campaign_number}'
    sns.violinplot(data=df, y=accepted_column, hue='Education')
    plt.title(f'Campaign {campaign_number} Success by Education')
    plt.show()
    campaign_pivot = pd.pivot_table(df, values=accepted_column, columns='Education', aggfunc='sum')
    display(campaign_pivot)

Campaigns performed roughly in proportion to group size across Education levels.

In [None]:
# Total accepted offers per campaign, shown as a table and as a bar chart.
campaign_ids = range(1, 6)
campaigns = {
    'Campaign': [f'Campaign {cid}' for cid in campaign_ids],
    'Hits': [df[f'AcceptedCmp{cid}'].sum() for cid in campaign_ids],
}
camptable = pd.DataFrame(campaigns)
display(camptable)
sns.barplot(data=camptable, x='Campaign', y='Hits', hue='Campaign')
plt.title('Campaign Success')
plt.show()

In [None]:
# Column totals per product category, shown as a table and as a bar chart.
# NOTE(review): the Mnt* columns are labelled "Quantity Sold" here; confirm
# against the data dictionary that they hold unit counts rather than amounts.
product_columns = {
    'Wines': 'MntWines',
    'Fruits': 'MntFruits',
    'Meat Products': 'MntMeatProducts',
    'Fish Products': 'MntFishProducts',
    'Sweet Products': 'MntSweetProducts',
    'Gold Products': 'MntGoldProds',
}
sales = {
    'Product': list(product_columns),
    'Quantity Sold': [df[source_column].sum() for source_column in product_columns.values()],
}
salestable = pd.DataFrame(sales)
display(salestable)
sns.barplot(data=salestable, x='Product', y='Quantity Sold', hue='Product')
plt.title('Products Sold')
plt.show()

### Observations
- Write an analysis report on performing EDA using Python in the context of building a fraud detection system for Retail Analytics
- (pretty sure this capstone had nothing to do with fraud detection...)

In [None]:
# Written summary of the EDA findings and recommendations, emitted as cell output.
print('''The dataset was imported without any null values. The only null values previously found were only in the income column, and they were replaced with the average income.
Overall, income levels of our customers were fairly consistent with the exception of 1 outlier, and a few that were somewhat above the average.
The average age of our customers is about 55, with the majority of them falling between the ages of 50 and 65.
Additionally, the majority of our customers are college graduates, and about a third pursued post-graduate education.
The largest portion of our customers are either married or together. 

Of the 5 marketing campaigns that were observed, all did fairly well except for Campaign 2.
Overall, campaign performances were proportionate to the distribution of our customer base for both marital status and education level.
Among our post-graduate customers, those with PhDs responded most favorably.
Those with a basic education did not respond well to any marketing campaigns.

Further analysis of the marketing campaigns will be necessary to determine what made Campaign 4 so successful, and to determine why Campaign 2 was unsuccessful.
Marketing efforts should be geared towards college-educated couples around 50 to 60 years of age.
Marketing efforts should continue to focus most on our wines and meat products. 
It would be prudent to consider other products that go well with wine as that is our best performing product.
''')