# Facebook ad-sales analysis

### Packages & Load data

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns

In [None]:
# Load the raw conversion data into a DataFrame.
# NOTE(review): the path is relative to the notebook's working directory, and the
# 'kag-concersion-data' spelling is presumably the real name of the input folder —
# confirm before "correcting" it.
data = pd.read_csv('../input/kag-concersion-data/KAG_conversion_data.csv') 

### Data overview

In [None]:
data.head()

#### Descriptive Stats

In [None]:
data.describe()

In [None]:
data.gender.unique()

In [None]:
# Number of distinct values in 'interest' (nunique counts them directly,
# without building the full frequency table first)
data.interest.nunique()

In [None]:
# Number of distinct values in 'xyz_campaign_id' (nunique counts them directly,
# without building the full frequency table first)
data.xyz_campaign_id.nunique()

In [None]:
# Number of distinct values in 'fb_campaign_id' (nunique counts them directly,
# without building the full frequency table first)
data.fb_campaign_id.nunique()

In [None]:
data.info()

In [None]:
# Change the 'interest' data type to object, as the codes represent a category rather than a value.
# astype returns a new DataFrame, so the result must be assigned back for the cast to take effect.
data = data.astype({'interest': 'object'})
data.dtypes

In [None]:
data.age.unique()

### Feature engineering

In [None]:
# #change gender to binary F=1, M=0

# data['gender_binary'] = np.where(data['gender'].isin(['F']),1,0)

In [None]:
# data['gender_binary']

In [None]:
data.Total_Conversion.unique()

In [None]:
# Use 'ad_id' as the row index.
# Once 'ad_id' has become the index it is no longer a column, so the guard lets
# this cell be re-run without raising a KeyError.
if 'ad_id' in data.columns:
    data = data.set_index('ad_id')

In [None]:
data.columns

#### Correlation Matrix

In [None]:
# Set the default figure size (width, height in inches) for the plots that follow.
# NOTE(review): 11.7 x 8.27 presumably corresponds to A4 landscape — confirm.
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [None]:
# Pairwise correlation of the numeric columns only.
# Text columns (e.g. gender) cannot be correlated and make DataFrame.corr()
# raise on pandas >= 2.0, so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()

# plot the heatmap, labelling both axes with the column names
sns.heatmap(corr, annot=True, cmap='Blues',
        xticklabels=corr.columns,
        yticklabels=corr.columns)

#### Calculated Fields

In [None]:
# Click-through rate as a percentage:
#   CTR % = (Clicks / Impressions) * 100
data["CTR %"] = data["Clicks"].div(data["Impressions"]).mul(100)


In [None]:
# Cost-per-click:
#   CPC = Spent / Clicks
# An ad with zero clicks has no defined CPC. Mapping 0 clicks to NaN keeps the
# result as NaN instead of letting a non-zero spend divide to +inf, which would
# distort the summary statistics and correlations computed later.
data["CPC"] = data["Spent"] / data["Clicks"].replace(0, np.nan)

In [None]:
# Conversion rate as a percentage:
#   CVR % = (Total_Conversion / Impressions) * 100
# Scaled by 100 so the values agree with the "%" in the column name and with
# how the "CTR %" column is computed.
# NOTE(review): conversions are measured against impressions here, not clicks —
# confirm this is the intended definition.
data["CVR %"] = (data["Total_Conversion"] / data["Impressions"]) * 100

In [None]:
data.head()

#### Visualization with calculated fields

In [None]:
# Pairwise correlation of the numeric columns, now including the calculated fields.
# Text columns (e.g. gender) cannot be correlated and make DataFrame.corr()
# raise on pandas >= 2.0, so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()

# plot the heatmap, labelling both axes with the column names
sns.heatmap(corr, annot=True, cmap='BuPu',
        xticklabels=corr.columns,
        yticklabels=corr.columns)

In [None]:
data.describe()

In [None]:
#create function for columns histogram

def hist_function(df, column):
    """Draw a histogram of a single column.

    Args:
        df: Table-like object; ``df[column]`` must expose ``.plot.hist()``
            (for example a pandas DataFrame).
        column: Label of the column to plot.

    Returns:
        Whatever ``df[column].plot.hist()`` returns.
    """
    selected = df[column]
    return selected.plot.hist()

In [None]:
#import libraries

import matplotlib.pyplot as plt

In [None]:
# Loop through the columns and draw one histogram per integer-typed column.
# is_integer_dtype is used instead of `dtype == int` because the latter depends on
# what the platform maps `int` to, and can silently skip every column.
my_list = []  # collects the object returned by each hist_function call
for col in data.columns:
    if pd.api.types.is_integer_dtype(data[col]):
        my_list.append(hist_function(data, col))
        plt.title(col)
        plt.show()
        

In [None]:
# Total 'Spent' for each xyz_campaign_id, drawn as a bar chart
AdSpend = data.groupby('xyz_campaign_id')[['Spent']].sum()
ax = AdSpend.plot.bar(y='Spent', figsize=(12, 4))
ax.set_ylabel('AdSpend')
ax.set_title('AdSpend Per xyz Campaign ID')
ax.margins(0.1)
ax.figure.subplots_adjust(bottom=0.10)
plt.show()

In [None]:
"""the majority of the money is spent on the  #1178 campaign ID"""

In [None]:
# Total 'Total_Conversion' for each xyz_campaign_id, drawn as a bar chart
# (the AdSpend name is reused here; it holds conversion totals in this cell)
AdSpend = data.groupby('xyz_campaign_id')[['Total_Conversion']].sum()
ax = AdSpend.plot.bar(y='Total_Conversion', figsize=(12, 4))
ax.set_ylabel('Conversion')
ax.set_title('Conversion Per xyz Campaign ID')
ax.margins(0.1)
ax.figure.subplots_adjust(bottom=0.10)
plt.show()

In [None]:
"""the #1178 campaign ID had the majority of the conversions """

In [None]:
# # Conversion per xyz_campaign_id
# AdSpend = data.groupby(['xyz_campaign_id']).agg({'Total_Conversion':'count'})
# AdSpend.plot.bar(y='Total_Conversion', figsize=(12, 4))
# plt.ylabel('Conversion')
# plt.title('Conversion Per xyz Campaign ID')
# plt.margins(0.1)
# plt.subplots_adjust(bottom = 0.10)
# plt.show()

In [None]:
# Mean 'CTR %' of the rows in each xyz_campaign_id, drawn as a bar chart
# (the AdSpend name is reused here; it holds mean CTR values in this cell)
AdSpend = data.groupby('xyz_campaign_id')[['CTR %']].mean()
ax = AdSpend.plot.bar(y='CTR %', figsize=(12, 4))
ax.set_ylabel('CTR %')
ax.set_title('CTR % Per xyz Campaign ID')
ax.margins(0.1)
ax.figure.subplots_adjust(bottom=0.10)
plt.show()

In [None]:
# Mean 'CTR %' of the rows for each gender, drawn as a bar chart
# (the AdSpend name is reused here; it holds mean CTR values in this cell)
AdSpend = data.groupby('gender')[['CTR %']].mean()
ax = AdSpend.plot.bar(y='CTR %', figsize=(12, 4))
ax.set_ylabel('CTR %')
ax.set_title('CTR % per Gender')
ax.margins(0.1)
ax.figure.subplots_adjust(bottom=0.10)
plt.show()

In [None]:
"""Females had a higher CTR, but the difference is not significant to conclude anything at this stage"""

In [None]:
# Total 'Total_Conversion' for each gender, drawn as a bar chart
# (the AdSpend name is reused here; it holds conversion totals in this cell)
AdSpend = data.groupby('gender')[['Total_Conversion']].sum()
ax = AdSpend.plot.bar(y='Total_Conversion', figsize=(12, 4))
ax.set_ylabel('Total_Conversion')
ax.set_title('Conversion per Gender')
ax.margins(0.1)
ax.figure.subplots_adjust(bottom=0.10)
plt.show()

In [None]:
"""the Female group has a slightly higher sum of conversions too"""

In [None]:
# Distribution of 'CTR %' for each gender
sns.boxplot(
    data=data,
    x='gender',
    y='CTR %',
    hue='gender',
    palette="colorblind",
)

In [None]:
# Distribution of 'Spent' for each xyz_campaign_id
sns.boxplot(
    data=data,
    x='xyz_campaign_id',
    y='Spent',
    hue='xyz_campaign_id',
    palette="colorblind",
)

In [None]:
"""as the majority of the money spent on the 1178 campaign, we will be focusing on this from now on, however we should check whether the Spend is efficient against other campaigns"""

### Regression analysis 

In [None]:
# One sub-frame per xyz_campaign_id, holding only that campaign's rows
filtered_data1178 = data.query('xyz_campaign_id == 1178')
filtered_data916 = data.query('xyz_campaign_id == 916')
filtered_data936 = data.query('xyz_campaign_id == 936')

In [None]:
filtered_data1178.head()

In [None]:
filtered_data916.head()

In [None]:
# correlation between spent and the number of conversions shows that the more money we spend the more conversions we receive

filtered_data1178[['Spent', 'Total_Conversion']].corr()

In [None]:
sns.lmplot(x="Spent", y="Total_Conversion", data=filtered_data1178, height=7, aspect=1.5)

In [None]:
sns.lmplot(x="Spent", y="Total_Conversion", data=filtered_data916, height=7, aspect=1.5)

In [None]:
sns.lmplot(x="Spent", y="Total_Conversion", data=filtered_data936, height=7, aspect=1.5)

In [None]:
#inport libraries
from scipy import stats
from scipy.stats import linregress

In [None]:
#campaign ID 936 

x = filtered_data936["Spent"] 
y = filtered_data936["Total_Conversion"]

In [None]:
# linregress returns, in order:
#   s  - slope of the regression line
#   i  - intercept of the regression line
#   r  - correlation coefficient (rvalue)
#   p  - p-value for the null hypothesis that the slope is zero
#   st - standard error of the estimated slope (gradient), under the assumption of residual normality

s,i,r,p,st = linregress(x,y)

In [None]:
#936 values
stats.linregress(x,y)

In [None]:
#campaign ID 916

x = filtered_data916["Spent"] 
y = filtered_data916["Total_Conversion"]

In [None]:
s,i,r,p,st = linregress(x,y)

In [None]:
#campaign ID 916 values
stats.linregress(x,y)

In [None]:
#campaign ID 1178 

x = filtered_data1178["Spent"] 
y = filtered_data1178["Total_Conversion"]

In [None]:
s,i,r,p,st = linregress(x,y)

In [None]:
#campaign ID 1178 values
stats.linregress(x,y)

#### Conclusion:

The slope of the 1178 campaign is the steepest, so it has the best Spend-to-Conversion ratio of the three campaigns.

## Analysis of campaign ID 1178

In [None]:
filtered_data1178.head()

### Analysis by Age and Gender

In [None]:
filtered_data1178.head()

In [None]:
filtered_data1178.Total_Conversion.value_counts()

In [None]:
# Number of conversions by age group.
# The column is selected before aggregating so that only Total_Conversion is
# summed, rather than summing every column (text ones included) and discarding the rest.
filtered_data1178.groupby('age')['Total_Conversion'].sum()

In [None]:
# Number of conversions by age group, drawn as a bar chart.
# The column is selected before aggregating so that only Total_Conversion is
# summed, rather than summing every column (text ones included) and discarding the rest.
Conversion_per_age = filtered_data1178.groupby('age')['Total_Conversion'].sum()
# A Series has a single set of values to plot, so no y= column is needed.
Conversion_per_age.plot.bar(figsize=(12, 4))
plt.ylabel('Conversions')
plt.title('Conversions by Age')
plt.margins(0.1)
plt.subplots_adjust(bottom = 0.10)
plt.show()

In [None]:
# Number of conversions by gender.
# The column is selected before aggregating so that only Total_Conversion is
# summed, rather than summing every column (text ones included) and discarding the rest.
filtered_data1178.groupby('gender')['Total_Conversion'].sum()

In [None]:
# Number of conversions by gender, drawn as a bar chart.
# The column is selected before aggregating so that only Total_Conversion is
# summed, rather than summing every column (text ones included) and discarding the rest.
# (The Conversion_per_age name is reused; in this cell it holds per-gender totals.)
Conversion_per_age = filtered_data1178.groupby('gender')['Total_Conversion'].sum()
# A Series has a single set of values to plot, so no y= column is needed.
Conversion_per_age.plot.bar(figsize=(12, 4))
plt.ylabel('Conversions')
plt.title('Conversions by Gender')
plt.margins(0.1)
plt.subplots_adjust(bottom = 0.10)
plt.show()

In [None]:
# Conversions by Age category
sns.catplot(x="age", y="Total_Conversion", hue="age",data=filtered_data1178)

In [None]:
# sns.catplot(x="age", y="Total_Conversion",jitter= False, hue="age",data=filtered_data1178)

### Interest group analysis

In [None]:
# Number of rows in filtered_data1178 for each 'interest' value
fig_dims = (15,6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.countplot(data=filtered_data1178, x='interest', ax=ax)
plt.show()

### ANOVA Test of Interest

In [None]:
Interest_categories = filtered_data1178.interest.unique().tolist()

In [None]:
Interest_categories

In [None]:
# One Series of 'CVR %' values per interest category, in the order of Interest_categories
Interest_data = [
    filtered_data1178.loc[filtered_data1178["interest"] == interests, "CVR %"]
    for interests in Interest_categories
]

In [None]:
len(Interest_data)

In [None]:
Interest_data[0]

In [None]:
Interest_data

In [None]:
from scipy import stats

stats.f_oneway(*Interest_data)

#### Conclusion: 
The p-value of 0.0009416503097439706 shows that interest targeting has a statistically significant effect on Conversion Rate.

### Conversion Rate optimization by Interest

In [None]:
# Mean 'CVR %' for each interest value, drawn as a bar chart.
# The column is selected before aggregating: averaging the whole frame would also
# try to average the text columns, which raises a TypeError on pandas >= 2.0.
# (The Conversion_per_age name is reused; in this cell it holds per-interest means.)
Conversion_per_age = filtered_data1178.groupby('interest')['CVR %'].mean()
# A Series has a single set of values to plot, so no y= column is needed.
Conversion_per_age.plot.bar(figsize=(12, 4))
plt.ylabel('CVR %')
plt.title('CVR % by interest')
plt.margins(0.1)
plt.subplots_adjust(bottom = 0.10)
plt.show()

### ANOVA Test of Age Groups

In [None]:
Age_groups = filtered_data1178.age.unique().tolist()

In [None]:
Age_groups

In [None]:
# One Series of 'CVR %' values per age group, in the order of Age_groups
Age_category_data = [
    filtered_data1178.loc[filtered_data1178["age"] == ages, "CVR %"]
    for ages in Age_groups
]

In [None]:
len(Age_category_data)

In [None]:
Age_category_data

In [None]:
from scipy import stats

stats.f_oneway(*Age_category_data)

#### conclusion:

The p-value shows that age targeting has a statistically significant effect on Conversion Rate.

In [None]:
# Mean 'CVR %' for each age group, drawn as a bar chart.
# The column is selected before aggregating: averaging the whole frame would also
# try to average the text columns, which raises a TypeError on pandas >= 2.0.
Conversion_per_age = filtered_data1178.groupby('age')['CVR %'].mean()
# A Series has a single set of values to plot, so no y= column is needed.
Conversion_per_age.plot.bar(figsize=(12, 4))
plt.ylabel('CVR %')
plt.title('CVR % by Age')
plt.margins(0.1)
plt.subplots_adjust(bottom = 0.10)
plt.show()

### 2-sample hypothesis test by Gender (Conversion Rate)

In [None]:
# filter by gender
Gender_groups = filtered_data1178.gender.unique().tolist()

In [None]:
Gender_groups

In [None]:
# 'CVR %' values of the rows whose gender is 'M'
Male_group = filtered_data1178.loc[filtered_data1178["gender"].eq('M'), "CVR %"]

In [None]:
# 'CVR %' values of the rows whose gender is 'F'
Female_group = filtered_data1178.loc[filtered_data1178["gender"].eq('F'), "CVR %"]

In [None]:
Male_group

In [None]:
Female_group

In [None]:
#two independent samples test

stats.ttest_ind(Male_group,Female_group)

#### conclusion:

The p-value of 0.1075239428447942 is above the conventional 0.05 significance level, so the difference in Conversion Rate between the two gender groups is not statistically significant. 
This means we fail to reject the null hypothesis: the data gives no evidence that targeting by gender would have any effect on Conversion Rate performance. 