#<p style="font-family: Fantasy, fantasy; line-height: 3.3; font-size: 40px; letter-spacing: 5px; text-align: center; color: #009ACD">Clustering: Facebook ads campaigns</p>
![](https://cdn.searchenginejournal.com/wp-content/uploads/2019/04/shutterstock_456779230.png)

<p style="font-family: Fantasy, fantasy; line-height: 1.3;font-size: 30px; letter-spacing: 5px;  color: #009ACD">Dataset information </p>

1.) ad_id: a unique ID for each ad.

2.) xyzcampaignid: an ID associated with each ad campaign of XYZ company.

3.) fbcampaignid: an ID associated with how Facebook tracks each campaign.

4.) age: age of the person to whom the ad is shown.

5.) gender: gender of the person to whom the ad is shown.

6.) interest: a code specifying the category to which the person’s interest belongs (interests are as mentioned in the person’s Facebook public profile).

7.) Impressions: the number of times the ad was shown.

8.) Clicks: number of clicks on that ad.

9.) Spent: Amount paid by company xyz to Facebook, to show that ad.

10.) Total conversion: Total number of people who enquired about the product after seeing the ad.

11.) Approved conversion: Total number of people who bought the product after seeing the ad.


<p style="font-family: Fantasy, fantasy; line-height: 1.3;font-size: 30px; letter-spacing: 5px;  color: #009ACD">Task </p>

Grouping ads based on their IMPRESSION, CLICKS, SPENT.

## **Importing libraries**

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

## **Data reading**

In [None]:
# Load the raw conversion-tracking CSV and take a first look at its layout.
DATA_PATH = '../input/clicks-conversion-tracking/KAG_conversion_data.csv'
df = pd.read_csv(DATA_PATH)

# Column labels first, then the (rows, columns) shape.
print(df.columns)
print(df.shape)

# Preview of the first rows (rendered as the cell output).
df.head()

## **Data information & exploration**

In [None]:
# Normalise every column label to upper case so later cells can refer to
# columns with one consistent spelling (e.g. 'CLICKS', 'SPENT').
df.columns = [label.upper() for label in df.columns]
df.columns

In [None]:
# Per-column dtype and non-null count overview of the raw frame.
df.info()

In [None]:
# Investigating variables: report dtype and cardinality of every column;
# low-cardinality columns (< 10 distinct values) also get their value counts.
for column in df.columns:
    n_unique = df[column].nunique()
    report = f'The column "{column}" is __{df[column].dtype}__ \nhas __{n_unique}__ unique values'
    if n_unique < 10:
        report += f': \n{df[column].value_counts()}'
    print(report)
    print(10*'==')

In [None]:
# Summary statistics of the raw frame via DataFrame.describe()
df.describe()

In [None]:
# Total number of rows, followed by how many rows are exactly zero in each
# of the performance columns.
zero_count_reports = [
    ('Number of ADS with 0 clicks: {}', 'CLICKS'),
    ('Number of 0 paids to FACEBOOK for showing ADS: {}', 'SPENT'),
    ('Number of ADS with 0 enquires about their product: {}', 'TOTAL_CONVERSION'),
    ('Number of ADS with 0 buys: {}', 'APPROVED_CONVERSION'),
]

print('Total ADS: {}'.format(len(df)))

for template, column in zero_count_reports:
    # Summing a boolean mask counts the rows where the column equals 0.
    print(template.format(int((df[column] == 0).sum())))

In [None]:
# Features variance.
# Restrict to numeric columns explicitly: the frame also contains text columns
# (AGE, GENDER), and recent pandas versions raise a TypeError instead of
# silently skipping them when numeric_only is left at its default.
df.var(numeric_only=True)

In [None]:
# Work on a copy so the dtype changes made below do not touch the raw frame `df`.
df_c = df.copy()
df_c.head()

In [None]:
# Identifier-like and grouping columns are treated as categorical rather than
# numeric from here on.
categories = ['AD_ID', 'FB_CAMPAIGN_ID','AGE', 'GENDER', 'XYZ_CAMPAIGN_ID', 'INTEREST']
df_c = df_c.astype({column: 'category' for column in categories})

In [None]:
# Distribution of categorical variables, except AD_ID & FB_CAMPAIGN_ID
# (the first two entries of `categories` are skipped).
fig = plt.figure(figsize=(20, 15))
plt.subplots_adjust(wspace=0.5)

# One horizontal count plot per column, laid out on a 2x4 grid.
for position, column in enumerate(categories[2:], start=1):
    ax = plt.subplot(2, 4, position)
    sns.countplot(data=df_c, y=column, color='#A194B6', ax=ax)
    ax.grid(axis='x')
    ax.set_title(f'Distribution of {column}')

In [None]:
# Number of ads per AGE group, split by GENDER.
fig, ax = plt.subplots(figsize=(12, 7))

gender_palette = ['#C873FB', '#5663FF']
sns.countplot(data=df_c, x='AGE', hue='GENDER', palette=gender_palette, ax=ax)
ax.set_title('Age distribution by GENDER')
ax.grid(axis='y')

In [None]:
# Number of ads per INTEREST code, split by GENDER.
fig, ax = plt.subplots(figsize=(15, 7))

gender_palette = ['#C873FB', '#5663FF']
sns.countplot(data=df_c, x='INTEREST', hue='GENDER', palette=gender_palette, ax=ax)
ax.set_title('Interest topic distribution by GENDER')
ax.grid(axis='y')

In [None]:
# Number of ads per XYZ_CAMPAIGN_ID, split by GENDER.
fig, ax = plt.subplots(figsize=(12, 7))

gender_palette = ['#C873FB', '#5663FF']
sns.countplot(data=df_c, x='XYZ_CAMPAIGN_ID', hue='GENDER', palette=gender_palette, ax=ax)
ax.set_title('XYZ_CAMPAIGN_ID distribution by GENDER')
ax.grid(axis='y')

In [None]:
# Correlation matrix.
# Only numeric columns can be correlated; df_c also holds category columns
# (see `categories`), and recent pandas versions raise instead of silently
# dropping them, so select the numeric columns explicitly.
corr = df_c.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included) so each pair of
# features is shown only once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

with sns.axes_style('white'):
    fig, ax = plt.subplots(figsize=(18,10))
    sns.heatmap(corr, mask=mask, annot=True, cmap='coolwarm', center=0, square=True, ax=ax)

In [None]:
# Relationships between numerical features (lower triangle only).
sns.pairplot(df_c, corner=True)
# Figure-level title; the spelling of "Relationships" is corrected here.
plt.suptitle("Relationships between numerical features", x=0.5 ,y=0.95, size=18, weight='bold')
plt.show()  # render the figure without echoing the Text repr

In [None]:
# Same pairwise view as before, with points coloured by GENDER.
sns.pairplot(df_c, corner=True, hue='GENDER', palette=['#C873FB', '#5663FF'])
# Figure-level title; the spelling of "Relationships" is corrected here.
plt.suptitle("Relationships between numerical features by GENDER", x=0.5 ,y=0.95, size=18, weight='bold')
plt.show()  # render the figure without echoing the Text repr

In [None]:
# Same pairwise view as before, with points coloured by AGE group.
sns.pairplot(df_c, corner=True, hue='AGE')
# Figure-level title; the spelling of "Relationships" is corrected here.
plt.suptitle("Relationships between numerical features by AGE category", x=0.5 ,y=0.95, size=18, weight='bold')
plt.show()  # render the figure without echoing the Text repr

In [None]:
# Show axes spines (this changes matplotlib's global rcParams, so it also
# affects every figure created after this cell).
for side in ('left', 'right', 'top', 'bottom'):
    plt.rcParams[f'axes.spines.{side}'] = True

# Distribution of our numerical variables using boxplots
boxplot_columns = ['IMPRESSIONS', 'CLICKS', 'SPENT', 'TOTAL_CONVERSION', 'APPROVED_CONVERSION']

# constrained_layout handles the spacing. It is incompatible with
# plt.subplots_adjust (matplotlib warns and drops one of the two), so only
# the layout engine is used here.
fig, axes = plt.subplots(2, 3, figsize=(14, 7), constrained_layout=True)
axes[-1, -1].axis('off')  # 2x3 grid but only five variables: hide the spare panel

# zip stops after the five columns, leaving the hidden sixth panel untouched.
for ax, column in zip(axes.flat, boxplot_columns):
    sns.boxplot(ax=ax, data=df_c, x=column, color='#5E4489')
    ax.set_title(f'Distribution of "{column}"')

We observe that our features are highly skewed to the lower values.

## **Answering few questions**
What is the MEAN spending on ads by age group?

What is the MEAN spending on ads by gender? 

In [None]:
def _plot_mean_spent(ax, summary, column, color):
    """Draw one bar per `column` group showing its mean SPENT, with value labels.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    summary : DataFrame with one row per group, holding `column` and 'SPENT'.
    column : name of the grouping column plotted on the x axis.
    color : bar colour.
    """
    sns.barplot(ax=ax, data=summary, x=column, y='SPENT', color=color)
    ax.set_title(f'Mean spending on ads by {column}')
    ax.set_yticks([])
    ax.set_ylabel('')
    for bar in ax.patches:
        # Label each bar with its own height. The grouping columns are
        # categorical, so seaborn orders the bars by category while `summary`
        # is sorted by SPENT; indexing the rows by bar position could put a
        # value on the wrong bar.
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
                round(height, 2), ha="center")


# Mean SPENT per group, sorted ascending.
avg_spending_age = df_c.groupby('AGE')['SPENT'].mean().sort_values().reset_index()
avg_spending_gender = df_c.groupby('GENDER')['SPENT'].mean().sort_values().reset_index()
avg_spending_xyz = df_c.groupby('XYZ_CAMPAIGN_ID')['SPENT'].mean().sort_values().reset_index()
avg_spengind_xyz = avg_spending_xyz  # original (misspelled) name kept as an alias

# constrained_layout handles the spacing; it is incompatible with
# plt.subplots_adjust, so only the layout engine is used.
fig, axes = plt.subplots(1, 3, figsize=[15, 6], constrained_layout=True)

_plot_mean_spent(axes[0], avg_spending_age, 'AGE', '#A194B6')
_plot_mean_spent(axes[1], avg_spending_gender, 'GENDER', '#5E4489')
_plot_mean_spent(axes[2], avg_spending_xyz, 'XYZ_CAMPAIGN_ID', '#A194B6')

What is the MEAN spending on ads by topic?

In [None]:
# Hide spines (black border of the plot). This changes matplotlib's global
# rcParams, so it also applies to figures created after this cell.
for side in ('left', 'right', 'top', 'bottom'):
    plt.rcParams[f'axes.spines.{side}'] = False

# Mean SPENT per INTEREST code.
avg_spending_topic = df_c.groupby('INTEREST')['SPENT'].mean().reset_index()

fig, ax = plt.subplots(1, 1, figsize=[18, 6], constrained_layout=True)

sns.barplot(ax=ax, data=avg_spending_topic, x='INTEREST', y='SPENT', color='#A194B6')
ax.set_title('Mean spending on ads by INTEREST topic')
ax.set_yticks([])
ax.set_ylabel('')
for bar in ax.patches:
    # Label every bar with its own height. The previous version indexed the
    # summary frame with a counter that was never incremented, so all bars
    # showed the first group's value.
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height + 1,
            round(height, 2), ha="center")

## **Clustering**

In [None]:
# Quick reminder of the working frame's columns before building the clustering input.
df_c.head()

In [None]:
# Mean IMPRESSIONS, CLICKS and SPENT per FB_CAMPAIGN_ID: one row per campaign.
total_conversion_df = df_c.groupby('FB_CAMPAIGN_ID')[['IMPRESSIONS', 'CLICKS', 'SPENT']].mean()

# Independent copy: a 'Cluster' column is added to `data` later, and wrapping
# the frame in pd.DataFrame(...) does not guarantee a separate object, so that
# column could leak into `total_conversion_df`.
data = total_conversion_df.copy()
data.head()

## **Preprocessing**

In [None]:
# Array of the aggregated per-campaign values (IMPRESSIONS, CLICKS, SPENT) used as clustering input.
features = total_conversion_df.values

In [None]:
# Standardization: learn the scaling parameters from `features`, then apply
# them to the same array.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)
scaled_features

## **Choosing the number of K**

ELBOW METHOD

In [None]:
# Elbow method: fit KMeans for each candidate k and record the inertia.
ks = range(2, 6)
inertias = []

for k in ks:
    # Fixed random_state makes the curve reproducible across runs; n_init is
    # pinned because its default differs between scikit-learn versions.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(scaled_features)
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.figure(figsize=(12,6))
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

SILHOUETTE SCORE

In [None]:
# Silhouette score: average silhouette of the KMeans labelling for each k.
ks = range(2,6)
results = []

for k in ks:
    # Fixed random_state makes the scores reproducible across runs; n_init is
    # pinned because its default differs between scikit-learn versions.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_l = model.fit_predict(scaled_features)
    silh_avg = silhouette_score(scaled_features, cluster_l)
    results.append([k, silh_avg])

# One row per k, shown as a single-column heatmap.
result = pd.DataFrame(results, columns = ['n_clusters', 'silhouette_score'])
pivot_km = pd.pivot_table(result, index='n_clusters', values='silhouette_score')

plt.figure(figsize=(5,8))
sns.heatmap(pivot_km, annot=True, linewidths=.5, fmt='.3f', cmap = 'rocket_r')
plt.show()

According to our results, 2 seems to be the number of clusters with highest silhouette score for our features.

Let's consider using 3 clusters.

In [None]:
# Final model with 3 clusters. random_state is fixed so the cluster numbering
# (and therefore the colours in the plots below) is the same on every run;
# n_init is pinned because its default differs between scikit-learn versions.
MODEL = KMeans(n_clusters=3, n_init=10, random_state=42)
MODEL.fit(scaled_features)

In [None]:
# Attach the cluster label predicted for each row of the scaled features
# to the unscaled per-campaign frame.
cluster_labels = MODEL.predict(scaled_features)
data['Cluster'] = cluster_labels
data.head()

In [None]:
# IMPRESSIONS vs CLICKS, one colour per cluster.
fig, ax = plt.subplots(figsize=(15, 8))
cluster_palette = ['#FF0000', '#00FF00', '#000080']
sns.scatterplot(data=data, x='IMPRESSIONS', y='CLICKS', hue='Cluster',
                palette=cluster_palette, ax=ax)
plt.show()

In [None]:
# IMPRESSIONS vs SPENT, one colour per cluster.
fig, ax = plt.subplots(figsize=(15, 8))
cluster_palette = ['#FF0000', '#00FF00', '#000080']
sns.scatterplot(data=data, x='IMPRESSIONS', y='SPENT', hue='Cluster',
                palette=cluster_palette, ax=ax)
plt.show()

In [None]:
# SPENT vs CLICKS, one colour per cluster.
fig, ax = plt.subplots(figsize=(15, 8))
cluster_palette = ['#FF0000', '#00FF00', '#000080']
sns.scatterplot(data=data, x='SPENT', y='CLICKS', hue='Cluster',
                palette=cluster_palette, ax=ax)
plt.show()

In [None]:
# Interactive 3D view of the three clustering features, coloured by cluster label.
cluster_color_scale = ['#FF0000', '#00FF00', '#000080']
scatter_3d_fig = px.scatter_3d(
    data_frame=data,
    x='IMPRESSIONS',
    y='CLICKS',
    z='SPENT',
    color='Cluster',
    color_continuous_scale=cluster_color_scale,
)
scatter_3d_fig