# <font color='#ef233c'> Ads </font> Data Analysis

---

In [None]:
!pip install pyjanitor

In [None]:
import pandas as pd
import numpy as np
import janitor
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
import missingno as msno
import warnings
warnings.filterwarnings(action='ignore')
pio.renderers.default = 'kaggle'

filepath = '/kaggle/input/cvdcvd-vd/Social_Network_Ads.csv'


def get_ads_data(filepath=filepath):
    """
    Read the ads CSV and tidy it with the pyjanitor DataFrame methods.

    :param filepath: location of the CSV file; defaults to the
        module-level ``filepath``.
    :returns: DataFrame Pandas Object after ``clean_names``,
        ``remove_empty`` and renaming ``estimatedsalary`` to
        ``estimated_salary``.
    """
    raw = pd.read_csv(filepath)

    # apply the cleaning steps one at a time
    cleaned = raw.clean_names()
    cleaned = cleaned.remove_empty()
    cleaned = cleaned.rename_column('estimatedsalary', 'estimated_salary')

    return cleaned


# shared palette; the plots below pick entries by position
colors = [
    "#2b2d42",
    "#8d99ae",
    "#edf2f4",
    "#ef233c",
    "#d90429",
]

# plot template used as the default for every figure
pio.templates.default = 'plotly_white'

# load dataframe
ads_df = get_ads_data()

In [None]:
# view dataframe: preview the first rows of the cleaned data
ads_df.head()

In [None]:
def get_age_groups(dataframe=None, bins=6):
    """
    Add an ``age_group`` column with labels such as ``'18 to 25'``.

    The ``age`` column is cut into ``bins`` equal-width intervals with
    ``pd.cut``; each interval is labelled with its rounded lower and
    upper edge.

    :param dataframe: DataFrame with an ``age`` column. Defaults to the
        module-level ``ads_df`` when omitted.
    :param bins: number of equal-width bins passed to ``pd.cut``.
    :returns: a new DataFrame: the input columns followed by a
        string-typed ``age_group`` column. The input is not modified.
        Rows with a missing age get a missing ``age_group``.
    """
    if dataframe is None:
        dataframe = ads_df

    # pandas cut; only the binned values are needed, so the bin edges
    # are not requested
    binned = pd.cut(dataframe['age'], bins=bins)

    # one readable label per interval, edges rounded to whole years
    labels = np.array(
        [
            f'{int(np.round(interval.left))} to '
            f'{int(np.round(interval.right))}'
            for interval in binned.cat.categories
        ],
        dtype=object,
    )

    # category code -1 marks a missing age; keep it missing rather than
    # letting the negative index pick the last label
    codes = binned.cat.codes.to_numpy()
    values = np.where(codes >= 0, labels[codes], None)
    age_group = pd.Series(
        values, index=dataframe.index, name='age_group', dtype='string')

    # drop a previous 'age_group' so re-running does not duplicate it
    base = dataframe.drop(columns='age_group', errors='ignore')
    return pd.concat([base, age_group], axis=1)

In [None]:
# use function to get age groups
# (rebinds ads_df to the returned frame, which carries 'age_group')
ads_df = get_age_groups()

In [None]:
# check missing data: missingno bar chart over all columns,
# drawn in the palette's fourth colour
msno.bar(ads_df, figsize=(9,6), color=colors[3]);

In [None]:
# check for duplicated data
# (is_unique is True only when no user_id value repeats)
print("Are the observations in the data unique?", ads_df.user_id.is_unique)

# summary statistics for feature age and estimated salary
# (.T transposes describe() so each feature becomes a row)
print("Summary Stat:")
ads_df[['age', 'estimated_salary']].describe().T

# <font color='#ef233c'>Age</font> Group

---

**Age Group Binning**
A label such as `18 to 25` stands for the half-open interval (18, 25], i.e. 18 < age ≤ 25.

In [None]:
def plot_ages(dataframe):
    """
    Draw a bar chart of the number of rows per age group.

    :param dataframe: DataFrame with an ``age_group`` column.
    :returns: the result of ``fig.show()``; the figure itself is
        rendered as a side effect.
    """
    # Name both columns explicitly: what a bare ``reset_index()`` calls
    # them after ``value_counts()`` differs between pandas versions.
    plot_ages_data = dataframe.age_group\
        .value_counts()\
        .rename_axis('index')\
        .reset_index(name='age_group')\
        .sort_naturally('index')

    title = '<b>Ad-Clicks</b><br> Age Group'
    # set color: light bars, third bar highlighted; sized from the data
    # so the list always matches the number of bars
    age_color = [colors[2]] * len(plot_ages_data)
    if len(age_color) > 2:
        age_color[2] = colors[4]

    # create figure
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=plot_ages_data['index'],
        y=plot_ages_data['age_group'],
        text=plot_ages_data['age_group']))

    # tweak layout
    fig.update_traces(marker_color=age_color,
        marker=dict(line=dict(color='black')))
    fig.update_yaxes(visible=False)
    fig.update_xaxes(title='<b>Age Group</b>')
    fig.update_layout(width=780, margin={
        'pad': 10}, title=title)
    show_figure = fig.show()

    return show_figure

plot_ages(ads_df)

# <font color='#ef233c'>Gender</font> Clicks

---

In [None]:
def plot_gender():
    """
    Draw a bar chart of the number of rows per gender in ``ads_df``.

    A dotted horizontal line is drawn at the smallest gender count so
    the gap between the bars is easy to read.

    :returns: the result of ``fig.show()``; the figure itself is
        rendered as a side effect.
    """
    gender_counts = ads_df.gender.value_counts()
    title = '<b>Ad Clicks</b><br> by Gender'

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=gender_counts.index,
        y=gender_counts.values,
        text=gender_counts.values,
        textposition='outside'))

    fig_color = [colors[3], colors[2]]
    # tweak layout
    fig.update_traces(marker_color=fig_color,
        marker=dict(line=dict(color='black')))
    fig.update_yaxes(visible=False)
    fig.update_xaxes(title='<b>Gender</b>')
    fig.update_layout(width=780, margin={
        'pad': 10}, title=title)

    # Reference line derived from the data instead of a literal.
    # NOTE(review): the previous hard-coded 196 is presumed to have been
    # the smaller group's count - confirm against the dataset.
    fig.add_hline(y=int(gender_counts.min()), line_dash='dot')
    show_figure = fig.show()

    return show_figure

plot_gender()

# <font color='#ef233c'>Salary</font> / Gender

---

In [None]:
def plot_salary_gender():
    """
    Draw one horizontal box plot of ``estimated_salary`` per gender.

    :returns: the result of ``fig.show()`` for the Plotly box plot of
        Salary and Gender.
    """
    # (gender value, box colour) in the order the traces are added
    gender_styles = [
        ('Male', colors[1]),
        ('Female', colors[4]),
    ]

    fig = go.Figure()
    for gender_name, box_color in gender_styles:
        salaries = ads_df.loc[
            ads_df.gender == gender_name, 'estimated_salary']
        fig.add_trace(go.Box(
            x=salaries,
            name=gender_name,
            marker_color=box_color))

    # layout: hide the y axis, label the x axis, show every point
    fig.update_yaxes(visible=False)
    fig.update_xaxes(title='<b>Annual Income ($)</b>')
    fig.update_layout(
        width=780,
        margin={'pad': 10},
        title='<b>Salary | Gender</b>')
    fig.update_traces(boxpoints='all')

    return fig.show()

plot_salary_gender()

# <font color='#ef233c'>Salary</font> / Age Group

---

In [None]:
def plot_salary_age(dataframe):
    """
    Draw a horizontal box plot of ``estimated_salary`` per age group.

    :param dataframe: DataFrame with ``age_group`` and
        ``estimated_salary`` columns.
    :returns: the result of ``show()`` for the Plotly box plot of
        Salary and Age.
    """
    # natural sort so the age groups appear in numeric order
    ordered = dataframe.sort_naturally('age_group')

    fig = px.box(ordered, x='estimated_salary', y='age_group')

    # layout tweaks, one call per line
    fig.update_xaxes(title='<b>Annual Income($)</b>')
    fig.update_yaxes(title='<b>Age Group</b>')
    fig.update_layout(title='<b>Salary | Age</b>', width=780)
    fig.update_traces(marker_color=colors[4])

    return fig.show()

plot_salary_age(ads_df)

# <font color='#ef233c'>Class Label</font> | Purchased

---


In [None]:
# drop user key
# (the column is removed before features and class label are separated)
ads_df = ads_df.drop(columns='user_id')

# separate features
def get_features():
    """
    Split ``ads_df`` into predictor columns and the class label.

    returns:
        feature(DataFrame): the columns from 'gender' up to, but not
            including, 'purchased', followed by 'age_group'
        class_feature(Series): the 'purchased' column
    """
    # label-based slice includes 'purchased', so remove it afterwards
    predictors = ads_df.loc[:, 'gender':'purchased']
    predictors = predictors.drop(columns='purchased')

    feature = pd.concat([predictors, ads_df['age_group']], axis=1)
    return feature, ads_df['purchased']

feature, class_feature = get_features()

# get class feature counts
# Count each class explicitly with the same rule as the filters below
# (1 means purchased, anything else did not). value_counts() orders by
# frequency, so attaching labels by position would swap them whenever
# purchasers outnumber non-purchasers.
is_purchase = class_feature == 1
class_counts = pd.Series({
    'Did not Purchase': int((~is_purchase).sum()),
    'Purchased': int(is_purchase.sum()),
})

# filter class
purchased = ads_df[ads_df.purchased == 1]
not_purchased = ads_df[ads_df.purchased != 1]

# <font color='#ef233c'>Ad-Click</font> | Purchased : Conversion Rate

---

In [None]:
def plot_big_numbers():
    """
    Show the headline counts and the ad-click conversion rate as text.

    Reads the module-level ``class_counts`` (indexed by
    'Did not Purchase' and 'Purchased') and places the numbers on an
    otherwise empty figure as annotations.

    :returns: the result of ``fig.show()``; the figure itself is
        rendered as a side effect.
    """
    ad_clicks = class_counts.sum()
    did_not_purchase = class_counts['Did not Purchase']
    purchased = class_counts.Purchased
    # Conversion rate = purchases out of ALL ad clicks. Dividing by the
    # non-purchasers would report the odds, not the rate.
    ratio_user_purchase = round(
        purchased / ad_clicks * 100, 2) if ad_clicks else 0.0

    title='<b>What is the Ad-click Conversion Rate?</b>'
    fig = go.Figure()
    fig.add_annotation(
            text='Ad-Clicks',
            x=0,y=1,
            showarrow=False)\
        .add_annotation(
            text='Did not Purchase',
            y=1,
            showarrow=False)\
        .add_annotation(
            text='Purchased',
            x=5, y=1,
            showarrow=False)\
        .add_annotation(
            text=f'<b>{ad_clicks}</b>',
            x=0,
            font=dict(size=45, color=colors[1]),
            showarrow=False)\
        .add_annotation(
            text=f'{did_not_purchase}',
            font=dict(size=45, color=colors[1]),
            showarrow=False)\
        .add_annotation(
            text=f'<b>{purchased}</b>',
            x=5,
            font=dict(size=45, color=colors[1]),
            showarrow=False)

    # conversion number
    fig.add_annotation(
        text=f'<b>{ratio_user_purchase}%</b>',
        font=dict(size=50, color=colors[4]), y=3)
    fig.add_annotation(
        text='<b>Conversion Rate</b>',
        font=dict(size=14, color=colors[4]),
        y=3,
        x=2.2,
        showarrow=False)

    # the axes carry no meaning here; they only position the text
    fig.update_yaxes(visible=False)
    fig.update_xaxes(visible=False)
    fig.update_layout(title=title, width=780)
    show_figure = fig.show()

    return show_figure

plot_big_numbers()

# <font color='#ef233c'>Ad-Click</font> | Purchased : Purchasing Power

---

In [None]:
def plot_class_age():
    """
    Draw per-age-group bars for purchasers and non-purchasers.

    Reads the module-level ``ads_df``, ``purchased`` and
    ``not_purchased`` frames.

    :returns: the result of ``fig.show()`` for the Plotly Bar Plot of
        Class Label and Age.
    """
    title = "<b>Which Age Group</b>: Purchase our Product?"
    # create base index for xaxes: every age group in the data,
    # naturally sorted. Columns are named explicitly because the names
    # a bare reset_index() produces differ between pandas versions.
    base_index = ads_df.age_group.value_counts()\
        .rename_axis('index')\
        .reset_index(name='count')\
        .sort_naturally('index')\
        .set_index('index').index

    # Align both count series to the shared x axis. A group that is
    # absent from one subset gets 0 instead of silently shifting every
    # later bar onto the wrong age group.
    # no purchase count
    not_p_counts = not_purchased.age_group.value_counts()\
        .reindex(base_index, fill_value=0)

    # purchase count
    p_counts = purchased.age_group.value_counts()\
        .reindex(base_index, fill_value=0)

    # create figure
    fig = go.Figure()
    # add traces
    fig.add_trace(go.Bar(
        x=base_index,
        y=not_p_counts,
        name='Did not Purchased',
        marker_color=colors[2]))

    fig.add_trace(go.Bar(
        x=base_index,
        y=p_counts,
        name='Purchased',
        marker_color=colors[3]))

    # tweak layout
    fig.update_traces(
        marker=dict(line=dict(color='black')))
    fig.update_yaxes(visible=False)
    fig.update_xaxes(title='<b>Age Group</b>')
    fig.update_layout(width=780, margin={
        'pad': 10}, title=title)

    show_figure = fig.show()
    return show_figure

plot_class_age()

In [None]:
def plot_purchase_power():
    """
    Plot the median salary of purchasers per age group as bubbles.

    Bubble size and colour both follow the median salary.

    :returns: the result of ``fig.show()`` for the Plotly Scatter Plot
        of Purchase Power.
    """
    title = """ <b>Purchasing Power</b><br> of Individuals who purchased our products"""

    # median salary of purchasers, one value per age group
    median_salary = purchased.groupby('age_group')['estimated_salary'].median()
    salaries = median_salary.values

    # marker styling kept in one place
    bubble_style = dict(
        size=salaries/2000,
        line_width=0.8,
        color=salaries,
        line_color='black',
        opacity=1,
        colorscale=np.flip(colors))

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=median_salary.index,
        y=salaries,
        mode='markers',
        marker=bubble_style))

    fig.update_xaxes(title='<b>Age Group</b>')
    fig.update_yaxes(title='<b>Annual Income($)</b>')
    fig.update_layout(width=780, margin={'pad': 10}, title=title)

    return fig.show()

plot_purchase_power()

# <font color='#ef233c'>Ad-Click</font> | Purchased : Statistical Analysis

In [None]:
def plot_corr_mat():
    """
    Draw an annotated heatmap of the correlations between the numeric
    columns of ``ads_df``.

    :returns: the result of ``fig.show()`` for the Plotly annotated
        heatmap, correlation.
    """
    title = '<b>Feature Correlation</b>'

    # create correlation table from the numeric columns only; string
    # columns such as gender cannot be correlated
    corr_table = ads_df.select_dtypes('number').corr()
    array_corr = corr_table.to_numpy()

    # custom cmap
    cmaps = ["590d22",
        "800f2f","a4133c","c9184a",
        "ff4d6d","ff758f","ff8fa3",
        "ffb3c1","ffccd5","fff0f3"]
    cmaps = ['#' + hex_colors for hex_colors in cmaps]

    # get text names from the correlation table itself so the axis
    # labels always match the rows and columns of the matrix
    text_names = list(corr_table.columns)

    # create annotated_plot
    fig = ff.create_annotated_heatmap(
        z=np.round(array_corr, 2),
        x=text_names,
        y=text_names,
        colorscale=cmaps)

    # tweak layout
    fig.update_layout(width=780, margin={
        'pad': 10}, title=title)

    show_figure = fig.show()
    return show_figure

plot_corr_mat()

In [None]:
def scatter_class_label(dep_var, plt_title) -> None:
    """
    Scatter one ``ads_df`` column against the ``purchased`` label.

    :param dep_var: name of the ``ads_df`` column drawn on the x axis.
    :param plt_title: figure title (may contain HTML markup).
    :returns: the result of ``fig.show()``; the Scatter Plotly Figure
        is rendered as a side effect rather than returned.
    """
    fig = px.scatter(ads_df, x=dep_var, y='purchased')
    fig.update_layout(title=plt_title, width=780)
    fig.update_traces(marker_color=colors[4])

    return fig.show()

In [None]:
# age on the x axis against the purchased label
scatter_class_label('age', '<b>Age</b> and Purchased')

In [None]:
# estimated salary on the x axis; the title names the plotted column
scatter_class_label('estimated_salary', '<b>Estimated Salary</b> and Purchased')

# <font color='#ef233c'>Model</font> Building | Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# keep the model inputs and the label by name rather than by position,
# so the selection does not depend on the current column order
ads_df = ads_df[['gender', 'age', 'estimated_salary', 'purchased']]

## Encoding

In [None]:
# one-hot encode the categorical columns with pandas get_dummies
ads_df = pd.get_dummies(ads_df)

# final column order: inputs first, target last
column_order = ['gender_Female', 'gender_Male', 'age', 'estimated_salary', 'purchased']
target_column = 'purchased'

# prefix the independent variables with 'x_' and the dependent
# variable (the target) with 'y_'
prefixed_names = {
    name: ('y_' if name == target_column else 'x_') + name
    for name in column_order
}
ads_df = ads_df[column_order].rename(columns=prefixed_names)

## Train-Test Split

In [None]:
# X: every column up to and including 'x_estimated_salary'
# (label-based slices include the end label)
X = ads_df.loc[:, :'x_estimated_salary']
# y: the target column
y = ads_df['y_purchased']

In [None]:
# target first, then the inputs, for a side-by-side look
preview = pd.concat([y, X], axis=1)
# seeded 15-row sample with the target column shaded by value
preview.sample(15, random_state=10).style.background_gradient(subset=['y_purchased'], cmap='Pastel1_r')

In [None]:
# Split our data: 80% for training, the rest for testing;
# random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=50)

In [None]:
# Report shapes rather than .size: .size counts every cell, so the
# feature frames (rows x columns) and the label series (rows only)
# would print numbers that cannot be compared.
print(f"""
    Train Test Split Output:
    {'-' *  50}
    Training Data: {X_train.shape} 
    Test Data: {X_test.shape}
    y_Training Data: {y_train.shape}
    y_Test Data: {y_test.shape}
    """)

## Standardized Data

In [None]:
from sklearn.preprocessing import StandardScaler

# fit the scaler on the training split only, then apply that same
# fitted transform to the test split
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

## Logistic Regression Model

In [None]:
# create logistic regression model
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred = lr.predict(X_test)
# accuracy on the data the model was fitted on
print(f'logistic model train_score: {lr.score(X_train, y_train)}')
# accuracy of the test-split predictions: the actual prediction score
print(f'logistic model prediction_score: {accuracy_score(y_test, pred)}')

In [None]:
def plot_confusion_matrix(y_test, pred) -> None:
    """
    Draw the confusion matrix of the predictions as an annotated heatmap.

    :param y_test: true class labels (0 or 1).
    :param pred: predicted class labels (0 or 1).
    :returns: the result of ``fig.show()``; the Plotly Confusion Matrix
        Heatmap is rendered as a side effect rather than returned.
    """
    title = '<b>Logistic Regression Model |</b><br>Confusion Matrix'

    # call confusion matrix function with the class order pinned:
    # row/column 0 is class 0, row/column 1 is class 1
    class_ids = [0, 1]
    cm = confusion_matrix(y_test, pred, labels=class_ids)

    # use custom cmaps
    cmaps = ["590d22",
        "800f2f","a4133c","c9184a",
        "ff4d6d","ff758f","ff8fa3",
        "ffb3c1","ffccd5","fff0f3"]
    cmaps = ['#' + hex_colors for hex_colors in cmaps]

    # axis labels in the same order as class_ids: class 0 (did not
    # purchase) is the negative class, class 1 the positive class
    axis_labels = ['Negative', 'Positive']

    # create figure; the reversed colorscale is passed as a plain list
    fig = ff.create_annotated_heatmap(
        cm, x=axis_labels,
        y=axis_labels,
        colorscale=cmaps[::-1])

    # tweak layout
    fig.update_layout(
        title=title,
        height=700,
        width=780,
        margin=dict(pad=10))
    fig.update_xaxes(title='Predicted')
    fig.update_yaxes(title='Actual')

    show_figure = fig.show()
    return show_figure

In [None]:
    # render the confusion matrix for the test-split predictions
    plot_confusion_matrix(y_test, pred)