# Conversion Rate Challenge 🎯

In [None]:
# Data manipulation
import pandas as pd

# Data viz
import plotly.express as px
import plotly.figure_factory as ff

In [3]:
# Read the raw (unprocessed) train and test splits into DataFrames.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/conversion_data_train_unproc.csv",
        "../data/conversion_data_test_unproc.csv",
    )
)

## Preprocessing

In [4]:
# Recode the 0/1 new_user flag as "No"/"Yes" labels in both splits.
# `replace` is used instead of `map` because it leaves values that are not
# keys of the mapping untouched: re-running this cell on already-recoded data
# keeps the labels, whereas `map` would turn every value into NaN.
NEW_USER_LABELS = {0: "No", 1: "Yes"}

data_train["new_user"] = data_train["new_user"].replace(NEW_USER_LABELS)
data_test["new_user"] = data_test["new_user"].replace(NEW_USER_LABELS)

In [5]:
# Write the preprocessed splits back to disk, omitting the index column.
for frame, out_path in (
    (data_train, "../data/conversion_data_train.csv"),
    (data_test, "../data/conversion_data_test.csv"),
):
    frame.to_csv(out_path, index=False)

## EDA

In [6]:
# Quick overview of the training set: first rows, summary statistics
# (numeric and non-numeric columns) and the column dtypes / non-null counts.
print("Data : ")
display(data_train.head())

print("Basic statistics : ")
display(data_train.describe(include="all"))
print()

print("Data infos : ")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display(): that only adds a stray "None" to the output.
data_train.info()

Data : 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,Yes,Direct,2,0
1,UK,21,Yes,Ads,3,0
2,Germany,20,No,Seo,14,1
3,US,23,Yes,Seo,3,0
4,US,28,Yes,Direct,3,0


Basic statistics : 


Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
count,284580,284580.0,284580,284580,284580.0,284580.0
unique,4,,2,3,,
top,US,,Yes,Seo,,
freq,160124,,195066,139477,,
mean,,30.564203,,,4.873252,0.032258
std,,8.266789,,,3.341995,0.176685
min,,17.0,,,1.0,0.0
25%,,24.0,,,2.0,0.0
50%,,30.0,,,4.0,0.0
75%,,36.0,,,7.0,0.0



Data infos : 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284580 entries, 0 to 284579
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   country              284580 non-null  object
 1   age                  284580 non-null  int64 
 2   new_user             284580 non-null  object
 3   source               284580 non-null  object
 4   total_pages_visited  284580 non-null  int64 
 5   converted            284580 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 13.0+ MB


None

Clean dataset with no missing values.

---

### Target variable

In [7]:
# Count converted / not-converted sessions, using readable labels.
df_converted = data_train["converted"].map({0:"No", 1:"Yes"}).value_counts().reset_index()
# Name the columns explicitly: the column names produced by
# value_counts().reset_index() differ between pandas versions
# ("index"/"converted" before 2.0, "converted"/"count" from 2.0 on).
df_converted.columns = ["converted", "count"]

# One bar per class, annotated with its count.
fig_1 = px.bar(
    df_converted,
    x="converted",
    y="count",
    text="count",
    color="converted",
    color_discrete_map={"No":px.colors.qualitative.Plotly[1], "Yes":px.colors.qualitative.G10[0]}
)

fig_1.update_layout(
    title="Target variable distribution",
    xaxis_title="Converted",
    yaxis_title="Count",
    showlegend=False  # the x axis already names each class
)

fig_1.show()

The target variable is heavily imbalanced:
- 275 400 not converted (96.8%)
- 9 180 converted (3.2%)

---

### Variables distribution

In [8]:
# The training set is large, so the plots below use a random sample of it.
# A fixed random_state makes the sample (and therefore every figure built from
# it) reproducible across kernel restarts; the size is capped at the number of
# available rows because sample() raises when asked for more rows than exist.
SAMPLE_SIZE = 10000
SAMPLE_RANDOM_STATE = 42

data_sample = data_train.sample(
    n=min(SAMPLE_SIZE, len(data_train)),
    random_state=SAMPLE_RANDOM_STATE,
)
# Use readable labels for the target so plotly treats it as a discrete category.
data_sample["converted"] = data_sample["converted"].map({0:"No",  1:"Yes"})

#### Numerical variables

In [9]:
# Every non-object column of the sample is treated as a numerical feature.
numerical_features = data_sample.select_dtypes(exclude="object").columns

# Draw one histogram per numerical feature.
for column in numerical_features:
    histogram_layout = dict(
        title=f"{column} distribution",
        xaxis_title=column,
        yaxis_title="Count",
    )

    fig = px.histogram(
        data_sample,
        x=column,
        color_discrete_sequence=px.colors.qualitative.G10,
    ).update_layout(**histogram_layout)

    fig.show()

#### Categorical variables

In [10]:
# Object-typed columns, target excluded, are treated as categorical features.
categorical_features = (
    data_sample
    .drop(columns="converted")
    .select_dtypes(include="object")
    .columns
)

# Draw one bar chart of category frequencies per categorical feature.
for column in categorical_features:
    level_counts = data_sample[column].value_counts()
    df_grouped = pd.DataFrame(
        {column: level_counts.index, "count": level_counts.to_numpy()}
    )

    bar_layout = dict(
        title=f"{column} distribution",
        xaxis_title=column,
        yaxis_title="Count",
    )

    fig = px.bar(
        df_grouped,
        x=column,
        y="count",
        text="count",
        color_discrete_sequence=px.colors.qualitative.G10,
    ).update_layout(**bar_layout)

    fig.show()

---

### Variables distribution according to the target variable

#### Numerical variables

In [11]:
# Compare the distribution of each numerical feature between converted and
# non-converted sessions with one box per target class.
numerical_features = data_sample.select_dtypes(exclude=["object"]).columns

for feature in numerical_features:
    fig = px.box(
        data_sample,
        y=feature,
        color="converted",
        color_discrete_map={"No":px.colors.qualitative.Plotly[1], "Yes":px.colors.qualitative.G10[0]}
    )

    # In a box plot the y axis carries the feature's values (not a count), and
    # the boxes are split along x by the target class, so label them that way.
    fig.update_layout(
        title=f"{feature} distribution",
        xaxis_title="Converted",
        yaxis_title=feature,
        legend_title_text="Converted")

    fig.show()

#### Categorical variables

In [12]:
# Object-typed columns, target excluded, are treated as categorical features.
categorical_features = (
    data_sample
    .drop(columns="converted")
    .select_dtypes(include="object")
    .columns
)

# Same colours for the two target classes in every chart of this cell.
conversion_colors = {
    "No": px.colors.qualitative.Plotly[1],
    "Yes": px.colors.qualitative.G10[0],
}

# For each categorical feature, show side-by-side bars of the number of
# sampled rows per (category, target class) pair.
for column in categorical_features:
    pair_sizes = data_sample.groupby([column, "converted"]).size()
    df_grouped = pair_sizes.reset_index(name="count")

    fig = px.bar(
        df_grouped,
        x=column,
        y="count",
        color="converted",
        text="count",
        barmode="group",
        color_discrete_map=conversion_colors,
    )
    fig.update_layout(
        title=f"{column} distribution",
        xaxis_title=column,
        yaxis_title="Count",
    )

    fig.show()

---

### Bivariate analysis

In [13]:
# Pairwise scatter plots of all columns of the sample, on a fixed-size canvas.
scatter_matrix_layout = dict(
    title="Bivariate analysis",
    showlegend=False,
    autosize=False,
    height=1200,
    width=1200,
)

fig_2 = px.scatter_matrix(data_sample)
fig_2.update_layout(**scatter_matrix_layout)
fig_2.show()

---

### Correlation matrix

In [14]:
# Pairwise correlations between the non-object columns of the full training
# set, rounded to two decimals for readable annotations.
non_object_columns = data_train.select_dtypes(exclude="object")
corr_matrix = non_object_columns.corr().round(2)

# Annotated heatmap: each cell shows its correlation value.
fig_3 = ff.create_annotated_heatmap(
    corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
)
fig_3.show()

The correlation matrix shows that `total_pages_visited` is strongly correlated with the target variable, so we will use it to build our baseline.