<a href="https://colab.research.google.com/github/samarthkhurana2000/User-Engagement-Analysis-Using-Python/blob/main/user_engagement_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# import the libraries that will be used for data analysis
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [7]:
# Load the bounce-rate export into a DataFrame
csv_path = '/content/bounce-rate.csv'
df = pd.read_csv(csv_path)


In [8]:
# Preview the first five rows to confirm the file loaded as expected
df.head(5)

Unnamed: 0,Client ID,Sessions,Avg. Session Duration,Bounce Rate
0,577847600.0,367,00:01:35,87.19%
1,1583822000.0,260,00:01:04,29.62%
2,1030699000.0,237,00:00:02,99.16%
3,1025030000.0,226,00:02:22,25.66%
4,1469968000.0,216,00:01:23,46.76%


In [9]:
# Inspect the dataset's structure, mainly the data type of each column.
# "Avg. Session Duration" and "Bounce Rate" carry numeric information, but the
# output of this call reports both of them with dtype object.
# Both columns are therefore converted to float in a later cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Client ID              999 non-null    float64
 1   Sessions               999 non-null    int64  
 2   Avg. Session Duration  999 non-null    object 
 3   Bounce Rate            999 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 31.3+ KB


In [10]:
# Count the missing values in every column
df.isna().sum()

Client ID                0
Sessions                 0
Avg. Session Duration    0
Bounce Rate              0
dtype: int64

In [11]:
# Show the column labels of the DataFrame
df.columns

Index(['Client ID', 'Sessions', 'Avg. Session Duration', 'Bounce Rate'], dtype='object')

In [12]:
# Convert the two text columns to floats.
#
# 'Avg. Session Duration' arrives as 'HH:MM:SS' text. The whole string is parsed:
# slicing off the first character would silently corrupt any duration whose hour
# has two significant digits (e.g. '10:00:05' would become '0:00:05').
# The dtype guards make the cell safe to re-run: once a column is numeric it is
# left untouched instead of failing on the `.str` accessor.
duration_column = 'Avg. Session Duration'
if not pd.api.types.is_numeric_dtype(df[duration_column]):
    # text -> timedelta -> number of minutes (float)
    df[duration_column] = pd.to_timedelta(df[duration_column]) / pd.Timedelta(minutes=1)

bounce_column = 'Bounce Rate'
if not pd.api.types.is_numeric_dtype(df[bounce_column]):
    # strip the trailing percent sign, then convert to float (values stay in percent)
    df[bounce_column] = df[bounce_column].str.rstrip('%').astype('float')


In [13]:
# Confirm the conversion: both columns should now show plain numbers
df.head(5)


Unnamed: 0,Client ID,Sessions,Avg. Session Duration,Bounce Rate
0,577847600.0,367,1.583333,87.19
1,1583822000.0,260,1.066667,29.62
2,1030699000.0,237,0.033333,99.16
3,1025030000.0,226,2.366667,25.66
4,1469968000.0,216,1.383333,46.76


In [14]:
# 'Client ID' is not used in the analysis, so drop it.
# errors='ignore' keeps the cell re-runnable: an in-place drop raises KeyError
# the second time because the column is already gone.
df = df.drop(columns='Client ID', errors='ignore')



In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Sessions,Avg. Session Duration,Bounce Rate
count,999.0,999.0,999.0
mean,32.259259,3.63652,65.307978
std,24.658588,4.040562,22.99727
min,17.0,0.0,4.88
25%,21.0,0.891667,47.37
50%,25.0,2.466667,66.67
75%,35.0,4.816667,85.19
max,367.0,30.666667,100.0


In [16]:
# Pairwise correlation between the numeric columns.
# Later cells add non-numeric columns ('bounce_rate_class', 'retention_segment') to df;
# selecting the numeric columns explicitly keeps this cell working when it is re-run
# after them, where a plain df.corr() can raise on the non-numeric data.
correlation_ = df.select_dtypes(include='number').corr()

In [17]:
# Render the correlation matrix as a heatmap
axis_labels = {'x': 'Features', 'y': 'Features', 'color': 'Correlation'}
correlation_figure = px.imshow(correlation_, labels=axis_labels)
# Last expression of the cell: update_layout returns the figure, so it is displayed
correlation_figure.update_layout(title='Correlation')


In [18]:
# Thresholds (in percent) separating the Low / Medium / High bounce-rate classes
high_bounce_rate = 70
low_bounce_rate = 30

# Assign every client to a class based on its bounce rate.
# Bins are left-closed (right=False): Low = [0, 30), Medium = [30, 70), High = [70, inf).
# The last edge is infinity rather than 100 because a left-closed bin that ends at 100
# excludes 100 itself, which would leave clients with a 100% bounce rate unclassified (NaN)
# and missing from every count and average computed per class.
df['bounce_rate_class'] = pd.cut(
    df['Bounce Rate'],
    bins=[0, low_bounce_rate, high_bounce_rate, float('inf')],
    labels=['Low', 'Medium', 'High'],
    right=False
)

# Count the number of clients in each class, ordered Low -> Medium -> High
class_counts = df['bounce_rate_class'].value_counts().sort_index()

# Visualize the segments. x and y are passed explicitly so the axis labels do not
# depend on how pandas names the index/values of the value_counts result.
class_figure = px.bar(
    x=class_counts.index.astype(str),
    y=class_counts.values,
    labels={'x': 'Bounce Rate Segment', 'y': 'Number of Clients'},
    title='Class of Clients based on Bounce Rates'
)
class_figure.show()


In [19]:
# Average session duration (in minutes) for each bounce-rate class.
# observed=False is stated explicitly: the grouping key is categorical, and relying on
# the default is deprecated in recent pandas. Keeping unobserved categories means all
# three classes always appear on the chart, even if one of them has no clients.
class_average_duration = (
    df.groupby('bounce_rate_class', observed=False)['Avg. Session Duration'].mean()
)

# Bar chart comparing user engagement across the classes; each bar is annotated
# with its value rounded to two decimals
engagement_figure = go.Figure(data=go.Bar(
    x=class_average_duration.index,
    y=class_average_duration,
    text=class_average_duration.round(2),
    textposition='auto'
))

# Customize the layout of the chart
engagement_figure.update_layout(
    title='Comparison of User Engagement by Bounce Rate Class',
    xaxis=dict(title='Bounce Rate Class'),
    yaxis=dict(title='Average Session Duration (minutes)')
)

# Display the chart
engagement_figure.show()


In [20]:
# Total session duration (minutes) for each client = number of sessions x average duration
df['Total_session_duration'] = df['Sessions'] * df['Avg. Session Duration']

# Sort the data frame by total session duration in descending order
df_sorted_total_session = df.sort_values('Total_session_duration', ascending=False)

# Show the ten most engaged clients. A bare expression is only rendered when it is the
# last statement of a cell, so display() (provided by the notebook kernel) is used here;
# otherwise this table would be silently discarded.
display(df_sorted_total_session.head(10))

# Scatter plot to analyze the relationship between bounce rate and average session duration
scatter_figure = px.scatter(
    df,
    x='Bounce Rate',
    y='Avg. Session Duration',
    title='Relationship between bounce rate and average session duration'
)
scatter_figure.update_layout(
    xaxis=dict(title='bounce rate'),
    yaxis=dict(title='average session duration')
)
scatter_figure.show()

In [21]:

# Define the retention segment on the basis of the sessions
def get_retention_segment(row, threshold=32):
    """Classify a client as a frequent or occasional user by session count.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a 'Sessions' entry.
        threshold: Minimum number of sessions for a client to count as frequent.
            Defaults to 32, the mean number of sessions in this dataset rounded down.

    Returns:
        'Frequent Users' when row['Sessions'] >= threshold, otherwise 'Occasional Users'.
    """
    return 'Frequent Users' if row['Sessions'] >= threshold else 'Occasional Users'
# Label every client with its retention segment (applied row by row)
df['retention_segment'] = df.apply(get_retention_segment, axis=1)

# Mean bounce rate per retention segment, flattened into a two-column frame for plotting
segment_mean_bounce = df.groupby('retention_segment')['Bounce Rate'].mean()
avg_segment_bounce_rates = segment_mean_bounce.reset_index()

# Bar chart of the average bounce rate for each retention segment
bar_labels = {'retention_segment': 'Retention Segment', 'Bounce Rate': 'Average Bounce Rate'}
bar_figure = px.bar(
    avg_segment_bounce_rates,
    x='retention_segment',
    y='Bounce Rate',
    title='Average Bounce Rate by Retention Segment',
    labels=bar_labels
)
bar_figure.show()

In [22]:
# Number of users in each retention segment
retention_segment_counts = df['retention_segment'].value_counts()

# Pie chart showing the share of frequent versus occasional users
segment_names = retention_segment_counts.index
segment_sizes = retention_segment_counts.values
pie_chart = px.pie(
    retention_segment_counts,
    values=segment_sizes,
    names=segment_names,
    color=segment_names,
    title='User Retention Rate'
)

# Render the pie chart
pie_chart.show()