In [2]:
import pandas as pd

In [15]:
# Estimate first-order transition probabilities between pages from a CSV of
# (start, end) pairs, then report the initial-state distribution and the
# per-page bounce rates.
#
# The two columns are relabelled 's' (start) and 'e' (end).
# NOTE(review): read_csv treats the first line of the file as a header by
# default, so that line is not counted as a transition. If site_data.csv has
# no header row, pass header=None here -- confirm against the file.
csv_df = pd.read_csv('site_data.csv')
csv_df.columns = ['s', 'e']

# Number of observations of every (start, end) pair.
transitions_df = csv_df.groupby(['s', 'e']).size().reset_index(name='count')

# Convert counts to conditional probabilities P(e | s): each count is divided
# by the total count of its start page. transform('sum') broadcasts the group
# total back onto the original rows, so the division is index-aligned and
# needs no per-group Python lambda.
transitions_df['probability'] = (
    transitions_df['count'] / transitions_df.groupby('s')['count'].transform('sum')
)

# Initial state distribution: rows whose start is the sentinel value -1.
initial_state_dist = transitions_df.loc[transitions_df['s'] == -1, ['e', 'probability']]

# Bounce rate per page (assuming 'B' is the bounce state): probability of
# moving from each start page to 'B'.
bounce_rates = transitions_df.loc[transitions_df['e'] == 'B', ['s', 'probability']]

print("Initial state distribution:")
print(initial_state_dist)
print("\nBounce Rates for Each Page:")
print(bounce_rates)
print("\nPage with the Highest Bounce Rate:")

if bounce_rates.empty:
    # idxmax() raises an opaque ValueError on an empty Series; report the
    # situation explicitly instead.
    max_bounce_page = None
    print("No transitions into 'B' were found.")
else:
    max_bounce_idx = bounce_rates['probability'].idxmax()

    # Row kept as a Series for any later cell that indexes it by 's' /
    # 'probability'. Selecting a whole row forces a single dtype, so a numeric
    # 's' is upcast to float in this Series (e.g. 9 becomes 9.0).
    max_bounce_page = bounce_rates.loc[max_bounce_idx]

    # Read the scalars straight from their columns so the page id keeps its
    # original dtype when printed.
    print("Page:", bounce_rates.at[max_bounce_idx, 's'])
    print("Bounce rate:", bounce_rates.at[max_bounce_idx, 'probability'])


Initial state distribution:
   e  probability
0  0     0.102992
1  1     0.103495
2  2     0.095085
3  3     0.095135
4  4     0.102438
5  5     0.097804
6  6     0.098006
7  7     0.099718
8  8     0.101481
9  9     0.103848

Bounce Rates for Each Page:
     s  probability
20   0     0.127967
32   1     0.125940
44   2     0.126496
56   3     0.127434
68   4     0.125576
80   5     0.123696
92   6     0.120815
104  7     0.123717
116  8     0.125296
128  9     0.131762

Page with the Highest Bounce Rate:
Page: 9.0
Bounce rate: 0.13176232104396302
