< Analyzing Club Activity and Performance >


Step 1: Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Step 2: Load the dataset

In [None]:
# Load the club events export and key every row by its 'Session ID',
# so later cells can refer to sessions through the index.
df = pd.read_csv('AppsClub_events_data.csv').set_index('Session ID')

Step 3: Handling missing values

In [None]:
# Report how many entries are missing in each column before any imputation
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Numeric columns: replace gaps with the column mean.
# Only these two numeric columns are imputed here; other columns are left untouched.
for numeric_column in ('Attendees Count', 'Rating'):
    column_mean = df[numeric_column].mean()
    df[numeric_column] = df[numeric_column].fillna(column_mean)

# Categorical column: replace gaps with the most frequent value
# (mode() may return several values on a tie; the first one is used).
most_frequent_mode = df['Online/Offline'].mode()[0]
df['Online/Offline'] = df['Online/Offline'].fillna(most_frequent_mode)

# Show the per-column missing counts again to confirm the result
print("\nMissing values after handling:", df.isna().sum(), sep="\n")

Step 4: Event Analysis

In [None]:
# Two groupings drive this section: one per event name, one per delivery mode
event_groups = df.groupby('Event Name')
online_offline_groups = df.groupby('Online/Offline')

# Per-event averages, highest first
avg_attendees_by_event = (
    event_groups['Attendees Count']
    .mean()
    .sort_values(ascending=False)
)
avg_rating_by_event = (
    event_groups['Rating']
    .mean()
    .sort_values(ascending=False)
)

# Per-mode (online vs. offline) averages, in group-key order
avg_attendees_by_mode = online_offline_groups['Attendees Count'].mean()
avg_rating_by_mode = online_offline_groups['Rating'].mean()

# Report each summary under its own heading
print("Average number of attendees by event:", avg_attendees_by_event, sep="\n")
print("\nAverage rating by event:", avg_rating_by_event, sep="\n")
print("\nAverage attendees by event mode (Online/Offline):", avg_attendees_by_mode, sep="\n")
print("\nAverage rating by event mode (Online/Offline):", avg_rating_by_mode, sep="\n")

Step 5: Identify the most and least popular events

In [None]:
# Rank events by their mean attendance, best-attended first
mean_attendance = df.groupby('Event Name')['Attendees Count'].mean()
events_by_attendance = mean_attendance.sort_values(ascending=False)

# The head of the ranking holds the five most attended events ...
top_5_events = events_by_attendance.head(5)
print("Top 5 most attended events:", top_5_events, sep="\n")

# ... and the tail holds the five least attended ones
bottom_5_events = events_by_attendance.tail(5)
print("\nBottom 5 least attended events:", bottom_5_events, sep="\n")

# Bar chart of the full ranking, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(14, 8))
events_by_attendance.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Events Ranked by Average Attendance')
ax.set_xlabel('Event Name')
ax.set_ylabel('Average Attendees Count')
# Slant the event names and right-align them so each label ends under its bar
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

Step 6: Analyze event duration vs. feedback score

In [None]:
# Relationship between session duration and feedback rating.
#
# 'Duration' is not imputed in the missing-value step, so rows lacking either
# value are dropped here: np.polyfit cannot fit data that contains NaN, while
# the scatter plot would silently skip those points anyway.
duration_rating = df[['Duration', 'Rating']].dropna()

# Create a scatter plot for duration vs. rating
plt.figure(figsize=(10, 6))
plt.scatter(duration_rating['Duration'], duration_rating['Rating'],
            alpha=0.7, c='blue', edgecolors='black')
plt.title('Relationship Between Event Duration and Feedback Score')
plt.xlabel('Duration (minutes)')
plt.ylabel('Rating')
plt.grid(True, linestyle='--', alpha=0.7)

# Calculate and plot the trend line.
# A straight-line fit needs at least two distinct x values; otherwise the
# least-squares problem is degenerate and the slope is meaningless.
if duration_rating['Duration'].nunique() >= 2:
    z = np.polyfit(duration_rating['Duration'], duration_rating['Rating'], 1)
    p = np.poly1d(z)
    # Two end points are enough to draw a straight line
    x_line = np.array([duration_rating['Duration'].min(),
                       duration_rating['Duration'].max()])
    # '+' in the intercept format prints the sign explicitly, so a negative
    # intercept renders as "x-0.1234" rather than "x+-0.1234"
    plt.plot(x_line, p(x_line), "r--", alpha=0.7,
             label=f'Trend line: y={z[0]:.4f}x{z[1]:+.4f}')
    plt.legend()
else:
    print("Not enough distinct duration values to fit a trend line.")

plt.tight_layout()
plt.show()

Step 7: Count events organized by each member

In [None]:
# Tally how many rows list each speaker, most frequent first.
# NOTE(review): this counts rows per speaker; if one event can span several
# rows, the figure is rows rather than distinct events - confirm against the data.
speaker_column = df['Speaker']
speaker_counts = speaker_column.value_counts()
print("Number of events organized by each member:", speaker_counts, sep="\n")

Step 8: Identify the most active member

In [None]:
# Largest tally and the speaker it belongs to
# (on a tie, idxmax reports the first speaker holding the maximum)
max_events, most_active_speaker = speaker_counts.max(), speaker_counts.idxmax()

summary = f"The most active member is {most_active_speaker} who organized {max_events} events."
print(summary)

Step 9: Best-rated organizing member

In [None]:
# Mean rating received by each speaker, highest first
ratings_per_speaker = df.groupby('Speaker')['Rating']
avg_rating_by_speaker = ratings_per_speaker.mean().sort_values(ascending=False)

# Top average rating and the speaker who holds it
best_rating = avg_rating_by_speaker.max()
best_rated_speaker = avg_rating_by_speaker.idxmax()

summary = f"The best-rated organizing member is {best_rated_speaker} with an average rating of {best_rating:.2f}."
print(summary)

Step 10: Visualizing new attendee trend over time

In [None]:
# Convert the date column to datetime so that sorting and date arithmetic
# work chronologically. Values are parsed with the '%d-%b-%Y' format.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%Y')

# Sort the dataset by date (sort_values returns a new frame, so adding a
# column below does not touch df)
df_sorted = df.sort_values('Date')

# Numeric representation of time: whole days elapsed since the earliest event
df_sorted['Days'] = (df_sorted['Date'] - df_sorted['Date'].min()).dt.days

# Correlation between elapsed days and the number of new attendees
correlation = df_sorted['New Attendees'].corr(df_sorted['Days'])

# Determine the trend from the sign of the correlation.
# The correlation is NaN when either series is constant or when fewer than two
# complete pairs exist; NaN fails both comparisons, so it falls through to
# "stable" together with an exact zero instead of being reported as "decreasing".
if correlation > 0:
    trend_message = "increasing"
elif correlation < 0:
    trend_message = "decreasing"
else:
    trend_message = "stable"

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(df_sorted['Date'], df_sorted['New Attendees'], marker='o', linestyle='-')
plt.title(f'New Attendees Over Time (Trend: {trend_message})')
plt.xlabel('Date')
plt.ylabel('Number of New Attendees')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print trend information
print(f"The number of new attendees is generally {trend_message} over time.")
print(f"Correlation coefficient: {correlation:.2f}")