# Google.com Loading Times Analysis

This notebook performs a comprehensive analysis of Google.com loading times with 1, 2, 3, 4, and 5 concurrent users.

## Questions to Answer:
- Are there times of the year or of the day with higher/lower loading times?
- What are the average loading times for different user counts?
- How do loading times vary over time?

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and parsing warnings.
warnings.filterwarnings(action='ignore')

# Notebook-wide plot defaults: white grid background and a 14x8 inch canvas
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(14, 8))

## 1. Load and Explore the Data

In [None]:
# Read the raw measurements, supplying the column names explicitly.
# NOTE(review): with names= given and no header= argument, pandas treats the
# first line of the file as data — confirm the CSV has no header row.
column_names = ['Timestamp', 'Users', 'Avg_Loading_Time']
df = pd.read_csv('loading_times.csv', names=column_names)

# Parse the timestamp column into pandas datetimes so the .dt accessor works later
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

# Report the frame's dimensions, then preview the first ten rows
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
df.head(10)

In [None]:
# Quick sanity check: numeric summary, column dtypes, and per-column null counts.
# Each report is printed under its own heading, in this order.
reports = (
    ("Basic Statistics:", df.describe()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
)
for heading, report in reports:
    print(heading)
    print(report)

## 2. Extract Time-based Features

In [None]:
# Derive calendar/clock columns from the parsed timestamp.
# The columns are added to df in place, in the order listed below.
stamp = df['Timestamp'].dt
time_features = {
    'Date': stamp.date,
    'Hour': stamp.hour,
    'DayOfWeek': stamp.day_name(),
    'Month': stamp.month_name(),
    'Year': stamp.year,
    'DayOfYear': stamp.dayofyear,
}
for column, values in time_features.items():
    df[column] = values

print("Enhanced dataset with time features:")
df.head()

## 3. Average Loading Times by Number of Users

In [None]:
# Per-user-count summary of loading time.
# Keys are the pandas aggregation names; values are the display labels for the columns.
stat_labels = {
    'mean': 'Mean',
    'median': 'Median',
    'std': 'Std Dev',
    'min': 'Min',
    'max': 'Max',
    'count': 'Count',
}
loading_by_users = df.groupby('Users')['Avg_Loading_Time']
avg_by_users = loading_by_users.agg(list(stat_labels)).rename(columns=stat_labels)

print("Average Loading Times by Number of Users (in milliseconds):")
print(avg_by_users.round(2))

In [None]:
# Side-by-side panels: mean per user count (left) and full distribution (right)
fig, (bar_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# Left panel: bar chart of the mean loading time for each user count
avg_by_users['Mean'].plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Mean Loading Time by Number of Users', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Number of Concurrent Users', ylabel='Mean Loading Time (ms)')
bar_ax.tick_params(axis='x', rotation=0)
bar_ax.grid(axis='y', alpha=0.3)

# Right panel: box plot of raw loading times grouped by user count.
# The title and labels are set after the boxplot call so they replace the
# ones pandas writes onto the axes.
df.boxplot(column='Avg_Loading_Time', by='Users', ax=box_ax)
box_ax.set_title('Loading Time Distribution by Number of Users', fontsize=14, fontweight='bold')
box_ax.set(xlabel='Number of Concurrent Users', ylabel='Loading Time (ms)')
fig.suptitle('')  # Blank out the figure-level title added by the grouped boxplot

plt.tight_layout()
plt.show()

## 4. Loading Times by Hour of Day

In [None]:
# Mean loading time and number of measurements for each hour of the day
loading_by_hour = df.groupby('Hour')['Avg_Loading_Time']
avg_by_hour = loading_by_hour.agg(['mean', 'count'])

print("Average Loading Times by Hour of Day:")
print(avg_by_hour.round(2))

In [None]:
# Two stacked panels: hourly trend (top) and hour-by-user-count heatmap (bottom)
fig, (trend_ax, heat_ax) = plt.subplots(nrows=2, ncols=1, figsize=(16, 10))

# Top panel: mean loading time for each hour, with a tick at every hour 0-23
avg_by_hour['mean'].plot.line(ax=trend_ax, marker='o', color='darkgreen', linewidth=2)
trend_ax.set_title('Mean Loading Time by Hour of Day (UTC)', fontsize=14, fontweight='bold')
trend_ax.set(xlabel='Hour of Day (0-23)', ylabel='Mean Loading Time (ms)')
trend_ax.grid(alpha=0.3)
trend_ax.set_xticks(range(0, 24))

# Bottom panel: mean loading time for every (hour, user count) pair
pivot_hour_users = df.pivot_table(
    index='Hour',
    columns='Users',
    values='Avg_Loading_Time',
    aggfunc='mean',
)
sns.heatmap(
    pivot_hour_users,
    ax=heat_ax,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Loading Time (ms)'},
)
heat_ax.set_title('Loading Time Heatmap: Hour of Day vs Number of Users', fontsize=14, fontweight='bold')
heat_ax.set(xlabel='Number of Concurrent Users', ylabel='Hour of Day (UTC)')

plt.tight_layout()
plt.show()

## 5. Loading Times by Day of Week

In [None]:
# Mean loading time and measurement count per weekday, listed Monday-first.
# groupby orders the day names alphabetically, so the result is reindexed
# against an explicit weekday order, keeping only days present in the data.
day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
loading_by_day = df.groupby('DayOfWeek')['Avg_Loading_Time'].agg(['mean', 'count'])
days_present = [day for day in day_order if day in loading_by_day.index]
avg_by_day = loading_by_day.reindex(days_present)

print("Average Loading Times by Day of Week:")
print(avg_by_day.round(2))

In [None]:
# Bar chart of the mean loading time for each weekday
fig, ax = plt.subplots(figsize=(14, 6))
avg_by_day['mean'].plot.bar(ax=ax, color='coral')

ax.set_title('Mean Loading Time by Day of Week', fontsize=14, fontweight='bold')
ax.set(xlabel='Day of Week', ylabel='Mean Loading Time (ms)')
ax.tick_params(axis='x', rotation=45)  # slant the day names
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Loading Times by Month

In [None]:
# Average loading times by month, listed in calendar order.
# groupby sorts the month-name labels alphabetically (April, August, ...),
# which scrambles the seasonal picture, so the result is reindexed against an
# explicit calendar order, keeping only the months present in the data.
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
avg_by_month = df.groupby('Month')['Avg_Loading_Time'].agg(['mean', 'count'])
avg_by_month = avg_by_month.reindex([month for month in month_order if month in avg_by_month.index])
print("Average Loading Times by Month:")
print(avg_by_month.round(2))

In [None]:
# Bar chart of the mean loading time for each month, in the row order of avg_by_month
fig, ax = plt.subplots(figsize=(14, 6))
avg_by_month['mean'].plot.bar(ax=ax, color='teal')

ax.set_title('Mean Loading Time by Month', fontsize=14, fontweight='bold')
ax.set(xlabel='Month', ylabel='Mean Loading Time (ms)')
ax.tick_params(axis='x', rotation=45)  # slant the month names
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Time Series Analysis

In [None]:
# One line per user count, each drawn in chronological order
fig, ax = plt.subplots(figsize=(16, 8))

user_counts = sorted(df['Users'].unique())
for n_users in user_counts:
    # Restrict to this user count and order the rows by time before plotting
    series = df.loc[df['Users'] == n_users].sort_values('Timestamp')
    ax.plot(
        series['Timestamp'],
        series['Avg_Loading_Time'],
        marker='o',
        markersize=3,
        alpha=0.7,
        label=f'{n_users} User(s)',
    )

ax.set_title('Loading Times Over Time by Number of Users', fontsize=14, fontweight='bold')
ax.set(xlabel='Date/Time', ylabel='Loading Time (ms)')
ax.legend()
ax.grid(alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 8. Daily Aggregated Analysis

In [None]:
# Collapse all measurements to one mean value per calendar date
daily_means = df.groupby('Date')['Avg_Loading_Time'].mean()
daily_avg = daily_means.reset_index()
# Convert the 'Date' column to datetimes before it is used as the x-axis
daily_avg['Date'] = pd.to_datetime(daily_avg['Date'])

# Line chart of the daily mean
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(
    daily_avg['Date'],
    daily_avg['Avg_Loading_Time'],
    color='purple',
    linewidth=2,
    marker='o',
)
ax.set_title('Daily Average Loading Time', fontsize=14, fontweight='bold')
ax.set(xlabel='Date', ylabel='Average Loading Time (ms)')
ax.grid(alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 9. Summary Statistics and Insights

In [None]:
# Text summary of the analysis. Relies on df plus the avg_by_hour and
# avg_by_day tables built in the earlier hour-of-day and day-of-week cells.
print("=" * 80)
print("SUMMARY INSIGHTS")
print("=" * 80)

print("\n1. OVERALL STATISTICS:")
print(f"   - Total measurements: {len(df):,}")
print(f"   - Date range: {df['Timestamp'].min().date()} to {df['Timestamp'].max().date()}")
print(f"   - Overall mean loading time: {df['Avg_Loading_Time'].mean():.2f} ms")
print(f"   - Overall median loading time: {df['Avg_Loading_Time'].median():.2f} ms")
print(f"   - Overall std deviation: {df['Avg_Loading_Time'].std():.2f} ms")

print("\n2. LOADING TIMES BY NUMBER OF USERS:")
for users in sorted(df['Users'].unique()):
    user_data = df[df['Users'] == users]['Avg_Loading_Time']
    print(f"   - {users} user(s): Mean = {user_data.mean():.2f} ms, Median = {user_data.median():.2f} ms")

print("\n3. PEAK LOADING TIMES:")
# Guard against an empty table, mirroring the day-of-week check below
if len(avg_by_hour) > 0:
    peak_hour = avg_by_hour['mean'].idxmax()
    print(f"   - Highest average loading time occurs at hour {peak_hour}:00 UTC ({avg_by_hour.loc[peak_hour, 'mean']:.2f} ms)")

    lowest_hour = avg_by_hour['mean'].idxmin()
    print(f"   - Lowest average loading time occurs at hour {lowest_hour}:00 UTC ({avg_by_hour.loc[lowest_hour, 'mean']:.2f} ms)")

if len(avg_by_day) > 0:
    peak_day = avg_by_day['mean'].idxmax()
    print(f"   - Highest average loading time occurs on {peak_day} ({avg_by_day.loc[peak_day, 'mean']:.2f} ms)")

    lowest_day = avg_by_day['mean'].idxmin()
    print(f"   - Lowest average loading time occurs on {lowest_day} ({avg_by_day.loc[lowest_day, 'mean']:.2f} ms)")

print("\n4. CORRELATION BETWEEN USERS AND LOADING TIME:")
correlation = df['Users'].corr(df['Avg_Loading_Time'])
print(f"   - Correlation coefficient: {correlation:.4f}")
# Interpret the sign as well as the magnitude: a negative coefficient is a
# real (inverse) relationship and must not be reported as "no correlation".
if correlation > 0.5:
    print("   - Strong positive correlation: More users generally lead to higher loading times")
elif correlation > 0:
    print("   - Weak positive correlation: Some tendency for more users to increase loading times")
elif correlation < -0.5:
    print("   - Strong negative correlation: More users generally coincide with lower loading times")
elif correlation < 0:
    print("   - Weak negative correlation: Some tendency for more users to decrease loading times")
else:
    # Reached when the coefficient is exactly zero or NaN (every comparison
    # above is False for NaN, e.g. when only one distinct user count exists).
    print("   - No clear correlation between number of users and loading times")

print("\n" + "=" * 80)

## 10. Advanced Visualizations

In [None]:
# Violin plot: shape of the loading-time distribution for each user count
fig, ax = plt.subplots(figsize=(14, 6))
sns.violinplot(
    x='Users',
    y='Avg_Loading_Time',
    data=df,
    palette='Set2',
    ax=ax,
)

ax.set_title('Distribution of Loading Times by Number of Users', fontsize=14, fontweight='bold')
ax.set(xlabel='Number of Concurrent Users', ylabel='Loading Time (ms)')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of every measurement with a fitted linear trend drawn in red
fig, ax = plt.subplots(figsize=(12, 6))
sns.regplot(
    x='Users',
    y='Avg_Loading_Time',
    data=df,
    scatter_kws={'alpha': 0.3},   # translucent points
    line_kws={'color': 'red'},
    ax=ax,
)

ax.set_title('Loading Time vs Number of Users (with Regression Line)', fontsize=14, fontweight='bold')
ax.set(xlabel='Number of Concurrent Users', ylabel='Loading Time (ms)')
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()