# LinkNYC Weekly Usage Data Analysis

Comprehensive analysis of LinkNYC weekly usage patterns, including data profiling, statistical analysis, and visualization.


## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Install a blanket "ignore" filter so no warnings are shown in the cells
# that follow.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set up plotting style: apply matplotlib's 'default' style first, then
# select seaborn's "husl" palette. Keep this order so the palette is set
# after the style has been applied.
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

In [None]:
def parse_time_to_minutes(time_str):
    """Convert an ``HH:MM:SS`` duration into total minutes.

    Args:
        time_str: Value to parse. It is passed through ``str()`` first, so
            non-string inputs (for example a float NaN from an empty CSV
            cell) are accepted and simply fail to parse.

    Returns:
        Total minutes as a float (seconds contribute the fractional part),
        or ``np.nan`` when the value is not exactly three colon-separated
        integer fields.
    """
    parts = str(time_str).split(':')
    if len(parts) != 3:
        return np.nan
    try:
        hours, minutes, seconds = map(int, parts)
    except ValueError:
        # A field was not an integer (e.g. "aa:bb:cc"). Only this error means
        # "unparseable"; a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide genuine bugs.
        return np.nan
    return hours * 60 + minutes + seconds / 60

# Read the weekly usage export (path is relative to the notebook)
data_file = '../data/linknyc_weekly_usage__updated_.csv'
df = pd.read_csv(data_file)

# Header cells may carry stray whitespace; trim it before any name lookup
df.columns = df.columns.str.strip()

# The first column holds the report date; keep a parsed copy under a fixed name
raw_report_dates = df.iloc[:, 0]
df['Report_Date'] = pd.to_datetime(raw_report_dates)

# The third column holds the average session length as HH:MM:SS text;
# derive a numeric minutes column from it
session_col = df.columns[2]
session_lengths = df[session_col]
df['Avg_Session_Minutes'] = session_lengths.apply(parse_time_to_minutes)

print("Data loaded successfully!", f"Shape: {df.shape}", sep="\n")
df.head()

## Dataset Overview

In [None]:
# Report banner
banner_rule = "=" * 80
print(banner_rule)
print("LINKNYC WEEKLY USAGE DATA PROFILE")
print(banner_rule)

print("\n📊 DATASET OVERVIEW")
print('─' * 40)

# Size of the frame, on screen and in memory
row_count, column_count = df.shape
print(f"Shape: {row_count:,} rows × {column_count} columns")
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

# Period covered by the reports
first_report = df['Report_Date'].min()
last_report = df['Report_Date'].max()
print(f"Date range: {first_report.strftime('%Y-%m-%d')} to {last_report.strftime('%Y-%m-%d')}")
print(f"Time span: {(last_report - first_report).days} days")

## Data Quality Assessment

In [None]:
# Per-column listing: position, name (truncated to 40 chars), dtype, nulls
print("\n📋 COLUMNS")
print('─' * 40)
null_counts = df.isnull().sum()
for position, col in enumerate(df.columns, start=1):
    dtype = df.dtypes[col]
    null_count = null_counts[col]
    null_pct = null_count / len(df) * 100
    print(f"{position:2d}. {col[:40]:<40} {str(dtype):<12} {null_count:>6} nulls ({null_pct:5.1f}%)")

# Whole-frame completeness figures
print("\n🔍 DATA QUALITY")
print('─' * 40)
total_cells = df.size
missing_cells = null_counts.sum()
complete_rows = len(df.dropna())
print(f"Total cells: {total_cells:,}")
print(f"Missing cells: {missing_cells:,} ({missing_cells/total_cells*100:.2f}%)")
print(f"Complete rows: {complete_rows:,} ({complete_rows/df.shape[0]*100:.1f}%)")

## Statistical Summary

In [None]:
print("\n📈 STATISTICAL SUMMARY")
print('─' * 40)

# Describe only the numeric columns, rounded to two decimals for display
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
summary_table = numeric_frame.describe()
display(summary_table.round(2))

## Key Insights

In [None]:
print("\n💡 KEY INSIGHTS")
print('─' * 40)

# COVID comparison window and defaults. The summary cell at the end of the
# notebook reads `pre_covid` and `covid_period` unconditionally, so they must
# exist even when the sessions column is missing. NaN makes the
# `covid_period < pre_covid * 0.5` check evaluate to False.
covid_start = pd.to_datetime('2020-03-15')
covid_end = pd.to_datetime('2020-06-01')
pre_covid = np.nan
covid_period = np.nan

# Sessions analysis
if 'Number of Sessions' in df.columns:
    sessions = df['Number of Sessions']
    print(f"• Peak weekly sessions: {sessions.max():,} ({df.loc[sessions.idxmax(), 'Report_Date'].strftime('%Y-%m-%d')})")
    print(f"• Lowest weekly sessions: {sessions.min():,} ({df.loc[sessions.idxmin(), 'Report_Date'].strftime('%Y-%m-%d')})")
    print(f"• Average weekly sessions: {sessions.mean():,.0f}")

    # Detect COVID impact: compare the mean of all weeks before the start
    # date with the mean of the weeks in [covid_start, covid_end). An empty
    # window yields NaN, which never satisfies the comparison below.
    before_covid = df['Report_Date'] < covid_start
    in_covid_window = (df['Report_Date'] >= covid_start) & (df['Report_Date'] < covid_end)
    pre_covid = sessions[before_covid].mean()
    covid_period = sessions[in_covid_window].mean()
    if covid_period < pre_covid * 0.5:
        print(f"• COVID-19 impact detected: {((pre_covid - covid_period) / pre_covid * 100):.1f}% drop in sessions")

# Bandwidth analysis (source columns are in TB; 1 TB is taken as 1024 GB)
if 'TB Downloaded' in df.columns and 'TB Uploaded' in df.columns:
    download_gb = df['TB Downloaded'] * 1024
    upload_gb = df['TB Uploaded'] * 1024
    print(f"• Peak weekly download: {download_gb.max():,.1f} GB")
    print(f"• Peak weekly upload: {upload_gb.max():,.1f} GB")
    print(f"• Download/Upload ratio: {download_gb.mean() / upload_gb.mean():.1f}:1")

# Unique users analysis
if 'Number of Unique Clients' in df.columns:
    unique_clients = df['Number of Unique Clients']
    print(f"• Peak unique weekly users: {unique_clients.max():,}")
    print(f"• Average unique weekly users: {unique_clients.mean():,.0f}")

# Session duration analysis (minutes column derived during data loading)
if 'Avg_Session_Minutes' in df.columns:
    avg_duration = df['Avg_Session_Minutes'].mean()
    print(f"• Average session duration: {avg_duration:.1f} minutes")

## Trends and Patterns

In [None]:
print("\n📊 TRENDS & PATTERNS")
print('─' * 40)

# Year-over-year change in mean weekly sessions. Every observed year except
# the earliest is compared with the calendar year before it; a pair is
# skipped when either side has no data.
df['Year'] = df['Report_Date'].dt.year
observed_years = sorted(df['Year'].unique())
for year in observed_years[1:]:
    current_year_avg = df.loc[df['Year'] == year, 'Number of Sessions'].mean()
    prev_year_avg = df.loc[df['Year'] == year - 1, 'Number of Sessions'].mean()
    if pd.isna(current_year_avg) or pd.isna(prev_year_avg):
        continue
    growth = (current_year_avg - prev_year_avg) / prev_year_avg * 100
    print(f"• {year-1} to {year} average growth: {growth:+.1f}%")

# Seasonality: mean weekly sessions per calendar month
df['Month'] = df['Report_Date'].dt.month
monthly_avg = df.groupby('Month')['Number of Sessions'].mean()
peak_month, low_month = monthly_avg.idxmax(), monthly_avg.idxmin()
# Index 0 is a placeholder so that month numbers 1-12 index directly
month_names = [
    '',
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec',
]
peak_value = monthly_avg.loc[peak_month]
low_value = monthly_avg.loc[low_month]
print(f"• Peak usage month: {month_names[peak_month]} ({peak_value:,.0f} avg sessions)")
print(f"• Lowest usage month: {month_names[low_month]} ({low_value:,.0f} avg sessions)")

## Visualizations

In [None]:
# 2x2 dashboard: sessions, data volume, unique-client distribution, duration
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('LinkNYC Weekly Usage Data Analysis', fontsize=16, fontweight='bold')
ax_sessions, ax_traffic = axes[0]
ax_clients, ax_duration = axes[1]
report_dates = df['Report_Date']

# Top-left: weekly sessions, with a marker at the COVID-19 lockdown date
ax_sessions.plot(report_dates, df['Number of Sessions'], linewidth=2, color='#1f77b4')
ax_sessions.set(title='Weekly Sessions Over Time', ylabel='Number of Sessions')
covid_date = pd.to_datetime('2020-03-15')
ax_sessions.axvline(x=covid_date, color='red', linestyle='--', alpha=0.7)
# Place the label at 80% of the current upper y-limit
label_height = ax_sessions.get_ylim()[1] * 0.8
ax_sessions.text(covid_date, label_height, 'COVID-19\nLockdown',
                 ha='center', va='center', color='red', fontsize=9)

# Top-right: download and upload volume (TB columns scaled by 1024 to GB)
traffic_series = (
    ('TB Downloaded', 'Downloaded (GB)'),
    ('TB Uploaded', 'Uploaded (GB)'),
)
for column, legend_label in traffic_series:
    ax_traffic.plot(report_dates, df[column] * 1024, label=legend_label, linewidth=2)
ax_traffic.set(title='Weekly Data Usage Over Time', ylabel='Data (GB)')
ax_traffic.legend()

# Bottom-left: histogram of weekly unique clients
ax_clients.hist(df['Number of Unique Clients'], bins=30, alpha=0.7, edgecolor='black', color='green')
ax_clients.set(
    title='Distribution of Weekly Unique Clients',
    xlabel='Number of Unique Clients',
    ylabel='Frequency',
)

# Bottom-right: average session duration in minutes
ax_duration.plot(report_dates, df['Avg_Session_Minutes'], linewidth=2, color='orange')
ax_duration.set(title='Average Session Duration Over Time', ylabel='Duration (minutes)')

# Shared cosmetics: slanted date labels on the time-series panels, light grid on all
for time_axis in (ax_sessions, ax_traffic, ax_duration):
    time_axis.tick_params(axis='x', rotation=45)
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Monthly and Yearly Patterns

In [None]:
# Side-by-side bar charts: mean weekly sessions per calendar month and per year
# (relies on the 'Month' and 'Year' columns added in the trends cell)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
sessions_by = df.groupby

# Left panel: one bar per month, ticks fixed at 1-12 with short month names
monthly_avg = sessions_by('Month')['Number of Sessions'].mean()
ax1.bar(monthly_avg.index, monthly_avg.values, color='skyblue', edgecolor='navy')
ax1.set(title='Average Sessions by Month', xlabel='Month', ylabel='Average Sessions')
month_ticks = range(1, 13)
month_labels = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]
ax1.set_xticks(month_ticks)
ax1.set_xticklabels(month_labels)

# Right panel: one bar per year
yearly_avg = sessions_by('Year')['Number of Sessions'].mean()
ax2.bar(yearly_avg.index, yearly_avg.values, color='lightcoral', edgecolor='darkred')
ax2.set(title='Average Sessions by Year', xlabel='Year', ylabel='Average Sessions')

for panel in (ax1, ax2):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Analysis Summary

In [None]:
# Closing recap: run timestamp, record count, covered period and headline finding
summary_rule = "=" * 60
print("\n" + summary_rule)
print("ANALYSIS SUMMARY")
print(summary_rule)

completed_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Analysis completed: {completed_at}")
print("Dataset: LinkNYC Weekly Usage Data")
print(f"Records analyzed: {len(df):,}")

period_start = df['Report_Date'].min().strftime('%Y-%m-%d')
period_end = df['Report_Date'].max().strftime('%Y-%m-%d')
print(f"Time period: {period_start} to {period_end}")

# Same threshold as the key-insights cell: the COVID window mean must be
# below half of the pre-COVID mean
if covid_period < pre_covid * 0.5:
    key_finding = 'COVID-19 significantly impacted usage patterns'
else:
    key_finding = 'Steady usage patterns throughout period'
print(f"Key finding: {key_finding}")