# Social Media Sentiment Analysis - AWS Tier 2

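This notebook demonstrates:
1. Creating sample social media data
2. Uploading it to S3 (which triggers Lambda processing)
3. Querying sentiment results from DynamoDB
4. Visualizing sentiment distributions
5. Hashtag frequency and sentiment analysis
6. Network analysis of user mentions
7. Temporal analysis of sentiment scores
8. Summarizing and exporting the results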

**Duration:** 30-45 minutes  
**Cost:** $6-10 (includes AWS Comprehend API calls)

## Setup and Configuration

In [None]:
# Import libraries
import json
import os
import time
from collections import Counter
from datetime import datetime, timedelta

import boto3
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

# Configure plotting
# Apply a dark-grid matplotlib style sheet to every figure in this notebook.
# NOTE(review): the 'seaborn-v0_8-' style prefix presumably requires a recent
# matplotlib release - confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# Use seaborn's 'husl' palette as the default colour cycle. This runs after
# plt.style.use(); NOTE(review): the order presumably matters because both
# calls touch the default colours - confirm before reordering.
sns.set_palette('husl')
# Render figures inline, below the cell that creates them.
%matplotlib inline

# Confirmation that this setup cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# AWS Configuration
# Settings shared by every cell below.
AWS_REGION = 'us-east-1'
DYNAMODB_TABLE = 'SocialMediaPosts'
BUCKET_NAME = 'social-media-data-XXXX'  # Replace with your bucket name

# Initialize AWS clients: a DynamoDB table handle for reading results and an
# S3 client for uploading the raw posts, both bound to AWS_REGION.
dynamodb = boto3.resource('dynamodb', region_name=AWS_REGION)
table = dynamodb.Table(DYNAMODB_TABLE)
s3_client = boto3.client('s3', region_name=AWS_REGION)

# Echo the active configuration so a wrong bucket/table name is easy to spot.
print(
    f"AWS clients initialized for region: {AWS_REGION}\n"
    f"S3 Bucket: {BUCKET_NAME}\n"
    f"DynamoDB Table: {DYNAMODB_TABLE}"
)

## 1. Create Sample Data

Generate sample social media posts for demonstration.

In [None]:
# Create sample posts
# Read the clock once so every post is offset from the same reference instant.
# Separate datetime.now() calls could straddle a second boundary and break the
# intended one-hour spacing between consecutive posts.
now_ts = int(datetime.now().timestamp())
SECONDS_PER_HOUR = 3600


def hours_ago(hours):
    """Return the Unix timestamp (whole seconds) `hours` hours before `now_ts`."""
    return now_ts - hours * SECONDS_PER_HOUR


# Five posts spread over the last 24 to 20 hours, one hour apart.
sample_posts = [
    {
        "post_id": "001",
        "text": "Just finished an amazing research project on climate change! Excited to share results soon. #research #climate #science",
        "timestamp": hours_ago(24),
        "user_id": "user001",
        "username": "researcher_jane"
    },
    {
        "post_id": "002",
        "text": "Disappointed with the lack of funding for social science research. We need more support! #funding #socialscience",
        "timestamp": hours_ago(23),
        "user_id": "user002",
        "username": "prof_smith"
    },
    {
        "post_id": "003",
        "text": "Great collaboration with @researcher_jane today! Looking forward to our next meeting. #collaboration",
        "timestamp": hours_ago(22),
        "user_id": "user003",
        "username": "dr_anderson"
    },
    {
        "post_id": "004",
        "text": "New paper published! Check out our findings on social media sentiment analysis. Link in bio. #publication #research",
        "timestamp": hours_ago(21),
        "user_id": "user001",
        "username": "researcher_jane"
    },
    {
        "post_id": "005",
        "text": "Frustrated with the peer review process. Why does it take so long? #academia #peerreview",
        "timestamp": hours_ago(20),
        "user_id": "user004",
        "username": "grad_student"
    },
]

# Save to JSON file (read back by the upload cell)
with open('sample_posts.json', 'w') as f:
    json.dump(sample_posts, f, indent=2)

print(f"Created {len(sample_posts)} sample posts")
print("\nFirst post example:")
print(json.dumps(sample_posts[0], indent=2))

## 2. Upload Data to S3

Upload sample posts to S3, which will trigger Lambda processing.

In [None]:
# Upload to S3
s3_key = 'raw/sample_posts.json'

# Read the locally generated file first, then send its contents as one object.
with open('sample_posts.json', 'r') as f:
    payload = f.read()

s3_client.put_object(
    Bucket=BUCKET_NAME,
    Key=s3_key,
    ContentType='application/json',
    Body=payload,
)

print(f"Uploaded sample posts to s3://{BUCKET_NAME}/{s3_key}")
print("\nLambda function should automatically process these posts.")
print("Wait 10-30 seconds for processing to complete...")

In [None]:
# Wait for processing
# Fixed pause before the DynamoDB query below, with a visible countdown.
# Increase WAIT_SECONDS if that query returns fewer posts than were uploaded.
WAIT_SECONDS = 10

print("Waiting for Lambda processing...")
for remaining in range(WAIT_SECONDS, 0, -1):
    print(f"{remaining}...", end=' ', flush=True)
    time.sleep(1)
print("\nDone! Now querying results...")

## 3. Query Results from DynamoDB

In [None]:
# Query all posts from DynamoDB
# A single scan call returns at most one page of results, so keep following
# LastEvaluatedKey until the whole table has been read.
SCAN_PAGE_SIZE = 100
SCORE_COLUMNS = ['positive_score', 'negative_score', 'neutral_score']

items = []
scan_kwargs = {'Limit': SCAN_PAGE_SIZE}
while True:
    response = table.scan(**scan_kwargs)
    items.extend(response.get('Items', []))
    last_key = response.get('LastEvaluatedKey')
    if not last_key:
        break
    # Resume the next page where this one stopped.
    scan_kwargs['ExclusiveStartKey'] = last_key

print(f"Retrieved {len(items)} posts from DynamoDB")
if not items:
    print("No items found yet - processing may still be running; wait a little and re-run this cell.")

# Convert to DataFrame
df = pd.DataFrame(items)

# boto3 returns DynamoDB numbers as decimal.Decimal objects, which leaves the
# score columns with object dtype. Cast them to native floats so describe(),
# histograms and means behave numerically; unparseable values become NaN.
for column in SCORE_COLUMNS:
    if column in df.columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

print("\nDataFrame shape:", df.shape)
print("\nColumn names:", df.columns.tolist())
df.head()

In [None]:
# Display basic statistics
# Number of posts per sentiment label.
print("Sentiment Distribution:", df['sentiment'].value_counts(), sep="\n")

# describe() summary of the three per-post score columns.
score_columns = ['positive_score', 'negative_score', 'neutral_score']
print("\nSentiment Scores Summary:", df[score_columns].describe(), sep="\n")

## 4. Sentiment Visualization

In [None]:
# Plot sentiment distribution
# Colours are looked up by sentiment label. value_counts() orders labels by
# frequency, so a positional colour list could paint NEGATIVE green whenever
# it happens to be the most common label.
SENTIMENT_COLORS = {
    'POSITIVE': 'green',
    'NEGATIVE': 'red',
    'NEUTRAL': 'blue',
    'MIXED': 'orange',
}

sentiment_counts = df['sentiment'].value_counts()
# Labels outside the mapping fall back to gray instead of raising.
label_colors = [SENTIMENT_COLORS.get(label, 'gray') for label in sentiment_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart (same label -> colour mapping as the bar chart)
axes[0].pie(
    sentiment_counts.values,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=label_colors,
)
axes[0].set_title('Sentiment Distribution')

# Bar chart
sentiment_counts.plot(kind='bar', ax=axes[1], color=label_colors)
axes[1].set_title('Sentiment Counts')
axes[1].set_xlabel('Sentiment')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Plot sentiment scores distribution
# One histogram panel per score column: (column, bar colour, panel title).
score_panels = [
    ('positive_score', 'green', 'Positive Score Distribution'),
    ('negative_score', 'red', 'Negative Score Distribution'),
    ('neutral_score', 'blue', 'Neutral Score Distribution'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (column, color, title) in zip(axes, score_panels):
    ax.hist(df[column], bins=20, color=color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 5. Hashtag Analysis

In [None]:
# Extract all hashtags
# Flatten the per-post hashtag lists into one list; rows whose 'hashtags'
# value is not a list are skipped.
all_hashtags = [
    tag
    for post_tags in df['hashtags']
    if isinstance(post_tags, list)
    for tag in post_tags
]

hashtag_counts = Counter(all_hashtags)
print("Top 10 Hashtags:")
for tag, occurrences in hashtag_counts.most_common(10):
    print(f"  #{tag}: {occurrences}")

In [None]:
# Plot top hashtags
top_hashtags = dict(hashtag_counts.most_common(10))

# One bar per hashtag, in most-common-first order, labelled with a leading '#'.
bar_positions = range(len(top_hashtags))
bar_heights = list(top_hashtags.values())
tick_labels = [f'#{tag}' for tag in top_hashtags]

plt.figure(figsize=(12, 6))
plt.bar(bar_positions, bar_heights)
plt.xticks(bar_positions, tick_labels, rotation=45, ha='right')
plt.xlabel('Hashtag')
plt.ylabel('Frequency')
plt.title('Top 10 Hashtags')
plt.tight_layout()
plt.show()

In [None]:
# Hashtag sentiment correlation
# For every hashtag, collect the positive scores of the posts that use it.
hashtag_sentiment = {}

for _, post in df.iterrows():
    tags = post['hashtags']
    if not isinstance(tags, list):
        continue
    for tag in tags:
        hashtag_sentiment.setdefault(tag, []).append(post['positive_score'])

# Calculate average positive score per hashtag
hashtag_avg_sentiment = {
    tag: np.mean(scores)
    for tag, scores in hashtag_sentiment.items()
    if scores
}

# Highest average first; only the top five are printed.
ranked_hashtags = sorted(
    hashtag_avg_sentiment.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nHashtag Average Positive Sentiment:")
for tag, avg_score in ranked_hashtags[:5]:
    print(f"  #{tag}: {avg_score:.3f}")

## 6. User Mention Network Analysis

In [None]:
# Build mention network
# Directed graph: an edge author -> mentioned user for every mention in a post.
G = nx.DiGraph()

for _, post in df.iterrows():
    author = post.get('username', 'unknown')
    mentioned = post.get('mentions', [])

    # Every author becomes a node, even when the post mentions nobody.
    G.add_node(author)

    # Rows whose 'mentions' value is not a list contribute no edges.
    if isinstance(mentioned, list):
        G.add_edges_from((author, target) for target in mentioned)

print("Network statistics:")
print(f"  Nodes (users): {G.number_of_nodes()}")
print(f"  Edges (mentions): {G.number_of_edges()}")

In [None]:
# Visualize network
# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run (Restart & Run All reproduces the same picture).
LAYOUT_SEED = 42

if G.number_of_nodes() > 0:
    plt.figure(figsize=(12, 8))

    # Node size grows with degree (in + out edges) so active users stand out.
    degrees = dict(G.degree())
    node_sizes = [300 + degrees[node] * 200 for node in G.nodes()]

    # Layout
    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=LAYOUT_SEED)

    # Draw network
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightblue', alpha=0.7)
    # Pass the node sizes to the edge drawer as well so arrowheads stop at the
    # edge of each (variably sized) node instead of being hidden underneath it.
    nx.draw_networkx_edges(
        G,
        pos,
        edge_color='gray',
        alpha=0.5,
        arrows=True,
        arrowsize=20,
        node_size=node_sizes,
    )
    nx.draw_networkx_labels(G, pos, font_size=10)

    plt.title('User Mention Network')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
else:
    print("No mention network found in data")

In [None]:
# Network centrality measures
# Rank users by degree centrality and show the five highest.
if G.number_of_nodes() > 0:
    degree_centrality = nx.degree_centrality(G)
    most_central = sorted(
        degree_centrality.items(),
        key=lambda entry: entry[1],
        reverse=True,
    )[:5]

    print("\nMost central users (by degree):")
    for user, centrality in most_central:
        print(f"  @{user}: {centrality:.3f}")

## 7. Temporal Analysis (if timestamps available)

In [None]:
# Convert timestamps to datetime
if 'timestamp' in df.columns:
    # boto3 returns DynamoDB numbers as decimal.Decimal objects, which
    # pd.to_datetime(unit='s') is not expected to accept. Convert to a native
    # numeric dtype first; non-numeric values still raise rather than being
    # silently dropped.
    epoch_seconds = pd.to_numeric(df['timestamp'])
    df['datetime'] = pd.to_datetime(epoch_seconds, unit='s')
    df = df.sort_values('datetime')

    print("Time range of posts:")
    print(f"  Earliest: {df['datetime'].min()}")
    print(f"  Latest: {df['datetime'].max()}")

    # Plot sentiment over time: one line per sentiment label, showing the
    # positive score of the posts carrying that label.
    plt.figure(figsize=(14, 6))

    for sentiment in ['POSITIVE', 'NEGATIVE', 'NEUTRAL']:
        sentiment_df = df[df['sentiment'] == sentiment]
        plt.plot(sentiment_df['datetime'], sentiment_df['positive_score'],
                 marker='o', label=sentiment, alpha=0.7)

    plt.xlabel('Time')
    plt.ylabel('Positive Score')
    plt.title('Sentiment Scores Over Time')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("Timestamp column not found")

## 8. Summary Report

In [None]:
# Text summary of everything computed above: post counts, sentiment shares,
# mean scores, most frequent hashtags and the mention graph size.
banner = "=" * 80
total_posts = len(df)

print(banner)
print("SOCIAL MEDIA SENTIMENT ANALYSIS SUMMARY")
print(banner)
print(f"\nTotal posts analyzed: {total_posts}")

print("\nSentiment Distribution:")
for sentiment, count in df['sentiment'].value_counts().items():
    percentage = (count / total_posts) * 100
    print(f"  {sentiment}: {count} ({percentage:.1f}%)")

print("\nAverage Sentiment Scores:")
score_rows = [
    ("Positive", 'positive_score'),
    ("Negative", 'negative_score'),
    ("Neutral", 'neutral_score'),
]
for label, column in score_rows:
    print(f"  {label}: {df[column].mean():.3f}")

print("\nTop 5 Hashtags:")
for hashtag, count in hashtag_counts.most_common(5):
    print(f"  #{hashtag}: {count}")

print("\nNetwork Statistics:")
print(f"  Users: {G.number_of_nodes()}")
print(f"  Mentions: {G.number_of_edges()}")

print("\n" + banner)

## 9. Export Results

In [None]:
# Export to CSV
df.to_csv('sentiment_results.csv', index=False)
print("Results exported to sentiment_results.csv")

# Export summary statistics
# Build the pieces in the order they appear in the JSON document.
total_posts = len(df)
sentiment_distribution = df['sentiment'].value_counts().to_dict()
score_columns = {
    'positive': 'positive_score',
    'negative': 'negative_score',
    'neutral': 'neutral_score',
}
# float() turns each mean into a plain Python float before serialising.
average_scores = {
    label: float(df[column].mean())
    for label, column in score_columns.items()
}
top_hashtags_export = dict(hashtag_counts.most_common(10))

summary = {
    'total_posts': total_posts,
    'sentiment_distribution': sentiment_distribution,
    'average_scores': average_scores,
    'top_hashtags': top_hashtags_export,
}

with open('sentiment_summary.json', 'w') as f:
    f.write(json.dumps(summary, indent=2))

print("Summary exported to sentiment_summary.json")

## Next Steps

1. **Upload more data**: Try uploading larger datasets (100+ posts)
2. **Real data**: Use Twitter API or other sources for real social media data
3. **Advanced analysis**: Topic modeling, emotion detection, trend analysis
4. **Athena queries**: Query exported data in S3 using SQL
5. **Dashboard**: Create interactive dashboard with QuickSight
6. **Cleanup**: Don't forget to delete AWS resources when done (see cleanup_guide.md)

**Remember to clean up AWS resources to avoid charges!**