<a href="https://colab.research.google.com/github/socialx-analytics/bi-11-sept-25/blob/main/007_unstructured_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Section 1: Setup and Load Data**

In [None]:
import subprocess
import sys

# Third-party packages installed by this cell before the import cell runs.
packages = [
    "transformers",
    "scikit-learn",
    "plotly",
    "bertopic",
    "sentence-transformers",
    "umap-learn",
    "hdbscan",
    "wordcloud"
]

# Install everything with ONE pip invocation, run through the kernel's own
# interpreter so the packages land in the environment this notebook uses.
# A single call lets pip resolve all requirements together (a later install
# can no longer silently change a dependency of an earlier one) and avoids
# starting a separate pip process per package. check_call raises
# CalledProcessError if the installation fails.
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *packages])

In [None]:
import pandas as pd
import re
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS

# For sentiment analysis
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# For clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import MDS
import plotly.graph_objects as go
import plotly.colors as pcolors

# For topic modeling
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

# Import community detection
from networkx.algorithms.community import greedy_modularity_communities

## **Import Data**

In [None]:
# Load the tweet export and keep only its first 1,000 rows
df = pd.read_csv("tweet_digitalcurrency_text.csv").iloc[:1000]

# Preview the first five rows
df.iloc[:5]

In [None]:
# List the column labels of the loaded frame
list(df.columns)

In [None]:
# Print pandas' structural summary of the frame (columns, non-null counts,
# dtypes) so missing values in the text columns are visible early
df.info()

# **Section 2: Social Network Analysis (SNA)**

## **Create Edgelist from Twitter Mentions**

In [None]:
# Build the mention edge list: one (author -> mentioned user) pair for every
# @mention found in a tweet.
#
# Both columns are selected up front so that a missing column fails loudly
# with a KeyError instead of silently producing an empty edge list. Rows
# without an author are skipped; otherwise they would create a bogus "@nan"
# node.
mention_source = df[["screen_name", "text"]].dropna(subset=["screen_name"])

edge_list = [
    # Prefix both endpoints with "@" so authors and mentioned users share one
    # naming scheme
    (f"@{account_name}", f"@{mention}")
    for account_name, tweet_text in zip(mention_source["screen_name"], mention_source["text"])
    # str() guards against non-string cells (e.g. missing text), which simply
    # contain no mentions
    for mention in re.findall(r"@(\w+)", str(tweet_text))
]

# Convert edge list to DataFrame with source and target columns
edgelist = pd.DataFrame(edge_list, columns=["source", "target"])

# Display the first rows of the edgelist
edgelist.head(10)

In [None]:
# Report how many mention edges were extracted, then show the edge table
print(f"Total edges found: {edgelist.shape[0]}")
edgelist

## **Network Construction and Visualization**

In [None]:
# Build the mention graph from the source/target columns of the edge list
G = nx.from_pandas_edgelist(edgelist, source="source", target="target")

# Appearance shared by every node and edge in this first overview plot.
# NOTE(review): no create_using is passed above, so G is presumably an
# undirected graph and the arrow options may have no visible effect — confirm
# whether a directed graph was intended.
overview_style = {
    "with_labels": True,
    "node_color": "skyblue",
    "node_size": 1200,
    "arrowstyle": "->",
    "arrowsize": 20,
    "edge_color": "r",
    "font_size": 9,
}

# Large canvas so node labels stay legible
plt.figure(figsize=(30, 30))

# Draw the graph with a Kamada-Kawai layout
nx.draw(G, pos=nx.kamada_kawai_layout(G), **overview_style)
plt.show()

## **Network Metrics and Measurement**

### **Network Topology Measurement**

In [None]:
# Basic size and density figures for the mention graph
node_total = G.number_of_nodes()
edge_total = G.number_of_edges()
print(f"Number of Nodes: {node_total}")
print(f"Number of Edges: {edge_total}")
print(f"Graph Density: {nx.density(G):.4f}")

### **Centrality Measurements**

In [None]:
# Degree centrality per node (reused later to size nodes in the degree plot)
degree = nx.degree_centrality(G)

# Rank nodes by raw degree and print the ten best-connected accounts
sorted_degree = sorted(G.degree(), key=lambda pair: pair[1], reverse=True)[:10]
for node, deg in sorted_degree:
    print(f"  {node}: {deg}")

In [None]:
# Betweenness centrality (normalized) for every node. Computed once and
# reused for the ranking below: the algorithm is expensive, and the previous
# version ran it a second time just to sort the result.
betweenness = nx.betweenness_centrality(G, normalized=True)

# Top 10 nodes by Betweenness Centrality
sorted_betweenness = sorted(
    betweenness.items(),
    key=lambda x: x[1],
    reverse=True,
)[0:10]
for node, score in sorted_betweenness:
    print(f"  {node}: {score:.4f}")

In [None]:
# Closeness centrality for every node. Computed once and reused for the
# ranking below instead of running the computation a second time.
closeness = nx.closeness_centrality(G)

# Top 10 nodes by Closeness Centrality
sorted_closeness = sorted(
    closeness.items(), key=lambda x: x[1], reverse=True
)[0:10]
for node, score in sorted_closeness:
    print(f"  {node}: {score:.4f}")

In [None]:
# Redraw the network with node size proportional to degree centrality
plt.figure(figsize=(30, 30))

# Copy of the degree-centrality mapping: node -> centrality
d = dict(degree)

# Explicit node order so every entry in node_size lines up with its node
nodelist = list(d)
scaled_sizes = [centrality * 90000 for centrality in d.values()]

nx.draw(
    G,
    pos=nx.kamada_kawai_layout(G),
    nodelist=nodelist,
    node_size=scaled_sizes,
    node_color="skyblue",
    edge_color="r",
    with_labels=True,
    font_size=10,
    arrowstyle="->",
    arrowsize=20,
)
plt.show()

## **Community Detection**

In [None]:
# Detect communities with greedy modularity maximisation and order them from
# largest to smallest
communities_m = list(greedy_modularity_communities(G))
communities_m.sort(key=len, reverse=True)
print(f"Number of communities found: {len(communities_m)}")

# Helper: store each node's community id on the graph
def set_node_community(G, communities_m):
    """Write a 1-based community id into the ``"community"`` attribute of every node.

    ``communities_m`` is an iterable of node collections; the first collection
    gets id 1, the second id 2, and so on (0 is left free for external edges).
    The graph is modified in place and nothing is returned.
    """
    for community_id, members in enumerate(communities_m, start=1):
        for member in members:
            G.nodes[member]["community"] = community_id

# Helper: deterministic colour for a community index
def get_color(i, r_off=1, g_off=1, b_off=1):
    """Return an ``(r, g, b)`` tuple for index ``i``.

    Each channel takes one of 16 evenly spaced levels between 0.1 and 0.9.
    The level is ``((i + offset) * stride) % 16`` with strides 3, 5 and 7 for
    red, green and blue; the ``*_off`` arguments shift each channel separately.
    """
    n = 16
    low, high = 0.1, 0.9
    span = high - low

    def channel(offset, stride):
        level = ((i + offset) * stride) % n
        return low + span * level / (n - 1)

    return (channel(r_off, 3), channel(g_off, 5), channel(b_off, 7))

# Tag every node with its community id. The helper mutates G in place and
# returns nothing, so `community` ends up holding None.
community = set_node_community(G, communities_m)

# One colour per node, looked up from the node's community id
node_color = []
for v in G.nodes:
    node_color.append(get_color(G.nodes[v]["community"]))

# Draw the network with nodes coloured by community
plt.figure(figsize=(30, 30))
nx.draw(
    G,
    pos=nx.kamada_kawai_layout(G),
    node_color=node_color,
    node_size=1200,
    edge_color="r",
    with_labels=True,
    font_size=10,
    arrowstyle="->",
    arrowsize=20,
)
plt.show()

# **Section 3: Text Mining**

## **Word Cloud**

In [None]:
# Extract text column from dataframe
tweet_texts = df["text"]

# Drop missing tweets BEFORE building the corpus; casting NaN to str would
# otherwise inject the literal word "nan" into the word cloud.
tweet_texts_clean = tweet_texts.dropna().astype(str)

# Remove URLs. The dot after "www" is escaped so that only real "www."
# prefixes match (unescaped, it also swallowed words such as "awwww!!").
tweet_texts_clean = tweet_texts_clean.str.replace(r'http\S+|www\.\S+', '', regex=True)

# Remove mentions and hashtags (optional, comment out if you want to keep them)
tweet_texts_clean = tweet_texts_clean.str.replace(r'@\w+|#\w+', '', regex=True)

# Custom stopwords for digital currency context: words that would otherwise
# dominate the cloud, plus retweet/URL residue
custom_stopwords = set([
    'digital', 'currency', 'crypto', 'bitcoin', 'btc',
    'cryptocurrency', 'blockchain', 'coin', 'token',
    'rt', 'amp', 'https', 'http', 'www', 'com'
])

# Combine all tweets into one large text
all_tweets = " ".join(tweet_texts_clean)

# Merge the library's default stopwords with the custom ones
stopwords = STOPWORDS.union(custom_stopwords)

# Generate the word cloud (at most 100 words)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    colormap="viridis",
    max_words=100,
    stopwords=stopwords
).generate(all_tweets)

# Display the word cloud
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Digital Currency Tweets", fontsize=20)
plt.tight_layout(pad=0)
plt.show()

## **Text Clustering with K-Means**

In [None]:
# Start from the raw tweet text, lower-cased (sentiment analysis has not run
# yet at this point in the notebook, so nothing is reused from it)
clustering_texts = df["text"].str.lower()

# Clean text - remove URLs and mentions. The dot after "www" is escaped so
# only real "www." prefixes are treated as URLs.
clustering_texts = clustering_texts.str.replace(r'http\S+|www\.\S+', '', regex=True)
clustering_texts = clustering_texts.str.replace(r'@\w+', '', regex=True)

# Drop tweets with no usable text and renumber the index 0..n-1
clustering_texts = clustering_texts.dropna()
clustering_texts = clustering_texts.reset_index(drop=True)

print(f"Total texts for clustering: {len(clustering_texts)}")

In [None]:
# Vectorise the cleaned tweets with TF-IDF over unigrams and bigrams
tfidf_options = {
    "max_features": 1000,      # cap on vocabulary size
    "stop_words": 'english',
    "ngram_range": (1, 2),
    "min_df": 2,               # ignore terms seen in fewer than 2 tweets
    "max_df": 0.95,            # ignore terms seen in more than 95% of tweets
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Dense array: the later steps index and average its rows directly
tfidf_matrix = vectorizer.fit_transform(clustering_texts).toarray()

In [None]:
# Try k = 2..9 (fewer if there are not enough rows) and keep the k with the
# highest silhouette score
K = range(2, min(10, len(tfidf_matrix)))
silhouette_scores = []

for k in K:
    kmeans = KMeans(n_clusters=k, init="k-means++", random_state=42).fit(tfidf_matrix)
    silhouette_scores.append(silhouette_score(tfidf_matrix, kmeans.labels_))

# Position of the best score maps straight back to the candidate k
best_position = int(np.argmax(silhouette_scores))
optimal_k = K[best_position]
print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# Perform clustering with optimal k
kmeans = KMeans(n_clusters=optimal_k, init="k-means++", random_state=42)
cluster_labels = kmeans.fit_predict(tfidf_matrix)

# The clustering input dropped tweets whose text is missing (or not a
# string), so there can be fewer labels than rows in df. Keep exactly the
# rows that survived that cleaning: .str.lower() yields NaN for the same rows
# that dropna() removed. Assigning the label array below is positional.
has_text = df["text"].str.lower().notna()
df_clustered = df.loc[has_text].copy()
if len(df_clustered) != len(cluster_labels):
    raise ValueError(
        f"Cannot attach cluster labels: {len(cluster_labels)} labels for "
        f"{len(df_clustered)} tweets with text"
    )

# Add cluster labels to dataframe
df_clustered["cluster"] = cluster_labels

# Get cluster distribution (count and share of tweets per cluster)
cluster_counts = df_clustered['cluster'].value_counts().sort_index()
for cluster, count in cluster_counts.items():
    percentage = (count / len(df_clustered)) * 100
    print(f"Cluster {cluster}: {count} tweets ({percentage:.1f}%)")

In [None]:
# Project the TF-IDF vectors onto two dimensions with MDS for plotting
mds = MDS(n_components=2, random_state=42, n_init=4)
vis_dims = mds.fit_transform(tfidf_matrix)

# Copy of the clustered frame with the two MDS coordinates appended
df_vis = df_clustered.assign(mds_x=vis_dims[:, 0], mds_y=vis_dims[:, 1])

In [None]:
# Interactive scatter plot of the MDS projection, one trace per cluster
fig = go.Figure()

# Repeat the qualitative palette often enough to cover every cluster id
colors = pcolors.qualitative.Plotly
colors = colors * (optimal_k // len(colors) + 1)

for cluster in range(optimal_k):
    in_cluster = df_vis["cluster"] == cluster
    cluster_data = df_vis[in_cluster]

    # Nothing to draw for a cluster without members
    if len(cluster_data) == 0:
        continue

    cluster_color = colors[cluster]

    # First 100 characters of each tweet, shown on hover
    sample_texts = cluster_data["text"].str.slice(stop=100) + "..."

    trace = go.Scatter(
        x=cluster_data["mds_x"],
        y=cluster_data["mds_y"],
        mode="markers",
        name=f"Cluster {cluster}",
        marker={"color": cluster_color, "size": 8, "opacity": 0.7},
        text=sample_texts,
        hovertemplate=(
            f"<b>Cluster {cluster}</b><br>"
            "Text: %{text}<extra></extra>"
        ),
    )
    fig.add_trace(trace)

fig.update_layout(
    title={
        "text": f"Text Clustering Visualization (K={optimal_k})",
        "x": 0.5,
        "xanchor": "center",
    },
    xaxis_title="MDS Component 1",
    yaxis_title="MDS Component 2",
    height=600,
    showlegend=True,
    hovermode='closest'
)

# Write a standalone HTML copy next to the notebook, then render inline
fig.write_html("clustering_visualization.html")
fig.show()

In [None]:
# Static matplotlib version of the cluster scatter plot
plt.figure(figsize=(12, 8))

point_style = {"cmap": 'viridis', "alpha": 0.6, "s": 50}
scatter = plt.scatter(
    df_vis["mds_x"],
    df_vis["mds_y"],
    c=df_vis["cluster"],  # colour encodes the cluster id
    **point_style,
)

plt.colorbar(scatter, label='Cluster')
plt.title(f'Text Clustering Visualization (K={optimal_k})')
plt.xlabel('MDS Component 1')
plt.ylabel('MDS Component 2')
plt.tight_layout()
plt.show()

In [None]:
# Print the most characteristic TF-IDF terms of every cluster
feature_names = vectorizer.get_feature_names_out()

for cluster_id in range(optimal_k):
    # Rows of the TF-IDF matrix that belong to this cluster
    cluster_mask = cluster_labels == cluster_id
    cluster_tfidf = tfidf_matrix[cluster_mask]

    # Skip clusters without members
    if len(cluster_tfidf) == 0:
        continue

    # Average weight per term, then the ten highest-weighted terms
    mean_tfidf = cluster_tfidf.mean(axis=0)
    top_indices = np.argsort(mean_tfidf)[::-1][:10]
    top_terms = [feature_names[term_index] for term_index in top_indices]

    # Only the first five are printed
    print(f"\nCluster {cluster_id}: {top_terms[:5]}")

In [None]:
# Show the first two tweets of every cluster, truncated to 100 characters
for cluster_id in range(optimal_k):
    in_cluster = df_clustered['cluster'] == cluster_id
    cluster_tweets = df_clustered.loc[in_cluster, 'text'].head(2)
    print(f"\nCluster {cluster_id}:")
    for rank, tweet in enumerate(cluster_tweets, start=1):
        print(f"  {rank}. {tweet[:100]}...")

In [None]:
# Persist the tweets together with their cluster labels (row index omitted)
output_clustered = "tweet_digitalcurrency_clustered.csv"
df_clustered.to_csv(path_or_buf=output_clustered, index=False)

## **Sentiment Analysis with BERT**

In [None]:
# Hugging Face checkpoint used for tweet sentiment classification
pretrained = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and the sequence-classification model from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(pretrained)
model = AutoModelForSequenceClassification.from_pretrained(pretrained)

# Wrap both in a ready-to-call sentiment pipeline
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# Function to clean text for sentiment analysis
def clean_text_for_sentiment(text, max_chars=512):
    """Strip URLs and redundant whitespace from ``text`` and cap its length.

    Args:
        text: The raw tweet text (a ``str``).
        max_chars: Maximum number of characters to keep (default 512).

    Returns:
        The cleaned text, at most ``max_chars`` characters long.
    """
    # Remove URLs. The dot after "www" is escaped so that only real "www."
    # prefixes match; unescaped it also deleted words like "awwww!!".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Collapse runs of whitespace and trim both ends
    text = ' '.join(text.split())
    # Character-level cap only; this is not a token count
    return text[:max_chars]

# Function to perform sentiment analysis
def get_sentiment(text):
    """Classify one tweet and return ``(sentiment, score)``.

    Args:
        text: Tweet text. Missing values and blank strings are accepted;
            other non-string values are converted with ``str()``.

    Returns:
        A tuple ``(sentiment, score)`` where ``sentiment`` is normally
        ``'negative'``, ``'neutral'`` or ``'positive'`` and ``score`` is the
        score reported by the pipeline. Missing or blank text, text that is
        empty after cleaning, and failed classifications give
        ``('neutral', 0.0)``.
    """
    import warnings

    # Map the model's raw labels to a consistent lower-case format
    label_map = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive',
        'NEGATIVE': 'negative',
        'NEUTRAL': 'neutral',
        'POSITIVE': 'positive'
    }

    try:
        if pd.isna(text):
            return 'neutral', 0.0

        # Convert first so non-string cells are classified instead of
        # failing on .strip() and being labelled neutral by accident
        text = str(text)
        if text.strip() == '':
            return 'neutral', 0.0

        cleaned_text = clean_text_for_sentiment(text)
        if not cleaned_text:
            # Nothing left after cleaning (e.g. the tweet was only a URL)
            return 'neutral', 0.0

        # Truncate at token level too: the character cap applied during
        # cleaning does not bound the number of tokens.
        result = sentiment_analysis(cleaned_text, truncation=True, max_length=512)[0]

        # Unknown labels fall back to their lower-cased raw name
        sentiment = label_map.get(result['label'], result['label'].lower())
        score = result['score']

        return sentiment, score
    except Exception as exc:
        # Best-effort: one bad tweet must not abort the whole run, but the
        # failure is reported instead of being swallowed silently. Catching
        # Exception (not a bare except) lets KeyboardInterrupt through.
        warnings.warn(
            f"Sentiment analysis failed ({type(exc).__name__}); defaulting to neutral"
        )
        return 'neutral', 0.0

In [None]:
# Classify every tweet, printing a progress line every 100 tweets
n_tweets = len(df)
sentiment_results = []

for position, tweet_text in enumerate(df['text']):
    if position % 100 == 0:
        print(f"Processing tweet {position}/{n_tweets}...")
    sentiment_results.append(get_sentiment(tweet_text))

# Split the (label, score) pairs into two parallel lists
sentiments = [label for label, _ in sentiment_results]
scores = [confidence for _, confidence in sentiment_results]

# Store both results as new columns on the main frame
df['sentiment'] = sentiments
df['sentiment_score'] = scores

# Preview the first ten classified tweets
df.loc[:, ['text', 'sentiment', 'sentiment_score']].head(10)

In [None]:
# Number of tweets per sentiment label, most frequent first
sentiment_counts = df.loc[:, 'sentiment'].value_counts()
sentiment_counts

In [None]:
# Bar chart of the sentiment distribution
plt.figure(figsize=(10, 6))

# Fixed colour per sentiment; unexpected labels fall back to blue
colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
bar_colors = [colors.get(label, 'blue') for label in sentiment_counts.index]
sentiment_counts.plot(kind='bar', color=bar_colors)

plt.title('Sentiment Distribution of Digital Currency Tweets', fontsize=16)
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Number of Tweets', fontsize=12)
plt.xticks(rotation=0)

# Write each count above its bar
for position, tally in enumerate(sentiment_counts):
    plt.text(position, tally + 50, str(tally), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print the share of each sentiment among all tweets
total_tweets = len(df)
for sentiment, count in sentiment_counts.items():
    percentage = (count / total_tweets) * 100
    print(f"{sentiment}: {count} tweets ({percentage:.1f}%)")

In [None]:
# Persist the tweets together with their sentiment columns (row index omitted)
output_filename = "tweet_digitalcurrency_with_sentiment.csv"
df.to_csv(path_or_buf=output_filename, index=False)

## **Topic Modeling with BERTopic**

In [None]:
# Use the text column, lower-cased, as the topic-modelling corpus
input_docs = df["text"].str.lower()

# Remove URLs and mentions. The dot after "www" is escaped so only real
# "www." prefixes are treated as URLs.
input_docs = input_docs.str.replace(r'http\S+|www\.\S+', '', regex=True)
input_docs = input_docs.str.replace(r'@\w+', '', regex=True)  # Remove mentions

# Drop tweets with no usable text and renumber the index 0..n-1
input_docs = input_docs.dropna()
input_docs = input_docs.reset_index(drop=True)

print(f"Total documents for topic modeling: {len(input_docs)}")

In [None]:
# Embed every document with the all-MiniLM-L6-v2 sentence-transformer
# NOTE(review): this was previously described both as "multilingual" and as a
# "faster English model"; the checkpoint name suggests the latter — confirm.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(list(input_docs), show_progress_bar=True)

In [None]:
# UMAP settings shared by the modelling and the plotting projections
shared_umap_kwargs = {
    "n_neighbors": 15,
    "min_dist": 0.0,
    "metric": 'cosine',
    "random_state": 42,
}

# 5-dimensional reduction passed to BERTopic
umap_model = UMAP(n_components=5, **shared_umap_kwargs)

# Clustering model passed to BERTopic
hdbscan_model = HDBSCAN(
    min_cluster_size=30,  # Adjust based on dataset size
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# Separate 2-dimensional reduction, used only for the document scatter plot
reduced_embeddings = UMAP(n_components=2, **shared_umap_kwargs).fit_transform(embeddings)

# Two additional topic representations, stored under their own names
keybert = KeyBERTInspired()
mmr = MaximalMarginalRelevance(diversity=0.3)
representation_model = dict(KeyBERT=keybert, MMR=mmr)

In [None]:
# Assemble BERTopic from the components configured above
bertopic_components = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "representation_model": representation_model,
}
topic_model = BERTopic(top_n_words=10, verbose=True, **bertopic_components)

# Fit on the cleaned documents, passing the embeddings computed beforehand
topics, probs = topic_model.fit_transform(input_docs, embeddings)

In [None]:
# Get topic information
topic_info = topic_model.get_topic_info()

# Count topics by excluding the noise topic (-1) explicitly. Subtracting 1
# from the row count undercounts by one whenever the model produced no noise
# topic at all.
real_topics = topic_info[topic_info.Topic != -1]
print(f"Number of topics found: {len(real_topics)}")

# Display the first ten non-noise topics
real_topics.head(10)[['Topic', 'Count', 'Name']]

In [None]:
# Helper: per-topic document counts, shares and keywords
def get_topic_distribution(topic_model, input_docs):
    """Summarise how the documents are distributed over the topics.

    Calls ``topic_model.get_document_info(input_docs)`` and returns one row per
    topic id (ascending) with the columns ``Topic``, ``n_documents``,
    ``percentage`` (share of all documents), ``keywords`` (first
    ``Top_n_words`` value of the topic) and ``percentage_wo_noise`` (share of
    the documents not assigned to topic -1; 0 for topic -1 itself).
    """
    doc_info = topic_model.get_document_info(input_docs)
    assigned_topics = doc_info['Topic']

    # Documents per topic, ordered by topic id
    per_topic = assigned_topics.value_counts().sort_index()

    # Denominators: all documents, and documents outside the noise topic
    n_all = len(doc_info)
    n_without_noise = len(doc_info[assigned_topics != -1])

    # One keyword string per topic (groupby also orders by topic id)
    first_keywords = doc_info.groupby('Topic')['Top_n_words'].first()

    summary = pd.DataFrame({
        'Topic': per_topic.index,
        'n_documents': per_topic.values,
        'percentage': (per_topic.values / n_all) * 100,
        'keywords': first_keywords.values
    })

    def share_excluding_noise(row):
        # The noise topic gets 0 by definition
        if row['Topic'] == -1:
            return 0
        return row['n_documents'] / n_without_noise * 100

    summary['percentage_wo_noise'] = summary.apply(share_excluding_noise, axis=1)

    return summary

# Build the per-topic summary table for the fitted model
topic_summary = get_topic_distribution(topic_model, input_docs)

# Show the first ten topics, leaving out the noise topic (-1)
is_real_topic = topic_summary['Topic'] != -1
topic_summary[is_real_topic].head(10)


In [None]:
# Persist the per-topic summary table (row index omitted)
topic_summary.to_csv(path_or_buf='topic_summary.csv', index=False)

In [None]:
# Bar chart: number of documents per topic, noise topic (-1) excluded
plt.figure(figsize=(12, 6))

non_noise = topic_summary[topic_summary['Topic'] != -1]
topic_counts = non_noise['n_documents']
topic_labels = [f"Topic {i}" for i in non_noise['Topic']]
bar_positions = range(len(topic_counts))

plt.bar(bar_positions, topic_counts)
plt.title('Distribution of Documents across Topics')
plt.xlabel('Topic')
plt.ylabel('Number of Documents')
plt.xticks(bar_positions, topic_labels, rotation=45, ha='right')

# Write each count above its bar
for position, n_docs in enumerate(topic_counts):
    plt.text(position, n_docs + 10, str(n_docs), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Interactive document map, drawn on the 2-D UMAP projection computed earlier
document_plot_options = {
    "reduced_embeddings": reduced_embeddings,
    "hide_annotations": True,
    "hide_document_hover": False,
    "custom_labels": True,
}
fig = topic_model.visualize_documents(input_docs, **document_plot_options)

# Write a standalone HTML copy next to the notebook, then render inline
fig.write_html("topic_visualization.html")
fig.show()

In [None]:
# Print two representative documents for each of the first five non-noise topics
top_topic_ids = topic_summary.loc[topic_summary['Topic'] != -1, 'Topic'].head(5)

for topic in top_topic_ids:
    print(f"\nTopic {topic}:")
    representative_docs = topic_model.get_representative_docs(topic)
    # Two examples per topic, each cut to 100 characters
    for rank, doc in enumerate(representative_docs[:2], start=1):
        print(f"  {rank}. {doc[:100]}...")