# Importing Libraries

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import datetime
import os
from community import community_louvain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
pip install python-louvain



In [None]:
import community as community_louvain

# Importing Data

In [None]:
# Load the edge list; each row links the account in 'Vertex 1' to the account in 'Vertex 2'.
# NOTE(review): "/content/..." is an absolute, Colab-style path - adjust it when running elsewhere.
tweets = pd.read_excel("/content/tweets.xlsx")

In [None]:
tweets.head()

Unnamed: 0,Vertex 1,Vertex 2,Colour,Width,Style,Opacity,Visibility,Label,Label Text Colour,Label Font Size,...,Tweet,URLs in Tweet,Domains in Tweet,Hashtags in Tweet,Tweet Date (UTC),Twitter Page for Tweet,Latitude,Longitude,Imported ID,In-Reply Tweet ID
0,sanchitabhartiy,thexoxoday,,,,,,,,,...,RT @thexoxoday: Our #Contest is #live! #RT and...,,,contest live rt,2017-01-25 11:27:34,https://twitter.com/#!/sanchitabhartiy/status/...,,,824217142681862144,
1,ratneshnagori,divyamisra2,,,,,,,,,...,@thexoxoday @kunalgupta09 @ekta_k88 @kp_85 @sn...,,,,2017-01-25 11:31:10,https://twitter.com/#!/ratneshnagori/status/82...,,,824218047707652096,8.242167e+17
2,ratneshnagori,kp_85,,,,,,,,,...,@thexoxoday @kunalgupta09 @ekta_k88 @kp_85 @sn...,,,,2017-01-25 11:31:10,https://twitter.com/#!/ratneshnagori/status/82...,,,824218047707652096,8.242167e+17
3,ratneshnagori,soodabhinav08,,,,,,,,,...,@thexoxoday @kunalgupta09 @ekta_k88 @kp_85 @sn...,,,,2017-01-25 11:31:10,https://twitter.com/#!/ratneshnagori/status/82...,,,824218047707652096,8.242167e+17
4,ratneshnagori,snehalataj,,,,,,,,,...,@thexoxoday @kunalgupta09 @ekta_k88 @kp_85 @sn...,,,,2017-01-25 11:31:10,https://twitter.com/#!/ratneshnagori/status/82...,,,824218047707652096,8.242167e+17


# Creating a Directed graph

In [None]:
# Three empty directed graphs; the edge-loading cell below fills them.
G = nx.DiGraph()           # mentions and replies combined
G_mentions = nx.DiGraph()  # 'Mentions' edges only
G_replies = nx.DiGraph()   # 'Replies to' edges only

## Adding all edges

In [None]:
# Route each interaction type to its dedicated graph; every kept edge also goes into G.
graph_by_relation = {'Mentions': G_mentions, 'Replies to': G_replies}

# Keep only rows that name both endpoints and describe a mention or a reply.
has_both_endpoints = tweets['Vertex 1'].notna() & tweets['Vertex 2'].notna()
is_interaction = tweets['Relationship'].isin(list(graph_by_relation))
interaction_rows = tweets.loc[
    has_both_endpoints & is_interaction,
    ['Vertex 1', 'Vertex 2', 'Relationship'],
]

# Rows are visited in file order, so nodes enter the graphs in the same order as before.
for source, target, relation in interaction_rows.itertuples(index=False, name=None):
    G.add_edge(source, target)  # Main graph
    graph_by_relation[relation].add_edge(source, target)

print(f"Combined graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"Mentions graph: {G_mentions.number_of_nodes()} nodes, {G_mentions.number_of_edges()} edges")
print(f"Replies graph: {G_replies.number_of_nodes()} nodes, {G_replies.number_of_edges()} edges")

Combined graph: 2493 nodes, 3204 edges
Mentions graph: 2489 nodes, 3184 edges
Replies graph: 91 nodes, 94 edges


Plain tweets are removed and only the mentions and replies are kept: for a plain tweet both Vertex 1 and Vertex 2 are thexoxoday (a self-loop), which would give thexoxoday unnecessary importance even though no interaction between accounts took place.

There are not many more edges than nodes (3,204 edges for 2,493 nodes, roughly 1.29 edges per node), so there are few connections. This is a relatively sparse network, especially for a social media network, which should ideally be dense for a company with real engagement.

Very few replies were made to xoxoday tweets; most of the interactions are mentions, mainly for contests.

# Calculating all Centrality Measures

## Wrapper Function to Safely Compute Centrality Measures

In [None]:
def safe_centrality(func, graph, **kwargs):
    """Run a centrality function, falling back to an empty dict on failure.

    Args:
        func: callable invoked as ``func(graph, **kwargs)``.
        graph: graph handed to ``func`` as its first argument.
        **kwargs: extra keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns, or ``{}`` if it raised an ``Exception``
        (the error is printed, not re-raised).
    """
    result = {}
    try:
        result = func(graph, **kwargs)
    except Exception as err:
        # Best effort: report the failure and let the caller carry on with no scores.
        print(f"Error computing {func.__name__}: {err}")
    return result

## Basic Centrality Measures

In [None]:
def compute_centralities(graph, name="Graph"):
    """Compute a set of centrality measures for a directed graph.

    A progress line is printed for each measure. Measures computed through
    ``safe_centrality`` fall back to an empty dict when the underlying
    networkx routine raises, so one failing measure does not abort the run.

    Args:
        graph: directed networkx graph to analyse.
        name: label used in the progress message.

    Returns:
        dict with the keys ``in_degree``, ``out_degree``, ``betweenness``,
        ``closeness``, ``eigenvector``, ``pagerank``, ``hubs``,
        ``authorities`` and ``katz``; each value maps node -> score.
        When the graph is not weakly connected, ``closeness`` only covers
        the nodes of the largest weakly connected component.
    """
    print(f"\nCalculating centralities for {name}...")

    print("  - In/Out Degree Centrality")
    in_degree = nx.in_degree_centrality(graph)
    out_degree = nx.out_degree_centrality(graph)

    print("  - Betweenness Centrality")
    betweenness = safe_centrality(nx.betweenness_centrality, graph)

    print("  - Closeness Centrality (on largest WCC)")
    if nx.is_weakly_connected(graph):
        closeness = safe_centrality(nx.closeness_centrality, graph)
    else:
        # Restrict to the largest weakly connected component.
        largest_wcc = max(nx.weakly_connected_components(graph), key=len)
        subgraph = graph.subgraph(largest_wcc)
        closeness = safe_centrality(nx.closeness_centrality, subgraph)

    print("  - Eigenvector Centrality")
    eigenvector = safe_centrality(nx.eigenvector_centrality, graph)

    print("  - PageRank")
    pagerank = safe_centrality(nx.pagerank, graph)

    print("  - HITS")
    hits_scores = safe_centrality(nx.hits, graph)
    # On failure safe_centrality returns {}, which cannot be unpacked into
    # (hubs, authorities); substitute two empty score dicts instead.
    hubs, authorities = hits_scores if hits_scores else ({}, {})

    print("  - Katz Centrality")
    katz = safe_centrality(nx.katz_centrality_numpy, graph)

    return {
        'in_degree': in_degree,
        'out_degree': out_degree,
        'betweenness': betweenness,
        'closeness': closeness,
        'eigenvector': eigenvector,
        'pagerank': pagerank,
        'hubs': hubs,
        'authorities': authorities,
        'katz': katz
    }

# Compute every measure for the combined, mentions-only and replies-only graphs (in that order)
centralities_all, centralities_mentions, centralities_replies = (
    compute_centralities(network, network_label)
    for network, network_label in (
        (G, "Combined"),
        (G_mentions, "Mentions"),
        (G_replies, "Replies"),
    )
)




Calculating centralities for Combined...
  - In/Out Degree Centrality
  - Betweenness Centrality
  - Closeness Centrality (on largest WCC)
  - Eigenvector Centrality
  - PageRank
  - HITS
  - Katz Centrality

Calculating centralities for Mentions...
  - In/Out Degree Centrality
  - Betweenness Centrality
  - Closeness Centrality (on largest WCC)
  - Eigenvector Centrality
  - PageRank
  - HITS
  - Katz Centrality

Calculating centralities for Replies...
  - In/Out Degree Centrality
  - Betweenness Centrality
  - Closeness Centrality (on largest WCC)
  - Eigenvector Centrality
  - PageRank
  - HITS
  - Katz Centrality


## Combine All Measures into a Dataframe

In [None]:
def build_centrality_df(graph, centralities):
    """Assemble one row per node with all centrality scores as columns.

    Args:
        graph: object whose ``nodes()`` yields the nodes to tabulate.
        centralities: dict of score dicts as returned by ``compute_centralities``.

    Returns:
        DataFrame with a ``node`` column followed by one column per measure;
        a node missing from a score dict gets 0 for that measure.
    """
    nodes = list(graph.nodes())

    # (output column, key in `centralities`) - listed in the column order of the result
    column_sources = (
        ('in_degree', 'in_degree'),
        ('out_degree', 'out_degree'),
        ('betweenness', 'betweenness'),
        ('closeness', 'closeness'),
        ('eigenvector', 'eigenvector'),
        ('pagerank', 'pagerank'),
        ('hub_score', 'hubs'),
        ('authority_score', 'authorities'),
        ('katz', 'katz'),
    )

    table = {'node': nodes}
    for column, measure in column_sources:
        scores = centralities[measure]
        table[column] = [scores.get(node, 0) for node in nodes]
    return pd.DataFrame(table)


In [None]:
# One centrality table per graph, paired with the scores computed for that graph
df_all, df_mentions, df_replies = (
    build_centrality_df(network, scores)
    for network, scores in (
        (G, centralities_all),
        (G_mentions, centralities_mentions),
        (G_replies, centralities_replies),
    )
)


# Identify Top Users By Each Dataframe

In [None]:
def print_top_nodes(df, label="Graph"):
    """Print the ten highest-scoring nodes for every centrality column.

    Args:
        df: centrality table with a ``node`` column plus the score columns
            listed in ``sections`` below.
        label: graph name shown in the banner line.
    """
    # (heading printed above the table, column used for ranking)
    sections = (
        ("In-Degree", 'in_degree'),
        ("Out-Degree", 'out_degree'),
        ("Betweenness Centrality", 'betweenness'),
        ("Closeness Centrality", 'closeness'),
        ("Eigenvector Centrality", 'eigenvector'),
        ("PageRank", 'pagerank'),
        ("Hubs Score", 'hub_score'),
        ("Authorities Score", 'authority_score'),
        ("Katz Centrality", 'katz'),
    )

    print(f"\nTop 10 Nodes for {label}:\n" + "-" * 40)

    for heading, column in sections:
        ranked = df.sort_values(by=column, ascending=False).head(10)
        print(f"\n{heading}:")
        print(ranked[['node', column]])



In [None]:
# Print the rankings for each graph's centrality table in turn
for frame, frame_label in (
    (df_all, "Combined Graph"),
    (df_mentions, "Mentions Graph"),
    (df_replies, "Replies Graph"),
):
    print_top_nodes(frame, frame_label)



Top 10 Nodes for Combined Graph:
----------------------------------------

In-Degree:
                node  in_degree
1         thexoxoday   0.099518
23          dewcool2   0.015249
54     iheartcontest   0.006019
108    pinkydholakia   0.005618
44     contestmantra   0.005618
55      contesttable   0.005618
52             broke   0.005217
53    photo_contests   0.005217
62   contestadventur   0.004815
101   india4contests   0.004815

Out-Degree:
                 node  out_degree
62    contestadventur    0.777689
2261           shwtz7    0.026485
1          thexoxoday    0.020064
240       girija_kriz    0.014446
2187        nehalroys    0.010835
307          kumar623    0.010433
2249    shraddha_bari    0.010032
251      blessedkamal    0.010032
270   karrivinodkumar    0.009631
2248  pratima_talreja    0.007624

Betweenness Centrality:
                 node  betweenness
62    contestadventur     0.080841
1          thexoxoday     0.079779
2248  pratima_talreja     0.001631
2249    s

The most influential accounts are contest accounts. They probably do not represent actual users or buyers; they are more likely giveaway-aggregator accounts that look for contest tweets from different companies and retweet them, perhaps to their own followers. This can give even new users the impression that the company only runs contests because it is not doing well - a company should not come across as that desperate.

Among the top nodes of the replies graph there are fewer contest accounts and more accounts that are probably general users, followers, and buyers.


users
dewcool2 - indegree, outdegree, betweenness, katz, authorities, eigenvector, pagerank,
shraddha_bari - outdegree, hubs
pratima_talreja - betweenness
girija_kriz
thefreejinn - eigenvector, katz, authorities
blessed_kamal


replies
nehayagnik - indegree,outdegree,betweenness, pagerank, hubs
mgossipqueen - indegree, betweenness, closeness
binee_kukreja - closeness, indegree, eigenvector, katz
bpb_mumbai - eigenvector, closeness, katz
deepakberiwala -eigenvector, pagerank, closeness, indegree, katz

lots of contests
iheartcontest
contesttable
contestmantra
contestmela
contests2share
contestadventur
contestindia
photocontests
contestkiduniya

interesting - users - mentions
uber - pagerank
olacabs - pagerank

interesting - users - replies
uber - pagerank
uber_india - indegree, katz
olacabs - pagerank

# Connected Components

In [None]:
def print_connected_components_info(graph, name):
    """Print count and size statistics of the weak and strong components.

    Args:
        graph: directed networkx graph to inspect.
        name: label printed in the heading line.
    """
    # Both component listings are computed before anything is printed.
    component_sizes = (
        ("Weakly", [len(members) for members in nx.weakly_connected_components(graph)]),
        ("Strongly", [len(members) for members in nx.strongly_connected_components(graph)]),
    )

    print(f"\n{name} - Connected Components:")

    for kind, sizes in component_sizes:
        print(f"  {kind} connected components: {len(sizes)}")
        print(f"    Largest size: {max(sizes)}")
        print(f"    Smallest size: {min(sizes)}")
        print(f"    Average size: {sum(sizes) / len(sizes):.2f}")

# Report the component structure of the combined, mentions and replies graphs
for network, network_label in (
    (G, "Full Graph (Mentions + Replies)"),
    (G_mentions, "Mentions Graph"),
    (G_replies, "Replies Graph"),
):
    print_connected_components_info(network, network_label)



Full Graph (Mentions + Replies) - Connected Components:
  Weakly connected components: 1
    Largest size: 2493
    Smallest size: 2493
    Average size: 2493.00
  Strongly connected components: 2406
    Largest size: 73
    Smallest size: 1
    Average size: 1.04

Mentions Graph - Connected Components:
  Weakly connected components: 1
    Largest size: 2489
    Smallest size: 2489
    Average size: 2489.00
  Strongly connected components: 2406
    Largest size: 70
    Smallest size: 1
    Average size: 1.03

Replies Graph - Connected Components:
  Weakly connected components: 3
    Largest size: 86
    Smallest size: 2
    Average size: 30.33
  Strongly connected components: 86
    Largest size: 5
    Smallest size: 1
    Average size: 1.06


In [None]:
# Work on a copy so the original graph stays intact, then drop the brand's own
# account (node 'thexoxoday'). remove_node also removes every edge attached to it.
G_no_xoxoday = G.copy()
G_no_xoxoday.remove_node('thexoxoday')

# Same treatment for copies of the Mentions and Replies graphs
G_mentions_no_xoxoday = G_mentions.copy()
G_mentions_no_xoxoday.remove_node('thexoxoday')

G_replies_no_xoxoday = G_replies.copy()
G_replies_no_xoxoday.remove_node('thexoxoday')

# Recompute connected components on the three graphs without 'thexoxoday'
print_connected_components_info(G_no_xoxoday, "Full Graph (Mentions + Replies) without xoxoday")
print_connected_components_info(G_mentions_no_xoxoday, "Mentions Graph without xoxoday")
print_connected_components_info(G_replies_no_xoxoday, "Replies Graph without xoxoday")



Full Graph (Mentions + Replies) without xoxoday - Connected Components:
  Weakly connected components: 132
    Largest size: 2314
    Smallest size: 1
    Average size: 18.88
  Strongly connected components: 2434
    Largest size: 38
    Smallest size: 1
    Average size: 1.02

Mentions Graph without xoxoday - Connected Components:
  Weakly connected components: 131
    Largest size: 2311
    Smallest size: 1
    Average size: 18.99
  Strongly connected components: 2432
    Largest size: 38
    Smallest size: 1
    Average size: 1.02

Replies Graph without xoxoday - Connected Components:
  Weakly connected components: 82
    Largest size: 3
    Smallest size: 1
    Average size: 1.10
  Strongly connected components: 89
    Largest size: 2
    Smallest size: 1
    Average size: 1.01


# Transitivity / Clustering Coefficient

In [None]:
# Average clustering coefficient of the combined, mentions and replies graphs
avg_clustering_full, avg_clustering_mentions, avg_clustering_replies = map(
    nx.average_clustering, (G, G_mentions, G_replies)
)

print(f"\nAverage clustering coefficient for Full Graph (Directed): {avg_clustering_full}")
print(f"\nAverage clustering coefficient for Mentions Graph (Directed): {avg_clustering_mentions}")
print(f"\nAverage clustering coefficient for Replies Graph (Directed): {avg_clustering_replies}")



Average clustering coefficient for Full Graph (Directed): 0.045483681173639566

Average clustering coefficient for Mentions Graph (Directed): 0.044164230486861536

Average clustering coefficient for Replies Graph (Directed): 0.01099055179257086


In [None]:
# Unweighted average clustering coefficient of the three graphs after removing 'thexoxoday'
(clustering_coef_no_xoxoday_full,
 clustering_coef_no_xoxoday_mentions,
 clustering_coef_no_xoxoday_replies) = (
    nx.average_clustering(network, weight=None)
    for network in (G_no_xoxoday, G_mentions_no_xoxoday, G_replies_no_xoxoday)
)

print(f"Average clustering coefficient for Full Graph without 'xoxoday' (Directed): {clustering_coef_no_xoxoday_full}")
print(f"Average clustering coefficient for Mentions Graph without 'xoxoday' (Directed): {clustering_coef_no_xoxoday_mentions}")
print(f"Average clustering coefficient for Replies Graph without 'xoxoday' (Directed): {clustering_coef_no_xoxoday_replies}")


Average clustering coefficient for Full Graph without 'xoxoday' (Directed): 0.03682600606137311
Average clustering coefficient for Mentions Graph without 'xoxoday' (Directed): 0.035981912796703895
Average clustering coefficient for Replies Graph without 'xoxoday' (Directed): 0.0


In [None]:
# Degree assortativity for the Full Graph (with and without 'thexoxoday')
assortativity_full = nx.degree_assortativity_coefficient(G)
assortativity_full_no_xoxoday = nx.degree_assortativity_coefficient(G_no_xoxoday)

# Degree assortativity for the Mentions Graph (with and without 'thexoxoday')
assortativity_mentions = nx.degree_assortativity_coefficient(G_mentions)
assortativity_mentions_no_xoxoday = nx.degree_assortativity_coefficient(G_mentions_no_xoxoday)

# Degree assortativity for the Replies Graph (with and without 'thexoxoday')
# NOTE(review): the recorded run printed nan for the replies graph without the hub,
# together with a NumPy warning from the line dividing by sqrt(vara * varb) -
# presumably that term is zero for this graph; treat the value as undefined.
assortativity_replies = nx.degree_assortativity_coefficient(G_replies)
assortativity_replies_no_xoxoday = nx.degree_assortativity_coefficient(G_replies_no_xoxoday)

# Print results, one with/without pair per graph
print(f"Degree assortativity for Full Graph (with 'xoxoday'): {assortativity_full}")
print(f"Degree assortativity for Full Graph (without 'xoxoday'): {assortativity_full_no_xoxoday}")

print(f"Degree assortativity for Mentions Graph (with 'xoxoday'): {assortativity_mentions}")
print(f"Degree assortativity for Mentions Graph (without 'xoxoday'): {assortativity_mentions_no_xoxoday}")

print(f"Degree assortativity for Replies Graph (with 'xoxoday'): {assortativity_replies}")
print(f"Degree assortativity for Replies Graph (without 'xoxoday'): {assortativity_replies_no_xoxoday}")


Degree assortativity for Full Graph (with 'xoxoday'): -0.388050420374134
Degree assortativity for Full Graph (without 'xoxoday'): -0.4329488018878232
Degree assortativity for Mentions Graph (with 'xoxoday'): -0.38744128173397335
Degree assortativity for Mentions Graph (without 'xoxoday'): -0.4332558356614266
Degree assortativity for Replies Graph (with 'xoxoday'): -0.6693109942772173
Degree assortativity for Replies Graph (without 'xoxoday'): nan


  return float((xy * (M - ab)).sum() / np.sqrt(vara * varb))


# Detecting Communities

In [None]:
from collections import Counter
import community.community_louvain as community_louvain  # ✅ correct import


def detect_communities(graph, label, random_state=42):
    """Detect Louvain communities on the undirected version of a graph.

    Args:
        graph: networkx graph; it is converted with ``to_undirected()``
            because Louvain works on undirected graphs.
        label: name printed in the progress message.
        random_state: seed forwarded to ``best_partition``. Louvain is
            randomised, so a fixed seed keeps the community counts stable
            between runs; pass ``None`` for an unseeded run.

    Returns:
        ``(partition, community_sizes)`` where ``partition`` maps
        node -> community id and ``community_sizes`` is a ``Counter`` of
        community id -> number of nodes.
    """
    print(f"\nDetecting communities in {label}...")

    # Louvain requires an undirected graph; this is still necessary
    undirected_graph = graph.to_undirected()
    partition = community_louvain.best_partition(undirected_graph, random_state=random_state)

    # Count community sizes
    community_sizes = Counter(partition.values())
    print(f"  Number of communities detected: {len(community_sizes)}")
    # default=0 keeps an empty graph (no communities) from raising in max()
    print(f"  Size of largest community: {max(community_sizes.values(), default=0)}")

    return partition, community_sizes

# Run community detection on all six graphs (each graph with and without the
# 'thexoxoday' node). Only the node -> community mapping is kept; the size
# counter returned alongside it is discarded via `_`.
partition_full, _ = detect_communities(G, "Full Graph (with 'xoxoday')")
partition_full_no_xoxo, _ = detect_communities(G_no_xoxoday, "Full Graph (without 'xoxoday')")

partition_mentions, _ = detect_communities(G_mentions, "Mentions Graph (with 'xoxoday')")
partition_mentions_no_xoxo, _ = detect_communities(G_mentions_no_xoxoday, "Mentions Graph (without 'xoxoday')")

partition_replies, _ = detect_communities(G_replies, "Replies Graph (with 'xoxoday')")
partition_replies_no_xoxo, _ = detect_communities(G_replies_no_xoxoday, "Replies Graph (without 'xoxoday')")





Detecting communities in Full Graph (with 'xoxoday')...
  Number of communities detected: 11
  Size of largest community: 1860

Detecting communities in Full Graph (without 'xoxoday')...
  Number of communities detected: 141
  Size of largest community: 1860

Detecting communities in Mentions Graph (with 'xoxoday')...
  Number of communities detected: 12
  Size of largest community: 1860

Detecting communities in Mentions Graph (without 'xoxoday')...
  Number of communities detected: 142
  Size of largest community: 1860

Detecting communities in Replies Graph (with 'xoxoday')...
  Number of communities detected: 8
  Size of largest community: 76

Detecting communities in Replies Graph (without 'xoxoday')...
  Number of communities detected: 82
  Size of largest community: 3


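| Graph Type | With 'xoxoday' | Without 'xoxoday' | Largest Community Size | Fragmentation |
|---|---|---|---|---|
| Full (Mentions+Replies) | 11 communities | 141 communities | 1860 | ↑ Higher without xoxo |
| Mentions | 12 communities | 142 communities | 1860 | ↑ Higher without xoxo |
| Replies | 8 communities | 82 communities | 76 → 3 | Huge ↑ without xoxo |

(Counts are taken from the run shown above; Louvain is randomised, so they can differ slightly between runs.)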


# Hashtag Analysis

In [None]:
if 'Hashtags in Tweet' in tweets.columns:
    print("\nAnalyzing hashtags...")

    # Each cell holds all tags of one row as a single string. In this dataset the
    # tags are separated by spaces (e.g. "contest live rt"), so split on whitespace
    # and treat commas as separators too. Splitting on commas alone kept the whole
    # string as one "hashtag".
    # NOTE(review): the preview shows the same tweet repeated on several rows (one
    # row per edge), so frequencies count rows rather than distinct tweets - confirm
    # whether de-duplicating tweets first is wanted.
    tags_per_row = [
        hashtags.replace(',', ' ').split()
        for hashtags in tweets['Hashtags in Tweet'].dropna()
        if isinstance(hashtags, str)
    ]

    # Count hashtag frequency
    all_hashtags = [tag for tags in tags_per_row for tag in tags]
    hashtag_counts = Counter(all_hashtags)

    print("Top 10 hashtags:")
    for hashtag, count in hashtag_counts.most_common(10):
        print(f"{hashtag}: {count}")

    # Create hashtag co-occurrence network: edge weight = number of rows in which
    # the two tags appear together
    hashtag_network = nx.Graph()

    for tags in tags_per_row:
        # De-duplicate within a row (order kept) so a repeated tag cannot create a self-loop
        unique_tags = list(dict.fromkeys(tags))
        for i, tag1 in enumerate(unique_tags):
            for tag2 in unique_tags[i + 1:]:
                if hashtag_network.has_edge(tag1, tag2):
                    hashtag_network[tag1][tag2]['weight'] += 1
                else:
                    hashtag_network.add_edge(tag1, tag2, weight=1)

    # Calculate centrality measures for hashtags
    if hashtag_network.number_of_nodes() > 0:
        hashtag_degree = nx.degree_centrality(hashtag_network)
        hashtag_betweenness = nx.betweenness_centrality(hashtag_network)

        # Create dataframe of hashtag metrics (only tags that co-occur with another tag)
        network_tags = list(hashtag_network.nodes())
        hashtag_df = pd.DataFrame({
            'hashtag': network_tags,
            'frequency': [hashtag_counts.get(tag, 0) for tag in network_tags],
            'degree': [hashtag_degree.get(tag, 0) for tag in network_tags],
            'betweenness': [hashtag_betweenness.get(tag, 0) for tag in network_tags]
        })

        # Sort by frequency
        hashtag_df = hashtag_df.sort_values('frequency', ascending=False)

        print("\nTop hashtags by network centrality:")
        print(hashtag_df.head(10))



Analyzing hashtags...
Top 10 hashtags:
contest live rt: 186
contest: 151
live participate rt win contestalert contest: 103
contest rt win contestalert: 75
contestalert: 65
participate contest rt: 57
live contest rt: 46
contestalert contest: 43
contest contestalert: 29
contest live rt contestalert: 28


# Visualizations

In [None]:
import os
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
from collections import Counter

# Create output directory for visualizations
os.makedirs('network_visualizations', exist_ok=True)

# Convert to undirected graph for community detection
G_undirected = G.to_undirected()

# Perform community detection on the undirected graph.
# Louvain is randomised; a fixed seed keeps the community colours of the figure
# reproducible between runs.
partition = community_louvain.best_partition(G_undirected, random_state=42)

# Calculate PageRank for the graph (this will give you a dictionary of node -> PageRank)
pagerank = nx.pagerank(G)

# 1. Network visualization with community colors
def plot_network(G, partition=None, filename='network.png', title='Network Visualization',
                 node_size_attr=None, layout=nx.spring_layout):
    """Draw a graph, save it under ``network_visualizations/`` and show it.

    Args:
        G: networkx graph to draw.
        partition: optional dict node -> community id used to colour nodes.
        filename: file name of the saved image inside ``network_visualizations/``.
        title: figure title.
        node_size_attr: optional dict node -> score; scales node sizes and
            selects which nodes get a text label.
        layout: callable returning node positions for ``G``.
    """
    plt.figure(figsize=(12, 12))
    pos = layout(G)

    # Scale node sizes by the score when one is given, otherwise use a fixed size
    node_size = (
        [50 + 1000 * node_size_attr.get(node, 0) for node in G.nodes()]
        if node_size_attr
        else 50
    )

    # Colour nodes by community when a partition is given
    node_style = dict(node_size=node_size, alpha=0.8)
    if partition:
        node_style.update(
            node_color=[partition.get(node, 0) for node in G.nodes()],
            cmap=plt.cm.rainbow,
        )
    nx.draw_networkx_nodes(G, pos, **node_style)

    nx.draw_networkx_edges(G, pos, alpha=0.2, arrows=True)

    # Label only the 20 highest-scoring nodes to keep the figure readable
    if node_size_attr:
        ranked = sorted(node_size_attr, key=node_size_attr.get, reverse=True)
        labels = {node: node for node in ranked[:20] if node in G}
        nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)

    plt.title(title)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(f'network_visualizations/{filename}', dpi=300)
    plt.show()
    plt.close()



# Draw the combined graph: node colour = Louvain community, node size = PageRank
plot_network(G, partition=partition, filename='network_communities.png',
             title='Twitter Network with Communities', node_size_attr=pagerank)



In [None]:
pip install nltk scikit-learn wordcloud matplotlib

In [None]:
import nltk
# Fetch the NLTK resources up front
nltk.download('punkt')      # tokenizer models (word_tokenize is imported further down)
nltk.download('stopwords')  # NLTK stop-word lists (the notebook also defines its own STOPWORDS set)
nltk.download('wordnet')    # WordNet data, needed by the WordNetLemmatizer used in preprocess_text

In [None]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud

In [None]:
!pip install "numpy>=2.0.0,<3.0.0"



In [None]:
!pip install pandas matplotlib seaborn nltk scikit-learn wordcloud


## Libraries for Text Mining

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# NLP Libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity


## Stopwords

In [None]:
# Custom stop-word set: common English function words plus account names and
# contest boilerplate that would otherwise dominate the text analysis.
# Account names appear without underscores because preprocessing strips punctuation.
# A pasted code fragment ("import re import string from nltk") had been fused onto
# "contestkiduniya"; it is removed so that account name is filtered again and
# "import", "re", "string" and "nltk" are no longer treated as stop words.
STOPWORDS = set("""i me my myself we our ours ourselves you your yours yourself yourselves
                he him his himself she her hers herself fotokiran blessedkamal calljain girijakriz
                it its itself they them their theirs themselves what which who whom this that these
                those am is are was were romaguptasinha contesttable like wants esrihari everyon be
                been being have has had having do does did doing a an the and but if or because as
                until while of at by corallista mehandisuresh vineetsonkar sayyedjenifer bhansalidigna
                for with about against thexoxoday between into through during before after above below
                to from up down in out pinkydholakia babubeg want microlight xoxoday acharyaempire on
                off over under again further then once here there when where why how all any both each
                few more most other iheartcontest deepaadhan plumparadise korakagaj particip some such
                no nor not contestalert only own same so than too very s t can will just don should now
                hi check guys dhillonshalini nsverma contestkiduniya contestadventur contestmantra contestmela contesthub
                thefreejinn shraddhabari snehalataj sureshnakoda go see indiancontests would aadiivaasii""".split())


## Pre-processing text

In [None]:
def preprocess_text(text, verbose=False):
    """Clean one tweet: drop @mentions, punctuation and stop words, then lemmatize.

    Mentions are removed before punctuation is stripped. Stripping punctuation
    first deletes the '@' itself, after which a mention can no longer be told
    apart from an ordinary word and the username leaks into the tokens.

    Args:
        text: raw tweet text. Anything that is not a ``str`` (e.g. NaN from a
            missing cell) yields an empty string.
        verbose: when True, print each tweet that contains @mentions together
            with the mentions that were removed.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    lemmatizer = WordNetLemmatizer()

    # Convert text to lowercase
    text = text.lower()

    # Remove usernames (starting with @) while the '@' marker is still present
    at_words = re.findall(r"@\w+", text)
    if at_words and verbose:
        print("Tweet with @words:", text)
        print("Removed @words:", at_words)
        print("-" * 60)
    text = re.sub(r"@\w+", " ", text)

    # Remove punctuation; re.escape keeps characters such as '\', ']' and '^'
    # literal inside the character class
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # Keep purely alphabetic tokens that are not stop words
    tokens = [word for word in text.split() if word.isalpha() and word not in STOPWORDS]

    # Lemmatize each token (defaulting to noun)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens)


## TF-IDF

In [None]:

# 1. TF-IDF Analysis
def perform_tfidf_analysis(documents):
    """Fit TF-IDF on already-cleaned documents and rank words by total score.

    Args:
        documents: iterable of whitespace-separated, preprocessed strings.

    Returns:
        ``(tfidf_matrix, feature_names, word_scores_sorted, vectorizer)``;
        ``word_scores_sorted`` is a list of ``(word, summed score)`` pairs,
        highest score first.
    """
    vectorizer_options = {
        'preprocessor': lambda doc: doc,  # Text is already preprocessed
        'tokenizer': str.split,           # Simple whitespace split
        'stop_words': None,               # Stop words were filtered during preprocessing
    }
    vectorizer = TfidfVectorizer(**vectorizer_options)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Column sums: each word's TF-IDF score accumulated over all documents
    total_tfidf = tfidf_matrix.toarray().sum(axis=0)

    # Highest total score first
    word_scores_sorted = sorted(
        zip(feature_names, total_tfidf),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return tfidf_matrix, feature_names, word_scores_sorted, vectorizer

# Perform TF-IDF analysis
# NOTE(review): `documents_cleaned` is not assigned at notebook level in any cell
# visible here - presumably the preprocessed tweet texts; confirm it is defined
# before this cell so that Restart & Run All works.
tfidf_matrix, feature_names, word_scores_sorted, vectorizer = perform_tfidf_analysis(documents_cleaned)

# Display the 20 words with the highest summed TF-IDF score
print("Top TF-IDF scores across all tweets:")
for word, score in word_scores_sorted[:20]:
    print(f"{word}: {score:.4f}")

## Word Cloud

In [None]:
def generate_wordcloud_from_tfidf(word_scores):
    """Render and display a word cloud weighted by TF-IDF scores.

    Args:
        word_scores: iterable of ``(word, score)`` pairs.

    Returns:
        The generated ``WordCloud`` object.
    """
    # Word -> score mapping consumed by the cloud generator
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate_from_frequencies(dict(word_scores))

    # Show the rendered cloud without axes
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud Based on TF-IDF Scores")
    plt.tight_layout()
    plt.show()

    return wordcloud

# Generate word cloud from TF-IDF results (the summed score of each word is its frequency weight)
wordcloud = generate_wordcloud_from_tfidf(word_scores_sorted)


## N-Gram Analysis

In [None]:
def extract_ngrams(texts, n=2):
    """Count n-grams over whitespace-tokenized, preprocessed texts.

    Args:
        texts: iterable of strings.
        n: n-gram length (2 = bigrams).

    Returns:
        ``Counter`` mapping each n-gram tuple to its number of occurrences;
        texts with fewer than ``n`` tokens contribute nothing.
    """
    tally = Counter()
    for text in texts:
        words = text.split()
        if len(words) < n:
            continue  # too short to contain a single n-gram
        tally.update(ngrams(words, n))
    return tally

# Count bigrams and trigrams over the cleaned documents
bigram_counts, trigram_counts = (
    extract_ngrams(documents_cleaned, size) for size in (2, 3)
)

# Print the ten most frequent n-grams of each length
for heading, counts in (("bigrams", bigram_counts), ("trigrams", trigram_counts)):
    print(f"\nTop 10 {heading}:")
    for gram, count in counts.most_common(10):
        print(f"{' '.join(gram)}: {count}")


##  Topic Modeling with LDA (sklearn)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def perform_sklearn_lda(documents, num_topics=15, num_words=10, preprocess=True):
    """Perform LDA topic modeling using sklearn.

    Args:
        documents: iterable of tweet texts.
        num_topics: number of LDA topics to fit.
        num_words: number of top words reported per topic.
        preprocess: when True (default) every document is run through
            ``preprocess_text`` first; pass False for documents that are
            already cleaned so they are not preprocessed a second time.

    Returns:
        ``(lda, topics, main_topics, count_vectorizer)`` where ``topics`` is a
        list of top-word lists (one per topic) and ``main_topics`` holds the
        index of the highest-weighted topic for each document.
    """
    if preprocess:
        prepared_docs = [preprocess_text(doc) for doc in documents]
    else:
        prepared_docs = list(documents)

    # Create CountVectorizer for LDA
    count_vectorizer = CountVectorizer(
        preprocessor=lambda x: x,  # Text is already preprocessed
        tokenizer=str.split        # Simple whitespace split
    )
    count_matrix = count_vectorizer.fit_transform(prepared_docs)
    feature_names = count_vectorizer.get_feature_names_out()

    # Train LDA model
    lda = LatentDirichletAllocation(
        n_components=num_topics,
        random_state=42,
        max_iter=10,
        learning_method='online'
    )
    lda.fit(count_matrix)

    # Extract topics
    topics = []
    for topic in lda.components_:
        # Indices of the num_words largest weights, heaviest first
        top_features_idx = topic.argsort()[:-num_words-1:-1]
        top_features = [feature_names[i] for i in top_features_idx]
        topics.append(top_features)

    # Assign each document its highest-weighted topic
    doc_topics = lda.transform(count_matrix)
    main_topics = doc_topics.argmax(axis=1)

    return lda, topics, main_topics, count_vectorizer

# Perform LDA topic modeling
num_topics = 15  # Adjust based on your dataset size
# documents_cleaned is already preprocessed (the TF-IDF step consumes it as such),
# so skip the second preprocessing pass inside the function.
lda_model, topics, doc_topics, count_vectorizer = perform_sklearn_lda(
    documents_cleaned, num_topics, preprocess=False
)

# Print topics
print(f"\nIdentified {num_topics} topics:")
for i, topic_words in enumerate(topics):
    print(f"Topic {i+1}: {', '.join(topic_words)}")

# Add topics back to original dataframe (if needed)
# NOTE(review): topics are attached to the first len(doc_topics) rows by position;
# this is only correct if documents_cleaned follows the row order of `tweets`
# with no rows dropped in between - confirm against the cell that builds it.
tweets['topic'] = pd.Series(doc_topics, index=tweets.index[:len(doc_topics)])




Identified 15 topics:
Topic 1: join, chulbullychidya, buddiez, modipreksha, ghunjain, contestsinindia, gpkm, sunitakatyal, contestsource, iampratikjadhav
Topic 2: join, rt, tofarzeen, tagging, shivzi, hashloverz, garbadandiya, hiddenkeys, smojawala, hoorparee
Topic 3: one, bygpass, pas, getfit, olacabs, nearbuy, flexibility, allget, complete, snapdeal
Topic 4: uberindia, join, offer, flight, love, hetalrawat, prateekuapdhya, nishajg, imbevda, atulkrin
Topic 5: market, offbeat, firm, plan, enter, marketing, bengalurubased, prreleasewatch, tajmahalpalace, wasidkh
Topic 6: join, nitincul, hellolalit, amazing, acharinimboo, chidiyagiri, mrrahuljoshi, iamkoolazzu, khannaronit, mgossipqueen
Topic 7: rt, contest, u, participate, coming, follow, shoutout, live, give, photocontests
Topic 8: rt, contest, friend, live, win, participate, follow, tag, tagging, u
Topic 9: join, r, rtadepally, irinrinki, sensiblemona, djshivamanuja, karrivinodkumar, itzpman, nosaiyoha, teehoo
Topic 10: join, vhetal,

## Basic Text Statistics

In [None]:
def compute_text_stats(texts):
    """Summarise a corpus of whitespace-tokenized texts.

    Args:
        texts: sequence of strings.

    Returns:
        dict with ``total_documents``, ``total_words`` and ``unique_words``.
    """
    document_total = len(texts)
    word_total = 0
    vocabulary = set()
    for text in texts:
        words = text.split()
        word_total += len(words)
        vocabulary.update(words)

    return {
        'total_documents': document_total,
        'total_words': word_total,
        'unique_words': len(vocabulary),
    }

# Compute basic text statistics over the cleaned documents
text_stats = compute_text_stats(documents_cleaned)
print("\nBasic Text Statistics:")
for stat, value in text_stats.items():
    # Floats are shown with two decimals; the integer counts are printed as-is
    print(f"{stat}: {value:.2f}" if isinstance(value, float) else f"{stat}: {value}")
