In [8]:
!pip install fast-pagerank numpy scipy pymongo

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [None]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["cdp_database"]
tweets_col = db["tweets"]
users_col = db["twitter_raw"]

kol_usernames = users_col.distinct("userName", {"elite": True})
start_time = 1731628800
end_time = 1734220800
filters = {
    "timestamp": {"$gte": start_time, "$lte": end_time},
    "authorName": {"$in": kol_usernames}
}


pipeline = [
    {"$match": filters},
    {"$group": {
        "_id": "$authorName",
        "tweet_count": {"$sum": 1}
    }},
    {"$group": {
        "_id": None,
        "unique_authors": {"$sum": 1},
        "total_tweets": {"$sum": "$tweet_count"}
    }}
]


result = list(tweets_col.aggregate(pipeline))
if result:
    total_users = result[0]["unique_authors"]
    total_tweets = result[0]["total_tweets"]
    print(f"Total users: {total_users}, Total tweets: {total_tweets}")
else:
    print("No data found.")


Total users: 19992, Total tweets: 1515138


In [10]:
count_i = 0
for i in kol_usernames:
    count_i += 1
print(count_i)

20210


In [11]:
data = {user: {"username": user, "mentioned": 0, "retweeted": 0} for user in kol_usernames}
print(data)

{'000__no': {'username': '000__no', 'mentioned': 0, 'retweeted': 0}, '008Nate': {'username': '008Nate', 'mentioned': 0, 'retweeted': 0}, '00scuba415': {'username': '00scuba415', 'mentioned': 0, 'retweeted': 0}, '00xJUAN': {'username': '00xJUAN', 'mentioned': 0, 'retweeted': 0}, '011101108a': {'username': '011101108a', 'mentioned': 0, 'retweeted': 0}, '0112358Stitch': {'username': '0112358Stitch', 'mentioned': 0, 'retweeted': 0}, '01573bit': {'username': '01573bit', 'mentioned': 0, 'retweeted': 0}, '06674eth': {'username': '06674eth', 'mentioned': 0, 'retweeted': 0}, '070guy': {'username': '070guy', 'mentioned': 0, 'retweeted': 0}, '089Reggie089': {'username': '089Reggie089', 'mentioned': 0, 'retweeted': 0}, '0913_marco': {'username': '0913_marco', 'mentioned': 0, 'retweeted': 0}, '0EllxJay': {'username': '0EllxJay', 'mentioned': 0, 'retweeted': 0}, '0FJAKE': {'username': '0FJAKE', 'mentioned': 0, 'retweeted': 0}, '0GAntD': {'username': '0GAntD', 'mentioned': 0, 'retweeted': 0}, '0XAKYL

In [12]:
filter_ = {
    "timestamp": {"$gte": start_time, "$lte": end_time},
    "authorName": {"$in": kol_usernames},
    "$or": [
        { "userMentions": { "$exists": True } },
        { "retweetedTweet": { "$exists": True } },
        {
            "$and": [
                { "userMentions": { "$exists": True } },
                { "retweetedTweet": { "$exists": True } }
            ]
        }
    ]
}
projection = {
    "authorName": 1,
    "userMentions": 1,
    "retweetedTweet": 1,
    "quotedTweet": 1
}
nodes = kol_usernames
edges = []
weights = []
post = 0
quote = 0
mention = 0
for tweet in tweets_col.find(filter_, projection=projection):
    author_name = tweet["authorName"]
    if "retweetedTweet" in tweet:
        original_author = tweet["retweetedTweet"]["authorName"]
        if author_name != original_author and original_author in kol_usernames:
            data[original_author]["retweeted"] += 1
            edges.append((author_name, original_author))
            weights.append(1)
            post += 1

    if "quotedTweet" in tweet:
        original_author = tweet["quotedTweet"]["authorName"]
        if author_name != original_author and original_author in kol_usernames:
            data[original_author]["retweeted"] += 1
            edges.append((author_name, original_author))
            weights.append(0.8)
            quote += 1

    if "userMentions" in tweet:
        for _id, username in tweet["userMentions"].items():
            if author_name != username and username in kol_usernames:
                data[username]["mentioned"] += 1
                edges.append((author_name, username))
                weights.append(0.6)
                mention += 1

print(f"Post: {post}, Quote: {quote}, Mention: {mention}")

Post: 200943, Quote: 35637, Mention: 352419


In [13]:
tweets_col.count_documents({
    "timestamp": {"$gte": start_time, "$lte": end_time},
    "authorName": {"$in": kol_usernames},
    "quotedTweet": { "$exists": True }
})

335239

In [14]:
new_data = {}
for k, v in data.items():
    new_data[k] = v["mentioned"] + v["retweeted"]

In [15]:
bins = [0, 10, 20, 40, 60, 80, 100, 200, 300, 400, 500, 1000, 20000]
bin_labels = [f"({bins[i]}-{bins[i+1]}]" for i in range(len(bins) - 1)]

counts = {label: 0 for label in bin_labels}
for username, value in new_data.items():
    for i in range(len(bins) - 1):
        if bins[i] < value <= bins[i + 1]:
            counts[bin_labels[i]] += 1
            break

for bin_range, count in counts.items():
    print(f"{bin_range}: {count}")

(0-10]: 7125
(10-20]: 2744
(20-40]: 2478
(40-60]: 1213
(60-80]: 684
(80-100]: 432
(100-200]: 860
(200-300]: 194
(300-400]: 103
(400-500]: 42
(500-1000]: 50
(1000-20000]: 23


In [33]:
import matplotlib.pyplot as plt

# Your processed data
counts = {
    "(0-10]": 7125,
    "(10-20]": 2744,
    "(20-40]": 2478,
    "(40-60]": 1213,
    "(60-80]": 684,
    "(80-100]": 432,
    "(100-200]": 860,
    "(200-300]": 194,
    "(300-400]": 103,
    "(400-500]": 42,
    "(500-1000]": 50,
    "(1000-20000]": 23
}

# Convert to lists for plotting
bins = list(counts.keys())
values = list(counts.values())

# Create a simple figure
plt.figure(figsize=(10, 6))

# Create a basic bar chart
plt.bar(bins, values, color='skyblue')

# Add simple labels
plt.xlabel('Mention and Retweet Frequency Ranges')
plt.ylabel('Number of Users')
plt.title('Distribution of Mentions and Retweets')

# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha='right')

# Ensure layout fits well
plt.tight_layout()

# Save the figure
plt.savefig('mention_retweet_distribution.png')

In [30]:
import matplotlib.pyplot as plt

bins = list(counts.keys())
values = list(counts.values())

plt.figure(figsize=(12, 6))
plt.bar(bins, values, color='skyblue')

for i, value in enumerate(values):
    plt.text(i, value + max(values) * 0.01, str(value), ha='center', fontsize=10)

plt.xlabel('Edges', fontsize=12)
plt.ylabel('Nodes', fontsize=12)
plt.title('Exploratory Data Analysis', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()


In [17]:
print(len(nodes), len(edges))

20210 588999


In [18]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Create a graph
G = nx.DiGraph()

# Add all nodes first
for node in nodes:
    G.add_node(node)

# Add all edges with their weights
for (source, target), weight in zip(edges, weights):
    G.add_edge(source, target, weight=weight)

# Create visualization
plt.figure(figsize=(20, 20))

# Use a force-directed layout
pos = nx.spring_layout(G, k=1/np.sqrt(len(G)), iterations=50)

# Calculate node sizes based on degree centrality
degree_dict = dict(G.degree())
node_sizes = [v * 2 for v in degree_dict.values()]

# Draw the network
nx.draw_networkx_nodes(G, pos, node_size=node_sizes, alpha=0.6, node_color='skyblue')
nx.draw_networkx_edges(G, pos, alpha=0.1, arrows=False)  # Set alpha low for better visibility with many edges

plt.title(f"KOL Network\n{len(nodes)} nodes, {len(edges)} edges", pad=20)
plt.axis('off')

# Save with high resolution
plt.savefig('kol_network_full.png', dpi=300, bbox_inches='tight')
plt.close()

print(f"Network visualization saved as 'kol_network_full.png'")
print(f"Network statistics:")
print(f"Number of nodes: {len(G.nodes())}")
print(f"Number of edges: {len(G.edges())}")
print(f"Network density: {nx.density(G):.6f}")
print(f"Average degree: {sum(dict(G.degree()).values())/len(G):.2f}")

# Calculate and display top 10 nodes by degree
top_degrees = sorted(degree_dict.items(), key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 nodes by degree:")
for node, degree in top_degrees:
    print(f"{node}: {degree} connections")

Network visualization saved as 'kol_network_full.png'
Network statistics:
Number of nodes: 20210
Number of edges: 201101
Network density: 0.000492
Average degree: 19.90

Top 10 nodes by degree:
aixbt_agent: 1452 connections
blknoiz06: 997 connections
DegenerateNews: 787 connections
saylor: 734 connections
MustStopMurad: 731 connections
cz_binance: 718 connections
waleswoosh: 635 connections
jessepollak: 633 connections
LucaNetz: 620 connections
brian_armstrong: 591 connections


In [19]:
from scipy import sparse
from fast_pagerank import pagerank
from fast_pagerank import pagerank_power
import numpy as np

num_nodes = len(nodes)

node_to_index = {node: index for index, node in enumerate(nodes)}

edges_indices = [(node_to_index[u], node_to_index[v]) for u, v in edges]


A = np.array(edges_indices)
G = sparse.csr_matrix((weights, (A[:, 0], A[:, 1])), shape=(num_nodes, num_nodes))

print("Tính PageRank")
damping_factor = 0.85
pagerank_scores = pagerank(G, p=damping_factor)


Tính PageRank


In [20]:
pagerank_scores

array([1.75449571e-05, 5.78929984e-05, 9.06709123e-06, ...,
       9.06709123e-06, 9.49529449e-05, 1.00280675e-05])

In [21]:
import numpy as np

sorted_indices = np.argsort(pagerank_scores)[::-1]
sorted_scores = pagerank_scores[sorted_indices]
sorted_nodes = [nodes[i] for i in sorted_indices]


scaled_scores = sorted_scores * 10000

rank = 0
results = {}
for i, (username, score) in enumerate(zip(sorted_nodes, scaled_scores)):
    rank += 1
    # if rank > 10:
    #     break
    print(f"Hạng {i+1}: {username} - {score:.4f}")
    results[username] = score

Hạng 1: aixbt_agent - 81.4286
Hạng 2: saylor - 74.3875
Hạng 3: blknoiz06 - 44.7501
Hạng 4: cz_binance - 43.6360
Hạng 5: jessepollak - 34.5796
Hạng 6: notthreadguy - 33.9554
Hạng 7: brian_armstrong - 33.3316
Hạng 8: CynthiaMLummis - 32.2249
Hạng 9: truth_terminal - 30.6033
Hạng 10: chooserich - 30.4212
Hạng 11: pmarca - 27.5305
Hạng 12: LeonidasNFT - 25.9375
Hạng 13: 0xMert_ - 25.7376
Hạng 14: EleanorTerrett - 25.5369
Hạng 15: 0xzerebro - 24.1779
Hạng 16: DegenerateNews - 24.1332
Hạng 17: aeyakovenko - 22.9988
Hạng 18: LucaNetz - 22.5743
Hạng 19: AndyAyrey - 22.1054
Hạng 20: MustStopMurad - 21.8614
Hạng 21: Dennis_Porter_ - 21.6804
Hạng 22: shawmakesmagic - 21.2857
Hạng 23: beeple - 20.9978
Hạng 24: FinancialCmte - 20.6903
Hạng 25: frankdegods - 20.6339
Hạng 26: justinsuntron - 20.3502
Hạng 27: dolos_diary - 19.1142
Hạng 28: JSeyff - 18.9527
Hạng 29: waleswoosh - 18.8593
Hạng 30: leap_xyz - 18.8520
Hạng 31: iampaulgrewal - 18.8043
Hạng 32: CozomoMedici - 18.3408
Hạng 33: EricBalchunas -

In [22]:
print(results)

{'aixbt_agent': np.float64(81.42860313540118), 'saylor': np.float64(74.3875464871092), 'blknoiz06': np.float64(44.75014912272566), 'cz_binance': np.float64(43.63603887980668), 'jessepollak': np.float64(34.579603586200214), 'notthreadguy': np.float64(33.955366021685734), 'brian_armstrong': np.float64(33.331563587284016), 'CynthiaMLummis': np.float64(32.224885436271286), 'truth_terminal': np.float64(30.603347968145638), 'chooserich': np.float64(30.421238219417642), 'pmarca': np.float64(27.530482584754008), 'LeonidasNFT': np.float64(25.937458677771716), '0xMert_': np.float64(25.737645548276408), 'EleanorTerrett': np.float64(25.536862870218208), '0xzerebro': np.float64(24.177907303301154), 'DegenerateNews': np.float64(24.13321828945698), 'aeyakovenko': np.float64(22.998832407818906), 'LucaNetz': np.float64(22.574304152299987), 'AndyAyrey': np.float64(22.105354022389605), 'MustStopMurad': np.float64(21.86136469161699), 'Dennis_Porter_': np.float64(21.68041820679919), 'shawmakesmagic': np.fl

In [23]:
bins = [0, 1, 2, 3, 4, 5, 10, 20, 40, 60, 80, 100]
bin_labels = [f"({bins[i]}-{bins[i+1]}]" for i in range(len(bins) - 1)]

counts = {label: 0 for label in bin_labels}
for username, value in results.items():
    for i in range(len(bins) - 1):
        if bins[i] < value <= bins[i + 1]:
            counts[bin_labels[i]] += 1
            break

for bin_range, count in counts.items():
    print(f"{bin_range}: {count}")

(0-1]: 18353
(1-2]: 1112
(2-3]: 329
(3-4]: 144
(4-5]: 67
(5-10]: 135
(10-20]: 44
(20-40]: 22
(40-60]: 2
(60-80]: 1
(80-100]: 1


In [37]:
import matplotlib.pyplot as plt

# Data from the table
counts = {
    "(0-1]": 18353,
    "(1-2]": 1112,
    "(2-3]": 329,
    "(3-4]": 144,
    "(4-5]": 67,
    "(5-10]": 135,
    "(10-20]": 44,
    "(20-40]": 22,
    "(40-60]": 2,
    "(60-80]": 1,
    "(80-100]": 1
}

bins = list(counts.keys())
values = list(counts.values())

plt.figure(figsize=(12, 6))
plt.bar(bins, values, color='skyblue')

for i, value in enumerate(values):
    plt.text(i, value + max(values) * 0.01, str(value), ha='center', fontsize=10)

plt.xlabel('PageRank Score Range', fontsize=12)
plt.ylabel('Number of Users', fontsize=12)
plt.title('Distribution of PageRank Scores', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the figure
plt.savefig('score_distribution.png', dpi=300)

# Optional: show the plot
# plt.show()

In [35]:
import matplotlib.pyplot as plt

bins = list(counts.keys())
values = list(counts.values())

plt.figure(figsize=(12, 6))
plt.bar(bins, values, color='skyblue')

for i, value in enumerate(values):
    plt.text(i, value + max(values) * 0.01, str(value), ha='center', fontsize=10)

plt.xlabel('Score Range', fontsize=12)
plt.ylabel('Number of Nodes', fontsize=12)
plt.title('Exploratory Data Analysis', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.savefig('score_distribution.png')

In [25]:
line_data = results

# Extracting keys and values for line plot
names = list(line_data.keys())
values = list(line_data.values())

# Plotting the line chart
plt.figure(figsize=(12, 6))
# plt.plot(names, values, marker='o', linestyle='-', color='skyblue')
# plt.scatter(names, values, color='skyblue')
plt.scatter(range(len(values)), values, color='skyblue')

# Adding labels and title
plt.xlabel('User', fontsize=12)
plt.ylabel('Score', fontsize=12)
# plt.title('Line Chart of Data', fontsize=14)
# plt.grid(True, linestyle='--', alpha=0.6)

# Show the plot
plt.tight_layout()
plt.show()

  plt.show()


In [26]:
mapping_follower = {}
for kol in results.keys():
    mapping_follower[kol] = users_col.find_one({"userName": kol})["followersCount"]



In [27]:
print(mapping_follower)

{'aixbt_agent': 404882, 'saylor': 3997324, 'blknoiz06': 653398, 'cz_binance': 9458130, 'jessepollak': 196773, 'notthreadguy': 263487, 'brian_armstrong': 1468911, 'CynthiaMLummis': 105664, 'truth_terminal': 252086, 'chooserich': 196901, 'pmarca': 1753090, 'LeonidasNFT': 245393, '0xMert_': 183867, 'EleanorTerrett': 169801, '0xzerebro': 114821, 'DegenerateNews': 325288, 'aeyakovenko': 488844, 'LucaNetz': 165816, 'AndyAyrey': 99371, 'MustStopMurad': 649828, 'Dennis_Porter_': 205999, 'shawmakesmagic': 143505, 'beeple': 818782, 'FinancialCmte': 43661, 'frankdegods': 333619, 'justinsuntron': 3726430, 'dolos_diary': 42140, 'JSeyff': 167554, 'waleswoosh': 124906, 'leap_xyz': 76673, 'iampaulgrewal': 83920, 'CozomoMedici': 303717, 'EricBalchunas': 342481, 'APompliano': 1697816, 'inversebrah': 335451, 'SolJakey': 168059, 'piovincenzo_': 83639, 'AltcoinDailyio': 1801100, 'natbrunell': 366345, 'AutismCapital': 666477, 'naval': 2657120, 'itsafwog': 37277, 'jerallaire': 149102, 'CharlotteFang77': 4951