Anggota Kelompok H:
1. Putri Maharani - 1301213093
2. Dhafa Nur Fadhilah - 1301213263
3. Ratin Kani - 1301213269

In [None]:
# Import
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Clone the GitHub repository
!git clone https://github.com/R-Kani/twitter_scraping.git

fatal: destination path 'twitter_scraping' already exists and is not an empty directory.


In [None]:
# Read csv
df = pd.read_csv('/content/twitter_scraping/telkomsel.csv')
df.head()

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id,username
0,1850901858635936109,Tue Oct 29 09:55:55 +0000 2024,0,@gryffinmugs @gryffinmugs Makasih banyak udah ...,1851201286609445063,,gryffinmugs,in,Indonesia,0,0,0,https://x.com/Telkomsel/status/185120128660944...,255409050,Telkomsel
1,1851201096825651306,Tue Oct 29 09:55:09 +0000 2024,0,mana ni paket murah lagi masa cuma sekali akti...,1851201096825651306,,,in,Random,0,0,0,https://x.com/Pukisbukankue/status/18512010968...,302606926,Pukisbukankue
2,1851175230301225283,Tue Oct 29 09:54:44 +0000 2024,0,@cuteflxwrr Udah stabil lagi belum sinyalnya s...,1851200990353273318,,cuteflxwrr,in,Indonesia,0,0,0,https://x.com/Telkomsel/status/185120099035327...,255409050,Telkomsel
3,1850901858635936109,Tue Oct 29 09:53:41 +0000 2024,0,@Telkomsel Yoiii pelanggan tetap since 2017,1851200728427323638,,Telkomsel,in,alana,0,1,0,https://x.com/gryffinmugs/status/1851200728427...,1248788066493292544,gryffinmugs
4,1850901858635936109,Tue Oct 29 09:53:28 +0000 2024,0,@fikimridho Jangan ngambek dong Kak Piki. Kan ...,1851200672181735458,,fikimridho,in,Indonesia,0,0,0,https://x.com/Telkomsel/status/185120067218173...,255409050,Telkomsel


In [None]:
# Preprocessing
# Drop irrelavent columns
df.drop('location', axis=1, inplace=True)
df.drop('image_url', axis=1, inplace=True)

# If user_id is Nan fill it with random number
def generate_random_id():
    return np.random.randint(1000, 10000)

df['user_id'] = df['user_id'].apply(lambda x: generate_random_id() if pd.isna(x) else x)

# If username is nan fill it with the corresponding value from the user_id
def fill_username_with_user_id(row):
    return row['username'] if pd.notna(row['username']) else row['user_id']
df['username'] = df.apply(fill_username_with_user_id, axis=1)

In [None]:
# Initialize an undirected graph
G = nx.Graph()

# Build the graph based on user interactions
for _, row in df.iterrows():
    user_id = row['user_id']
    retweet_count = row['retweet_count']
    reply_count = row['reply_count']
    quote_count = row['quote_count']
    favorite_count = row['favorite_count']

    # Add the user as a node
    if not pd.isna(user_id):  # Check if user_id is NaN
        if not G.has_node(user_id):
            G.add_node(user_id, username=row['username'])

        # Create a list of weights for interactions
        interaction_weights = [
            (retweet_count, 'retweet'),
            (reply_count, 'reply'),
            (quote_count, 'quote'),
            (favorite_count, 'favorite')
        ]

        # Connect the user to others based on interactions in the row
        for other_user_id, interaction in zip(df['user_id'], interaction_weights):
            if user_id != other_user_id and not pd.isna(other_user_id):
                weight = interaction[0]
                if weight > 0:
                    if G.has_edge(user_id, other_user_id):
                        G[user_id][other_user_id]['weight'] += weight
                    else:
                        G.add_edge(user_id, other_user_id, weight=weight)



In [None]:
# Calculate centrality measures
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G, weight='weight')
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
katz_centrality = nx.katz_centrality(G, alpha=0.01, beta=1.0, max_iter=10000)

# Store centrality results in a DataFrame
centrality_df = pd.DataFrame({
    'User ID': degree_centrality.keys(),
    'Degree Centrality': degree_centrality.values(),
    'Betweenness Centrality': betweenness_centrality.values(),
    'Closeness Centrality': closeness_centrality.values(),
    'Eigenvector Centrality': eigenvector_centrality.values(),
    'Katz Centrality': katz_centrality.values()
})

# Add usernames to the output
centrality_df['Username'] = centrality_df['User ID'].apply(lambda user_id: G.nodes[user_id]['username'])
centrality_df = centrality_df[['User ID', 'Username', 'Degree Centrality',
                               'Betweenness Centrality', 'Closeness Centrality',
                               'Eigenvector Centrality', 'Katz Centrality']]

# Initialize a MinMaxScaler to scale between 0 and 1
scaler = MinMaxScaler()


centrality_df[['Degree Centrality', 'Betweenness Centrality',
               'Closeness Centrality', 'Eigenvector Centrality',
               'Katz Centrality']] = scaler.fit_transform(
    centrality_df[['Degree Centrality', 'Betweenness Centrality',
                   'Closeness Centrality', 'Eigenvector Centrality',
                   'Katz Centrality']]
)

# Sort to find most influential users
most_influential = centrality_df.sort_values(by=['Degree Centrality', 'Betweenness Centrality',
                                                 'Closeness Centrality', 'Eigenvector Centrality',
                                                 'Katz Centrality'], ascending=False)

most_influential.head(10)

Unnamed: 0,User ID,Username,Degree Centrality,Betweenness Centrality,Closeness Centrality,Eigenvector Centrality,Katz Centrality
1,302606926,Pukisbukankue,1.0,1.0,1.0,1.0,1.0
2,1248788066493292544,gryffinmugs,0.127404,0.084769,0.546816,0.179549,0.138627
0,255409050,Telkomsel,0.055288,0.066972,0.527076,0.105833,0.066705
96,1129672753651388416,H4T14K4LN4L42,0.007212,0.033133,0.514689,0.062016,0.018868
126,241422232,dimasiprtm,0.007212,0.001582,0.514689,0.062016,0.018868
336,1700943431969955840,kaerraviell,0.007212,0.001582,0.514689,0.062016,0.018868
66,1503095026066210816,bobpulaji_,0.007212,0.0,0.514689,0.062016,0.018868
118,1806039874979733504,shotobabies,0.007212,0.0,0.514689,0.062016,0.018868
136,1749690773375983616,MyTelkomselApp,0.007212,0.0,0.514689,0.062016,0.018868
157,1599695159440322561,diabara_,0.007212,0.0,0.514689,0.062016,0.018868


In [None]:
# Add centrality measures as attributes to each node
for user_id in G.nodes():
    G.nodes[user_id]['Degree Centrality'] = degree_centrality.get(user_id, 0)
    G.nodes[user_id]['Betweenness Centrality'] = betweenness_centrality.get(user_id, 0)
    G.nodes[user_id]['Closeness Centrality'] = closeness_centrality.get(user_id, 0)
    G.nodes[user_id]['Eigenvector Centrality'] = eigenvector_centrality.get(user_id, 0)
    G.nodes[user_id]['Katz Centrality'] = katz_centrality.get(user_id, 0)

# Create a DataFrame for nodes with attributes
nodes_df = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index').reset_index()
nodes_df.rename(columns={'index': 'id'}, inplace=True)
nodes_df.rename(columns={'username': 'Label'}, inplace=True)

# Save nodes DataFrame to CSV
nodes_df.to_csv("nodes.csv", index=False)

# Extract edges and weights into a DataFrame
edges_data = [(u, v, d['weight']) for u, v, d in G.edges(data=True)]
edges_df = pd.DataFrame(edges_data, columns=['source', 'target', 'weight'])

# Save edges DataFrame to CSV
edges_df.to_csv("edges.csv", index=False)

In [None]:
nodes_df

Unnamed: 0,id,Label,Degree Centrality,Betweenness Centrality,Closeness Centrality,Eigenvector Centrality,Katz Centrality
0,255409050,Telkomsel,0.044574,0.045203,0.425212,7.378085e-02,0.053003
1,302606926,Pukisbukankue,0.806202,0.674954,0.806737,6.971443e-01,0.221330
2,1248788066493292544,gryffinmugs,0.102713,0.057215,0.441137,1.251714e-01,0.065975
3,118287748,lalu_lula77,0.000000,0.000000,0.000000,2.922858e-130,0.040972
4,1254067328913829889,deirdre_xo,0.000000,0.000000,0.000000,2.922858e-130,0.040972
...,...,...,...,...,...,...,...
512,1760654096850399232,bagoldisokin,0.001938,0.000000,0.408971,3.363546e-02,0.043186
513,1740148471980822528,kualikummall,0.001938,0.000000,0.408971,3.363546e-02,0.043186
514,1842180984,reoktari,0.001938,0.000000,0.408971,3.363546e-02,0.043186
515,1739683467389243392,Suryonugro44,0.001938,0.000000,0.408971,3.363546e-02,0.043186


In [None]:
edges_df

Unnamed: 0,source,target,weight
0,255409050,302606926,283
1,255409050,1248788066493292544,47
2,255409050,1330804989761966082,1
3,255409050,1309542507210924032,1
4,255409050,1129672753651388416,1
...,...,...,...
484,1248788066493292544,4240744999,1
485,1248788066493292544,108819730,14
486,1248788066493292544,1748712090343538688,1
487,1248788066493292544,3281027354,1
