In [74]:
# Load library
import pandas as pd
import numpy as np
import networkx as nx
from functools import reduce
from lib.util import fetch_tweets

event = "charliehebdo"  # Change this to pick a different event from the PHEME dataset

In [75]:
# Load the tweets of the selected event via the project helper from lib.util.
data = fetch_tweets(event)

In [76]:
# Column names, dtypes and non-null counts of the raw tweet table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38268 entries, 0 to 38267
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   is_rumor              38268 non-null  int64  
 1   thread                38268 non-null  object 
 2   in_reply_tweet        36044 non-null  object 
 3   event                 38268 non-null  object 
 4   text                  38268 non-null  object 
 5   tweet_id              38268 non-null  object 
 6   is_source_tweet       38268 non-null  int64  
 7   in_reply_user         36047 non-null  object 
 8   user_id               38268 non-null  object 
 9   hashtags_count        38268 non-null  int64  
 10  retweet_count         38268 non-null  int64  
 11  favorite_count        38268 non-null  int64  
 12  mentions_count        38268 non-null  int64  
 13  user.tweets_count     38268 non-null  int64  
 14  user.verified         38268 non-null  int64  
 15  user.followers_coun

In [77]:
# Preview the first rows of the raw tweet table.
data.head()

Unnamed: 0,is_rumor,thread,in_reply_tweet,event,text,tweet_id,is_source_tweet,in_reply_user,user_id,hashtags_count,...,mentions_count,user.tweets_count,user.verified,user.followers_count,user.friends_count,hasperiod,number_punct,negativewordcount,positivewordcount,sentimentscore
0,0,552784600502915072,,charliehebdo,Charlie Hebdo became well known for publishing...,552784600502915072,1,,331658004,0,...,0,15128,1,41591,2268,0,0,0,0,0.0
1,0,552784600502915072,5.527846005029151e+17,charliehebdo,"Now 10 dead in a shooting there today RT ""@BBC...",552785249420447745,0,331658004.0,18370911,0,...,1,5064,0,4671,4954,0,4,0,0,-0.2
2,0,552784600502915072,5.527846005029151e+17,charliehebdo,@BBCDanielS @BBCWorld I'm guessing this is bei...,552786761534144512,0,331658004.0,2806109387,0,...,2,2170,0,59,113,0,2,0,0,0.285714
3,0,552784600502915072,5.527846005029151e+17,charliehebdo,@BBCDanielS @BBCWorld why would you mention th...,552786803884060672,0,331658004.0,146142164,1,...,2,12064,0,96357,385,0,1,0,0,0.0
4,0,552784600502915072,5.527846005029151e+17,charliehebdo,@BBCDanielS @BBCWorld perps identified?,552786954656710656,0,331658004.0,940853760,0,...,2,13256,0,751,1332,0,1,0,0,0.0


In [78]:
# These flag columns are stored as int64 0/1; cast them to bool so that
# describe() reports the most frequent value and its frequency per flag.
bool_columns = ["is_rumor", "is_source_tweet", "user.verified"]

flags = data.loc[:, bool_columns].astype(bool)
flags.describe(include="bool")

Unnamed: 0,is_rumor,is_source_tweet,user.verified
count,38268,38268,38268
unique,2,2,2
top,False,False,False
freq,30923,36189,36659


In [79]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,is_rumor,is_source_tweet,hashtags_count,retweet_count,favorite_count,mentions_count,user.tweets_count,user.verified,user.followers_count,user.friends_count,hasperiod,number_punct,negativewordcount,positivewordcount,sentimentscore
count,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0,38268.0
mean,0.191936,0.054327,0.275792,24.313578,11.87645,1.76709,24963.64,0.042046,90776.98,1316.908252,1.187441,5.338664,0.049676,0.099117,0.025037
std,0.393828,0.226666,0.700195,478.360416,260.287571,1.044231,113194.3,0.200696,937989.0,7674.764808,1.380235,4.420501,0.224261,0.313253,0.297044
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,1730.75,0.0,95.0,165.0,0.0,2.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,2.0,7055.0,0.0,400.0,448.0,1.0,4.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,2.0,22312.5,0.0,1422.0,1114.0,2.0,7.0,0.0,0.0,0.1
max,1.0,1.0,10.0,74130.0,37983.0,9.0,4352451.0,1.0,22720220.0,415293.0,26.0,58.0,3.0,3.0,1.0


In [80]:
def agg_tweets_by_user(df):
    """Aggregate tweet-level rows into one feature row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet table. Must contain ``user_id``, ``tweet_id``, ``thread``,
        ``in_reply_user`` and every column listed in ``agg_props`` below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id``. Columns are named ``<column>_<statistic>``
        (e.g. ``favorite_count_mean``, ``is_rumor_sum``,
        ``thread_length_len``) followed by the two graph features
        ``component_count`` and ``largest_cc_diameter``.
    """

    def shared(values):
        # 1 - (distinct values / total values): 0.0 when every value in the
        # group is distinct, approaching 1.0 as values repeat.
        return 1 - len(set(values)) / len(values)

    # String aliases select pandas' own groupby reductions explicitly.
    # Passing np.mean / sum / np.var relied on pandas silently remapping the
    # callables (so the variance was the sample variance, ddof=1); newer
    # pandas versions warn about that remapping and will stop doing it.
    # The aliases keep both the values and the resulting column names.
    funcs = ["mean", "sum", "var"]
    agg_props = {
        "favorite_count": funcs,
        "retweet_count": funcs,
        "hashtags_count": funcs + [shared],
        "user.tweets_count": funcs,
        "is_rumor": "sum",
        # Kept as the builtin so the flattened column stays "thread_length_len".
        "tweet_id": len,
        "user.verified": funcs,

        "hasperiod": funcs,
        "number_punct": funcs,
        "negativewordcount": funcs,
        "positivewordcount": funcs,
        "sentimentscore": funcs,
    }
    rename = {
        "tweet_id": "thread_length"
    }

    def connected_component_subgraphs(G):
        # Yield one subgraph view per connected component of G.
        for c in nx.connected_components(G):
            yield G.subgraph(c)

    def g(x):
        # Graph features for one user's tweets: every row contributes an edge
        # thread -> in_reply_user. Report the number of connected components
        # and the diameter of the largest one.
        #
        # The group frame is used directly instead of re-filtering the full
        # table by tweet_id for every user, which rescanned all rows once per
        # user. This assumes a tweet_id never appears under two different
        # user_ids.
        # NOTE(review): rows with a missing in_reply_user are passed through
        # as-is, as before -- confirm how those NaN targets should be treated.
        G = nx.from_pandas_edgelist(x, "thread", "in_reply_user")
        Gc = max(connected_component_subgraphs(G), key=len)
        return pd.Series(
            [nx.number_connected_components(G), nx.diameter(Gc)],
            index=["component_count", "largest_cc_diameter"],
        )

    # Step 0: per-user graph features. Only the two edge columns are handed
    # to g, so the grouping column is not part of the applied frame.
    graph = df.groupby("user_id")[["thread", "in_reply_user"]].apply(g)

    # Step 1: per-user aggregate features.
    agg = (
        df.groupby("user_id")
        .agg(agg_props)
        .rename(columns=rename)
    )

    # Flatten the (column, statistic) MultiIndex into "column_statistic".
    # to_flat_index() replaces the deprecated Index.ravel().
    agg.columns = ["_".join(col) for col in agg.columns.to_flat_index()]

    # Both frames are indexed by user_id; the merge keeps that index.
    return pd.merge(agg, graph, on="user_id")

In [81]:
# Build the per-user feature table and list its columns.
thrds = agg_tweets_by_user(data)
thrds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18740 entries, 10003372 to 999907196
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   favorite_count_mean     18740 non-null  float64
 1   favorite_count_sum      18740 non-null  int64  
 2   favorite_count_var      5439 non-null   float64
 3   retweet_count_mean      18740 non-null  float64
 4   retweet_count_sum       18740 non-null  int64  
 5   retweet_count_var       5439 non-null   float64
 6   hashtags_count_mean     18740 non-null  float64
 7   hashtags_count_sum      18740 non-null  int64  
 8   hashtags_count_var      5439 non-null   float64
 9   hashtags_count_shared   18740 non-null  float64
 10  user.tweets_count_mean  18740 non-null  float64
 11  user.tweets_count_sum   18740 non-null  int64  
 12  user.tweets_count_var   5439 non-null   float64
 13  is_rumor_sum            18740 non-null  int64  
 14  thread_length_len       18740 no

  agg.columns = [ "_".join(x) for x in agg.columns.ravel() ]


In [82]:
# Preview the per-user features (index is user_id).
thrds.head()

Unnamed: 0_level_0,favorite_count_mean,favorite_count_sum,favorite_count_var,retweet_count_mean,retweet_count_sum,retweet_count_var,hashtags_count_mean,hashtags_count_sum,hashtags_count_var,hashtags_count_shared,...,negativewordcount_sum,negativewordcount_var,positivewordcount_mean,positivewordcount_sum,positivewordcount_var,sentimentscore_mean,sentimentscore_sum,sentimentscore_var,component_count,largest_cc_diameter
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10003372,0.0,0,,0.0,0,,1.0,1,,0.0,...,0,,0.0,0,,0.0,0.0,,1,1
1000379850,0.0,0,,0.0,0,,0.0,0,,0.0,...,0,,0.0,0,,-0.166667,-0.166667,,1,1
1001236291,1.0,2,0.0,0.0,0,0.0,0.0,0,0.0,0.5,...,0,0.0,0.0,0,0.0,0.202083,0.404167,0.029635,1,2
100132150,0.0,0,,0.0,0,,0.0,0,,0.0,...,0,,0.0,0,,0.0,0.0,,1,1
100150564,0.0,0,0.0,0.0,0,0.0,1.5,3,4.5,0.0,...,0,0.0,0.5,1,0.5,0.21875,0.4375,0.095703,2,1


In [83]:
# (number of users, number of feature columns)
thrds.shape

(18740, 35)

In [84]:
# Compute the rumor-spreader intensity score: the user's number of rumor
# tweets divided by the mean of their user.tweets_count values.
# NOTE(review): user.tweets_count looks like the account's overall tweet
# total, which makes this ratio very small for most users -- confirm that
# this is the intended denominator.
intensityscore = thrds['is_rumor_sum'].div(thrds['user.tweets_count_mean'])
thrds['intensityscore'] = intensityscore


In [85]:
# Check the newly added intensityscore column.
thrds.head()

Unnamed: 0_level_0,favorite_count_mean,favorite_count_sum,favorite_count_var,retweet_count_mean,retweet_count_sum,retweet_count_var,hashtags_count_mean,hashtags_count_sum,hashtags_count_var,hashtags_count_shared,...,negativewordcount_var,positivewordcount_mean,positivewordcount_sum,positivewordcount_var,sentimentscore_mean,sentimentscore_sum,sentimentscore_var,component_count,largest_cc_diameter,intensityscore
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10003372,0.0,0,,0.0,0,,1.0,1,,0.0,...,,0.0,0,,0.0,0.0,,1,1,0.0
1000379850,0.0,0,,0.0,0,,0.0,0,,0.0,...,,0.0,0,,-0.166667,-0.166667,,1,1,0.0
1001236291,1.0,2,0.0,0.0,0,0.0,0.0,0,0.0,0.5,...,0.0,0.0,0,0.0,0.202083,0.404167,0.029635,1,2,0.0
100132150,0.0,0,,0.0,0,,0.0,0,,0.0,...,,0.0,0,,0.0,0.0,,1,1,0.0
100150564,0.0,0,0.0,0.0,0,0.0,1.5,3,4.5,0.0,...,0.0,0.5,1,0.5,0.21875,0.4375,0.095703,2,1,0.0


In [86]:
# Binary rumor-spreader label derived from the intensity score.
RUMOR_SPREADER_THRESHOLD = 0.5
target = thrds['intensityscore']
# Vectorised form of the former append loop with identical results: a score
# strictly below the threshold gives 0, anything else (including NaN) gives 1.
# Y stays a plain list in the same row order as thrds.
Y = np.where(target < RUMOR_SPREADER_THRESHOLD, 0, 1).tolist()

In [87]:
# Attach the label column; Y was built in the same row order as thrds.
thrds['rumorspreader'] = Y

In [88]:
# Check the newly added rumorspreader label.
thrds.head()

Unnamed: 0_level_0,favorite_count_mean,favorite_count_sum,favorite_count_var,retweet_count_mean,retweet_count_sum,retweet_count_var,hashtags_count_mean,hashtags_count_sum,hashtags_count_var,hashtags_count_shared,...,positivewordcount_mean,positivewordcount_sum,positivewordcount_var,sentimentscore_mean,sentimentscore_sum,sentimentscore_var,component_count,largest_cc_diameter,intensityscore,rumorspreader
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10003372,0.0,0,,0.0,0,,1.0,1,,0.0,...,0.0,0,,0.0,0.0,,1,1,0.0,0
1000379850,0.0,0,,0.0,0,,0.0,0,,0.0,...,0.0,0,,-0.166667,-0.166667,,1,1,0.0,0
1001236291,1.0,2,0.0,0.0,0,0.0,0.0,0,0.0,0.5,...,0.0,0,0.0,0.202083,0.404167,0.029635,1,2,0.0,0
100132150,0.0,0,,0.0,0,,0.0,0,,0.0,...,0.0,0,,0.0,0.0,,1,1,0.0,0
100150564,0.0,0,0.0,0.0,0,0.0,1.5,3,4.5,0.0,...,0.5,1,0.5,0.21875,0.4375,0.095703,2,1,0.0,0


In [89]:
# Persist the per-user feature table for this event.
fn = "data/extracted/%s.csv" % event
# NOTE(review): index=False drops the user_id index, so the CSV carries no
# user identifier -- confirm this is intended.
thrds.to_csv(fn, index=False)
"Wrote data to %s" % fn

'Wrote data to data/extracted/charliehebdo.csv'

In [90]:
# Read the CSV back to check what was written.
df = pd.read_csv(fn)
df.head()

Unnamed: 0,favorite_count_mean,favorite_count_sum,favorite_count_var,retweet_count_mean,retweet_count_sum,retweet_count_var,hashtags_count_mean,hashtags_count_sum,hashtags_count_var,hashtags_count_shared,...,positivewordcount_mean,positivewordcount_sum,positivewordcount_var,sentimentscore_mean,sentimentscore_sum,sentimentscore_var,component_count,largest_cc_diameter,intensityscore,rumorspreader
0,0.0,0,,0.0,0,,1.0,1,,0.0,...,0.0,0,,0.0,0.0,,1,1,0.0,0
1,0.0,0,,0.0,0,,0.0,0,,0.0,...,0.0,0,,-0.166667,-0.166667,,1,1,0.0,0
2,1.0,2,0.0,0.0,0,0.0,0.0,0,0.0,0.5,...,0.0,0,0.0,0.202083,0.404167,0.029635,1,2,0.0,0
3,0.0,0,,0.0,0,,0.0,0,,0.0,...,0.0,0,,0.0,0.0,,1,1,0.0,0
4,0.0,0,0.0,0.0,0,0.0,1.5,3,4.5,0.0,...,0.5,1,0.5,0.21875,0.4375,0.095703,2,1,0.0,0


In [91]:
# Export the re-read table as an Excel workbook for this event.
df.to_excel('data/analyzed/%s.xlsx' % event)