## Exports nodes and edges from tweets (Retweets, Mentions, or Replies) [JSON]
Exports nodes and edges from tweets (from retweets, mentions, or replies) stored in the JSON format that SFM exports, and saves them in a file format compatible with various social network graph tools such as Gephi, Cytoscape, Kumu, etc. The resulting graphs are directed.

In [325]:
import sys
import json
import re
import numpy as np
from datetime import datetime
import pandas as pd  

# Path to the tweet file read by the export cells below; they parse it as one JSON object per line.
tweetfile = '/home/soominpark/sfmproject/Work/Emma Briant case/elites_sample.json'



### 1. Export edges from Retweets, Mentions, or Replies
* Run only one of the three code blocks below, depending on which kind of edge you want to export.

In [28]:
# 1. Export edges from Retweets
#
# Reads `tweetfile` one line at a time (one JSON-encoded tweet per line). For
# every retweet it records the retweeting user and the retweeted user in
# `userdata`, and one Source -> Target row in `edges` (Source = retweeter,
# Target = author of the retweeted tweet). `Strength` holds the retweet's
# creation time as a string.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Strength']


def _user_row(user):
    """Return the `userdata` row for one Twitter user object (a dict)."""
    return [user['id_str'],
            user['screen_name'],
            user['created_at'],
            user['profile_image_url_https'],
            user['followers_count'],
            user['friends_count']]


# Rows are gathered in plain lists and converted to DataFrames once at the
# end. Appending to a DataFrame row by row copies the whole frame on every
# call (quadratic time), and DataFrame.append no longer exists in pandas 2.x.
user_rows = []
edge_rows = []

# `with` closes the file even if processing a line raises.
with open(tweetfile, 'r', encoding='utf-8') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Blank or malformed line: skip it, but let KeyboardInterrupt and
            # unrelated errors propagate instead of silently swallowing them.
            continue
        # Lines that are not tweet objects, or are not retweets, add nothing.
        if not isinstance(tweet, dict) or 'retweeted_status' not in tweet:
            continue

        retweeted_user = tweet['retweeted_status']['user']
        user_rows.append(_user_row(tweet['user']))
        user_rows.append(_user_row(retweeted_user))
        edge_rows.append([tweet['user']['id_str'],
                          retweeted_user['id_str'],
                          str(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

In [369]:
# 2. Export edges from Mentions
#
# Reads `tweetfile` one line at a time (one JSON-encoded tweet per line). For
# every tweet that mentions at least one user it records the author in
# `userdata` and one Source -> Target row in `edges` per mention
# (Source = author, Target = mentioned user). `Strength` holds the tweet's
# creation time as a string.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Strength']

# Rows are gathered in plain lists and converted to DataFrames once at the
# end. Appending to a DataFrame row by row copies the whole frame on every
# call (quadratic time), and DataFrame.append no longer exists in pandas 2.x.
user_rows = []
edge_rows = []

# Ids that already have a row in `user_rows`. A set gives an exact-match,
# constant-time lookup; the earlier `str.contains` test was a substring/regex
# match, so e.g. id "123" was treated as known once "1234" had been seen.
seen_ids = set()

# `with` closes the file even if processing a line raises.
with open(tweetfile, 'r', encoding='utf-8') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Blank or malformed line: skip it, but let KeyboardInterrupt and
            # unrelated errors propagate instead of silently swallowing them.
            continue
        if not isinstance(tweet, dict):
            continue
        # Tolerate lines without an 'entities' object instead of raising KeyError.
        mentions = (tweet.get('entities') or {}).get('user_mentions') or []
        if not mentions:
            continue

        author = tweet['user']
        # One author row per tweet is enough; identical repeats per mention
        # carried no extra information.
        user_rows.append([author['id_str'],
                          author['screen_name'],
                          author['created_at'],
                          author['profile_image_url_https'],
                          author['followers_count'],
                          author['friends_count']])
        seen_ids.add(author['id_str'])
        created = str(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))

        for mention in mentions:
            if mention['id_str'] not in seen_ids:
                # A mention only carries id and screen name, so the profile
                # fields stay NaN. The row must describe the *mentioned* user
                # (previously the author's id/name were written here).
                user_rows.append([mention['id_str'],
                                  mention['screen_name'],
                                  np.nan,
                                  np.nan,
                                  np.nan,
                                  np.nan])
                seen_ids.add(mention['id_str'])
            edge_rows.append([author['id_str'], mention['id_str'], created])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

KeyboardInterrupt: 

In [368]:
# 3. Export edges from Replies
#
# Reads `tweetfile` one line at a time (one JSON-encoded tweet per line). For
# every tweet that is a reply it records the author in `userdata` and one
# Source -> Target row in `edges` (Source = author, Target = user being
# replied to). `Strength` holds the tweet's creation time as a string.

user_columns = ['Id', 'Label', 'user_created_at', 'profile_image', 'followers_count', 'friends_count']
edge_columns = ['Source', 'Target', 'Strength']

# Rows are gathered in plain lists and converted to DataFrames once at the
# end. Appending to a DataFrame row by row copies the whole frame on every
# call (quadratic time), and DataFrame.append no longer exists in pandas 2.x.
user_rows = []
edge_rows = []

# Ids that already have a row in `user_rows`. A set gives an exact-match,
# constant-time lookup; the earlier `str.contains` test was a substring/regex
# match, so e.g. id "123" was treated as known once "1234" had been seen.
seen_ids = set()

# `with` closes the file even if processing a line raises.
with open(tweetfile, 'r', encoding='utf-8') as fh:
    for line in fh:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Blank or malformed line: skip it, but let KeyboardInterrupt and
            # unrelated errors propagate instead of silently swallowing them.
            continue
        if not isinstance(tweet, dict):
            continue
        # .get() also skips lines that lack the key entirely instead of
        # raising KeyError.
        reply_to_id = tweet.get('in_reply_to_user_id_str')
        if reply_to_id is None:
            continue

        author = tweet['user']
        user_rows.append([author['id_str'],
                          author['screen_name'],
                          author['created_at'],
                          author['profile_image_url_https'],
                          author['followers_count'],
                          author['friends_count']])
        seen_ids.add(author['id_str'])

        if reply_to_id not in seen_ids:
            # The tweet only carries the id and screen name of the user being
            # replied to, so the remaining profile fields stay NaN.
            user_rows.append([reply_to_id,
                              tweet['in_reply_to_screen_name'],
                              np.nan,
                              np.nan,
                              np.nan,
                              np.nan])
            seen_ids.add(reply_to_id)

        edge_rows.append([author['id_str'],
                          reply_to_id,
                          str(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))])

userdata = pd.DataFrame(user_rows, columns=user_columns)
edges = pd.DataFrame(edge_rows, columns=edge_columns)

KeyboardInterrupt: 

In [329]:
# Show the (rows, columns) dimensions of `tweets`.
# NOTE(review): `tweets` is not assigned in any cell shown in this notebook — confirm where it is defined.
tweets.shape

(491844, 21)

In [338]:
# Show the (rows, columns) dimensions of the collected user table.
userdata.shape

(6566, 6)

In [339]:
# Show the number of non-null values in each column of `edges`.
edges.count()

Source      4559
Target      4559
Strength    4559
dtype: int64

In [340]:
# Show the last five rows of `userdata`.
userdata.tail(5)

Unnamed: 0,Id,Label,user_created_at,profile_image,followers_count,friends_count
6561,4146483742,MarkByers17,Mon Nov 09 03:43:03 +0000 2015,https://pbs.twimg.com/profile_images/779511806...,490.0,317.0
6562,328120703,kffie,Sat Jul 02 19:33:51 +0000 2011,https://abs.twimg.com/sticky/default_profile_i...,47.0,32.0
6563,14662569,WMUR9,,,,
6564,406619360,JuanAcosta44,Sun Nov 06 23:31:49 +0000 2011,https://pbs.twimg.com/profile_images/162627201...,45.0,777.0
6565,278087727,eusaboston,Wed Apr 06 15:46:50 +0000 2011,https://pbs.twimg.com/profile_images/781706058...,77.0,443.0


In [341]:
# Show the last five rows of `edges`.
edges.tail(5)

Unnamed: 0,Source,Target,Strength
4554,422755981,3082933433,2016-10-27 12:06:31
4555,4146483742,113364160,2016-10-12 21:09:18
4556,328120703,14662569,2016-08-05 02:24:21
4557,406619360,50769180,2016-11-04 23:58:30
4558,278087727,710553234654367744,2016-10-17 01:28:16


### 2. Keep only the edges whose strength is >= the user-specified level (directed)

In [343]:
# Network connection strength level: the minimum number of times one tweeter
# must have responded to or mentioned another for that Source -> Target pair
# to be kept.
#   * 1 keeps every pair that interacted at least once.
#   * 5 keeps only pairs where the source mentioned or responded to the target
#     at least 5 times, i.e. only the strongest bonds are shown.
strengthLevel = 2

# Count the rows of `edges` per directed (Source, Target) pair; after
# reset_index() the 'Strength' column holds that count.
pair_counts = edges.groupby(['Source', 'Target'])['Strength'].count().reset_index()

# Drop the pairs that fall below the requested level.
edges2 = pair_counts[pair_counts['Strength'] >= strengthLevel]

### 3. Export nodes

In [353]:
# Export nodes from the edges and add node attributes for both Sources and Targets.

# Keep one row per user Id. Within each Id the rows are ordered by
# followers_count descending, and NaN sorts last (pandas default), so a row
# with real profile data is kept in preference to a NaN placeholder row.
userdata = userdata.sort_values(['Id', 'followers_count'], ascending=[True, False])
userdata = userdata.drop_duplicates(['Id'], keep='first')

# Every user id that appears at either end of an edge in `edges2`.
# pd.concat is used because Series.append was removed in pandas 2.0.
ids = pd.concat([edges2['Source'], edges2['Target']]).to_frame(name='Id')
ids = ids.drop_duplicates()

# Left join: every id from the edges becomes a node, even when `userdata`
# has no attributes for it.
nodes = pd.merge(ids, userdata, on='Id', how='left')

In [356]:
# Show the first five rows of `nodes`.
nodes.head(5)

Unnamed: 0,Id,Label,user_created_at,profile_image,followers_count,friends_count
0,1022143381,RRBoulden,Wed Dec 19 13:57:17 +0000 2012,https://abs.twimg.com/sticky/default_profile_i...,18.0,71.0
1,1051550413,susiesam1,Tue Jan 01 01:14:10 +0000 2013,https://pbs.twimg.com/profile_images/770450345...,9.0,35.0
2,109949937,TurraloonNS,Sat Jan 30 20:31:43 +0000 2010,https://pbs.twimg.com/profile_images/594973377...,112.0,77.0
3,118504245,lminato,Sun Feb 28 22:08:11 +0000 2010,https://pbs.twimg.com/profile_images/754514218...,72.0,179.0
4,124199238,DSF2020,Thu Mar 18 15:49:49 +0000 2010,https://pbs.twimg.com/profile_images/668427205...,2855.0,475.0


In [355]:
# Show the (rows, columns) dimensions of `ids`, i.e. how many distinct node ids there are.
ids.shape

(164, 1)

In [None]:
# Network connection strength level: the minimum number of times one tweeter
# must have responded to or mentioned another for that Source -> Target pair
# to be kept.
#   * 1 keeps every pair that interacted at least once.
#   * 5 keeps only pairs where the source mentioned or responded to the target
#     at least 5 times, i.e. only the strongest bonds are shown.
strengthLevel = 1

# Count the rows of `edges` per directed (Source, Target) pair and keep the
# pairs that reach the requested level.
edges2 = edges.groupby(['Source','Target'])['Strength'].count()
edges2 = edges2.reset_index()
edges2 = edges2[edges2['Strength'] >= strengthLevel]

# Every user id that appears at either end of a remaining edge.
# pd.concat is used because Series.append was removed in pandas 2.0.
ids = pd.concat([edges2['Source'], edges2['Target']]).to_frame(name='Id')
ids = ids.drop_duplicates()

nodes2 = nodes.sort_values(['Id','followers_count'], ascending=[True, False])
# De-duplicate the *sorted* frame so the row kept per Id is the one with the
# highest followers_count (previously this line restarted from `nodes` and
# threw the sort away).
nodes2 = nodes2.drop_duplicates(['Id'], keep='first') # Remove rows having different screen name but the same ID
# NOTE(review): this inner join can only return ids already present in
# `nodes`; if `nodes` was built with a higher strengthLevel, ids that appear
# only in the weaker edges are dropped here — confirm this is intended.
nodes2 = pd.merge(ids, nodes2, on='Id', how='inner')

In [348]:
# Show the (rows, columns) dimensions of the filtered edge table.
edges2.shape

(125, 3)

### 4. Export nodes and edges to csv files

In [362]:
# Column names for Kumu import (run this cell only when the output is meant for Kumu).
# The new names are assigned by position, so the column order must be unchanged.
kumu_node_columns = ['Id', 'Label', 'Date', 'Image', 'followers_count', 'friends_count']
kumu_edge_columns = ['From', 'To', 'Strength']
nodes.columns, edges2.columns = kumu_node_columns, kumu_edge_columns

In [363]:
# Show the first three rows of `nodes` as a quick check of its columns and contents.
nodes.head(3)

Unnamed: 0,Id,Label,Date,Image,followers_count,friends_count
0,1022143381,RRBoulden,Wed Dec 19 13:57:17 +0000 2012,https://abs.twimg.com/sticky/default_profile_i...,18.0,71.0
1,1051550413,susiesam1,Tue Jan 01 01:14:10 +0000 2013,https://pbs.twimg.com/profile_images/770450345...,9.0,35.0
2,109949937,TurraloonNS,Sat Jan 30 20:31:43 +0000 2010,https://pbs.twimg.com/profile_images/594973377...,112.0,77.0


In [364]:
# Show the first three rows of `edges2` as a quick check of its columns and contents.
edges2.head(3)

Unnamed: 0,From,To,Strength
22,1022143381,759251,2
38,1051550413,26487169,2
74,109949937,777879358451093504,2


In [366]:
# Write the node table and the edge table to CSV files in the working
# directory, UTF-8 encoded and without the DataFrame index column.
for frame, csv_name in ((nodes, 'nodes2.csv'), (edges2, 'edges2.csv')):
    frame.to_csv(csv_name, encoding='utf-8', index=False)