# PHEME Rumor Dataset: Exploratory Data Analysis

In [114]:
# Load dependencies for this Jupyter Notebook
import os, json, errno
import pandas as pd

### Parse and Clean Data
This step takes the raw PHEME rumor dataset and saves it as a cleaned CSV file. The original PHEME data consists of JSON files organized into directories by event and category (rumor or non-rumor). The two functions below parse the raw data into a "cached" CSV file and load that file as a Pandas DataFrame.

In [191]:
def get_event_data(event, dataset="pheme-rnr-dataset", refresh=False):
    """ Fetches event data as a Pandas DataFrame. If cleaned CSV file does not exist, then create it.
    
    Params:
        - event: Name of fake news event
        - dataset: fake news dataset, default to PHEME dataset
        - refresh: if True, then reparse raw PHEME to CSV and overwrite existing CSV file.
    
    Return: Pandas DataFrame read from "data/<dataset>/<event>.csv"
    """
    fn = "data/%s/%s.csv" % (dataset, event)
    if not refresh:
        try:
            return pd.read_csv(fn)
        except OSError:
            # Cached CSV is missing or unreadable: fall through and rebuild it.
            pass
    # Forward `dataset` so the CSV is rebuilt at the same path that is read back below.
    clean_pheme_by_event(event, dataset)
    return pd.read_csv(fn)

def clean_pheme_by_event(event, dataset="pheme-rnr-dataset"):
    """ Parses json data stored in directories of the PHEME dataset into a CSV file.
    
    Reads "<dataset>/<event>/<category>/<thread>/source-tweet/<thread>.json" for every
    thread directory and writes one row per source tweet to "data/<dataset>/<event>.csv".
    Entries that are not directories (stray files next to the category or thread
    folders) are skipped.
    
    Params:
        - event: Name fake news event and directory name in PHEME dataset
        - dataset: directory holding the raw dataset, default to PHEME dataset
    
    Return: None
    """
    event_dir = os.path.join(dataset, event)
    fn = "data/%s/%s.csv" % (dataset, event)
    records = []
    # Sorted listings make the row order of the CSV deterministic across file systems.
    for category in sorted(os.listdir(event_dir)):
        category_dir = os.path.join(event_dir, category)
        if not os.path.isdir(category_dir):
            continue
        # Compare by value: `is` tests object identity and is not reliable for strings.
        # NOTE(review): PHEME releases appear to name this folder "rumours" (plural);
        # both spellings are accepted here -- confirm against the local copy.
        is_rumor = category in ("rumour", "rumours")
        for thread in sorted(os.listdir(category_dir)):
            thread_dir = os.path.join(category_dir, thread)
            if not os.path.isdir(thread_dir):
                continue
            source_path = os.path.join(thread_dir, "source-tweet", "%s.json" % thread)
            with open(source_path, encoding="utf-8") as f:
                tweet = json.load(f)
            records.append({
                "followers_count": tweet["user"]["followers_count"],
                "is_rumor": is_rumor,
                "retweet_count": tweet["retweet_count"],
                "symbols_count": len(tweet["entities"]["symbols"]),
                "mentions_count": len(tweet["entities"]["user_mentions"]),
                "hashtags_count": len(tweet["entities"]["hashtags"]),
                "urls_count": len(tweet["entities"]["urls"]),
                "favorite_count": tweet["favorite_count"],
                "friends_count": tweet["user"]["friends_count"],
                "statuses_count": tweet["user"]["statuses_count"],
                "created": tweet["created_at"],
                "lang": tweet["lang"]
            })
    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(fn), exist_ok=True)
    # Build the frame once from all records instead of appending row by row.
    pd.DataFrame(records).to_csv(fn, index=False)
    return None

In [197]:
# Load the (cached) source tweets of the "charliehebdo" event and preview the first rows.
charliehebdo = get_event_data(event="charliehebdo")
charliehebdo.head(5)

Unnamed: 0,created,favorite_count,followers_count,friends_count,hashtags_count,is_rumor,lang,mentions_count,retweet_count,statuses_count,symbols_count,urls_count
0,Wed Jan 07 11:11:33 +0000 2015,41,41591,2268,0,False,en,0,202,15128,0,0
1,Wed Jan 07 11:12:44 +0000 2015,20,139847,30,0,False,en,0,144,58363,0,1
2,Wed Jan 07 11:14:42 +0000 2015,62,22568,1127,1,False,en,0,103,20575,0,0
3,Wed Jan 07 11:17:35 +0000 2015,22,139870,30,0,False,en,0,116,58370,0,1
4,Wed Jan 07 11:18:18 +0000 2015,55,2327,1270,1,False,en,0,105,20722,0,0
