# PHEME Rumor Dataset: Exploratory Data Analysis

In [1]:
# Load dependencies for this Jupyter Notebook
import os, json, errno
import pandas as pd
from pandas.io.json import json_normalize

### Prase and Clean Data
This step takes the raw PHEME rumor dataset and saves them as a cleaned CSV file. The original PHEME data consists of JSON files organized into directories by event and category (rumor or non-rumor). These three functions below clean the data and load them as a Pandas DataFrame from the "cached" CSV file.

In [4]:
def get_event_data(event, dataset="pheme-rnr-dataset", refresh=False):
    """ Fetches event data as a Pandas DataFrame. If cleaned CSV file does not exist, then create it.
    
    Params:
        - event: Name of fake news event
        - dataset: fake news dataset, default to PHEME dataset
        - refresh: if True, then reparse raw PHEME to CSV and overwrite existing CSV file.
    
    Return: Pandas DataFrame
    """
    fn = "data/%s/%s.csv" % (dataset, event)  
    if refresh:
        clean_pheme_by_event(event)
        event = pd.read_csv(fn)
    else:
        try:
            event = pd.read_csv(fn)
        except OSError as e:
            clean_pheme_by_event(event)
            event = pd.read_csv(fn)
    return event;

def clean_pheme_by_event(event):
    """ Parses json data stored in directories of the PHEME dataset into a CSV file.
    
    Params:
        - event: Name fake news event and directory name in PHEME dataset
    
    Return: None
    """
    dataset = "pheme-rnr-dataset"
    fn = "data/%s/%s.csv" % (dataset, event)
    header = True
    data = pd.DataFrame()            
    for category in os.listdir("%s/%s" % (dataset, event)):
        for thread in os.listdir("%s/%s/%s" % (dataset, event, category)):
            with open("%s/%s/%s/%s/source-tweet/%s.json" % (dataset, event, category, thread, thread)) as f:
                tweet = json.load(f)
            df = tweet_to_df(tweet, category, thread)
            data = data.append(df)
            for reaction in os.listdir("%s/%s/%s/%s/reactions" % (dataset, event, category, thread)):
                with open("%s/%s/%s/%s/reactions/%s" % (dataset, event, category, thread, reaction)) as f:
                    tweet = json.load(f)
                df = tweet_to_df(tweet, category, thread)
                data = data.append(df)
    data.to_csv(fn, index=False)
    return None

def tweet_to_df(twt, cat, thrd):
    """  Convert tweet meta-data to DataFrame instance
    
    Params:
        - twt: The new tweet to add to the table
        - cat: The category of the tweet, either rumor or non rumor
        - thrd: The thread id of the tweet
    """
    
    return pd.DataFrame([{
        "thread": thrd,
        "tweet_length": len(twt.get("text","")),
        "text": twt.get("text"),
        "id": twt.get("id"),
        "in_reply_id": twt.get("in_reply_to_status_id", None),
        "in_reply_user": twt.get("in_reply_to_user", None),
        "favorite_count": twt.get("favorite_count"),
        "retweeted": twt.get("retweeted"),
        "coordinates": twt.get("coordinates"),
        "user.tweets_count": twt["user"]["statuses_count"],
        "user.followers_count": twt["user"]["followers_count"],
        "has_url": True if len(twt["entities"]["urls"]) > 0 else False,
        "is_rumor": True if cat is "rumour" else False,
        "retweet_count": twt.get("retweet_count"),
        "symbols_count": len(twt["entities"]["symbols"]),
        "mentions_count": len(twt["entities"]["user_mentions"]),
        "hashtags_count": len(twt["entities"]["hashtags"]),                                          
        "urls_count": len(twt["entities"]["urls"]),
        "user.friends_count": twt["user"]["friends_count"],
        "created": twt.get("created_at"),
        "lang": twt.get("lang")
    }])

In [5]:
charliehebdo = get_event_data("charliehebdo", refresh=True)
charliehebdo.head()

Unnamed: 0,coordinates,created,favorite_count,has_url,hashtags_count,id,in_reply_id,in_reply_user,is_rumor,lang,...,retweet_count,retweeted,symbols_count,text,thread,tweet_length,urls_count,user.followers_count,user.friends_count,user.tweets_count
0,,Wed Jan 07 11:11:33 +0000 2015,41.0,False,0.0,5.527846e+17,,,False,en,...,202.0,False,0.0,Charlie Hebdo became well known for publishing...,5.527846e+17,82.0,0.0,41591.0,2268.0,15128.0
1,,Wed Jan 07 11:14:08 +0000 2015,0.0,False,0.0,5.527852e+17,5.527846e+17,,False,en,...,0.0,False,0.0,"Now 10 dead in a shooting there today RT ""@BBC...",5.527846e+17,138.0,0.0,4671.0,4954.0,5064.0
2,,Wed Jan 07 11:20:08 +0000 2015,0.0,False,0.0,5.527868e+17,5.527846e+17,,False,en,...,0.0,False,0.0,@BBCDanielS @BBCWorld I'm guessing this is bei...,5.527846e+17,93.0,0.0,59.0,113.0,2170.0
3,,Wed Jan 07 11:20:18 +0000 2015,0.0,False,1.0,5.527868e+17,5.527846e+17,,False,en,...,1.0,False,0.0,@BBCDanielS @BBCWorld why would you mention th...,5.527846e+17,95.0,0.0,96357.0,385.0,12064.0
4,,Wed Jan 07 11:20:54 +0000 2015,0.0,False,0.0,5.52787e+17,5.527846e+17,,False,en,...,0.0,False,0.0,@BBCDanielS @BBCWorld perps identified?,5.527846e+17,39.0,0.0,751.0,1332.0,13256.0
