In [165]:
# ---- LIBRARY IMPORTS ----
import pandas as pd
import re

In [168]:
"""
---- RETRIEVE DATA ----
Source:  https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
"""
# Column labels, in file order (the files are read with header=None, so
# these names are assigned positionally)
column_names = [
    'id', 'label', 'statement', 'subjects', 'speaker', 'job_title', 'state',
    'party_affiliation', 'credit_history_count', 'false_count', 'half_true_count',
    'mostly_true_count', 'pants_on_fire_count', 'context',
]

# Data types keyed by column position: positions 8-12 are nullable unsigned
# integers, every other position is a pandas string column
dtypes = {
    position: 'UInt64' if 8 <= position <= 12 else 'string'
    for position in range(len(column_names))
}

# Keyword arguments shared by all three tab-separated splits
_tsv_options = {'sep': '\t', 'header': None, 'dtype': dtypes, 'names': column_names}

# Retrieve the train / test / validation splits
statements_train_data = pd.read_csv('data/statements/train.tsv', **_tsv_options)
statements_test_data = pd.read_csv('data/statements/test.tsv', **_tsv_options)
statements_validate_data = pd.read_csv('data/statements/valid.tsv', **_tsv_options)

In [176]:
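"""
---- CLEANING TEXT DATA ----
The following steps are taken to clean the text data:
- all text is made lowercase
- every character other than a-z, 0-9, whitespace and hyphens is removed
- words hyphenated across a line break (a hyphen followed by a newline) are rejoined
"""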

# Regex function to clean strings
def regex_cleaner(text):
    """Normalise a single text value.

    Steps, in order:
      1. lowercase the text;
      2. drop every character that is not a-z, 0-9, whitespace or a hyphen;
      3. rejoin words that were hyphenated across a line break by removing
         each hyphen that is immediately followed by a newline.

    Parameters
    ----------
    text : str or any
        The value to clean. Values that are not ``str`` (for example None,
        NaN or ``pd.NA`` in a column with missing entries) are not text and
        are returned unchanged.

    Returns
    -------
    str or any
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Explicit pass-through for non-string cells. This replaces a bare
    # `except:` that also swallowed KeyboardInterrupt/SystemExit and would
    # have hidden any genuine error raised while cleaning.
    if not isinstance(text, str):
        return text

    # Make all text lowercase
    text_lowercase = text.lower()
    # Remove everything except lowercase letters, digits, whitespace and hyphens
    text_alphanumeric = re.sub(r'[^a-z0-9\s\-]', '', text_lowercase)
    # Combine words that were split over a line break with a trailing hyphen
    clean_text = re.sub(r'(\-\n)', '', text_alphanumeric)
    return clean_text

# Cleaning Statements Data
# Run the same cleaner over the 'statement' column of every split,
# writing the result back into that column.
for statements_split in (statements_train_data, statements_test_data, statements_validate_data):
    statements_split['statement'] = statements_split['statement'].apply(regex_cleaner)

# Clean labels
# Integer code assigned to each label string
label_dict = {
    'barely-true': 0,
    'false': 1,
    'half-true': 2,
    'mostly-true': 3,
    'pants-fire': 4,
    'true': 5,
}


def label_changer(ele):
    """Return the integer code for label `ele`, or None if it is not in `label_dict`."""
    return label_dict.get(ele)


# Replace the 'label' column of every split with its integer codes
for labelled_split in (statements_train_data, statements_test_data, statements_validate_data):
    labelled_split['label'] = labelled_split['label'].apply(label_changer)


# Cleaning Fake News Articles Data
# Load the four article collections (fake / real for each of the two sources)
buzz_feed_fake_data = pd.read_csv('data/fake_news_articles/BuzzFeed_fake_news_content.csv')
buzz_feed_real_data = pd.read_csv('data/fake_news_articles/BuzzFeed_real_news_content.csv')
politi_fact_news_fake_data = pd.read_csv('data/fake_news_articles/PolitiFact_fake_news_content.csv')
politi_fact_news_real_data = pd.read_csv('data/fake_news_articles/PolitiFact_real_news_content.csv')

# Apply the text cleaner to the 'text' column of each collection
article_frames = (
    buzz_feed_fake_data,
    buzz_feed_real_data,
    politi_fact_news_fake_data,
    politi_fact_news_real_data,
)
for article_frame in article_frames:
    article_frame['text'] = article_frame['text'].apply(regex_cleaner)

In [177]:
# Display the PolitiFact real-news frame (its 'text' column was run through
# regex_cleaner in the previous cell) as this cell's output.
politi_fact_news_real_data

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Trump Just Insulted Millions Who Lost Everythi...,168k shares share this story\n\nhillary clinto...,http://occupydemocrats.com/2016/09/27/trump-ju...,http://occupydemocrats.com/wp-content/uploads/...,"Brett Bose,Grant Stern,Steve Bernstein,Natalie...",http://occupydemocrats.com,{'$date': 1474934400000},,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...,"{""generator"": ""Powered by Visual Composer - dr..."
1,Real_10-Webpage,Famous dog killed in spot she waited a year fo...,famous dog killed in spot she waited a year fo...,http://rightwingnews.com/top-news/famous-dog-k...,http://rightwingnews.com/wp-content/uploads/20...,,http://rightwingnews.com,{'$date': 1474948336000},,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/top-news/famous-dog-k...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
2,Real_100-Webpage,House oversight panel votes Clinton IT chief i...,story highlights the house oversight panel vot...,http://cnn.it/2deaH2d,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Tom Lobianco,Deirdre Walsh",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/17050...,http://www.cnn.com/2016/09/22/politics/bryan-p...,"{""description"": ""Members of the House Oversigh..."
3,Real_101-Webpage,America Just Tragically Lost A Country Music I...,we are absolutely heartbroken to hear about th...,http://newsbake.com/entertainment-news/music-e...,http://newsbake.com/wp-content/uploads/2016/05...,Nancy Wells,http://newsbake.com,{'$date': 1474898600000},https://www.youtube.com/embed/8ozTJcu-_BU,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/music-e...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
4,Real_102-Webpage,Monuments to the Battle for the New South,nine years ago a driver lost control of his pi...,http://politi.co/2dd9U1x,http://static.politico.com/25/ed/85332de14c45b...,"Jack Shafer,Lisa Rab",http://politi.co,{'$date': 1473941820000},,http://static.politico.com/25/ed/85332de14c45b...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""Virginia, increasingly divers..."
...,...,...,...,...,...,...,...,...,...,...,...,...
115,Real_95-Webpage,"Donald Trump, Germany’s disfavored son – POLITICO",kallstadt germany few places in germany are a...,http://politi.co/2csN1WG,http://www.politico.eu/wp-content/uploads/2016...,"Matthew Karnitschnig,Janosch Delcker",http://politi.co,{'$date': 1474601447000},,http://g8fip1kplyr33r3krz5b97d1.wpengine.netdn...,http://www.politico.eu/article/donald-trump-an...,"{""description"": ""In the idyllic hamlet of Kall..."
116,Real_96-Webpage,BREAKING: Hollywood Legend Just Died Of Terrib...,hollywood loses yet another one of their deare...,http://newsbake.com/entertainment-news/tv/brea...,http://newsbake.com/wp-content/uploads/2016/09...,Nancy Wells,http://newsbake.com,{'$date': 1474250978000},https://www.youtube.com/embed/pcj4boVT4fc,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/tv/brea...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
117,Real_97-Webpage,Worst. President. Ever.,as my 25th wedding anniversary approached i tr...,http://politi.co/2d1qa5t,http://static.politico.com/2f/8c/e44158e84ee8a...,"Jack Shafer,Robert Strauss",http://politi.co,{'$date': 1474196700000},,"data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEK...",http://www.politico.com/magazine/story/2016/09...,"{""description"": ""People are debating who will ..."
118,Real_98-Webpage,Don King drops N-word while introducing Donald...,story highlights trump was sitting in a chair ...,http://cnn.it/2dh5vq9,http://i2.cdn.cnn.com/cnnnext/dam/assets/16092...,Jeremy Diamond,http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/12102...,http://www.cnn.com/2016/09/21/politics/don-kin...,"{""description"": ""The controversial boxing prom..."
