In [1]:
# ---- LIBRARY IMPORTS ----
# Standard library
import re

# Third-party
import pandas as pd

In [20]:
"""
---- RETRIEVE DATA ----
Source:  https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
"""
# Column labels, in file order, for the header-less TSV splits
column_names = [
    'id',
    'label',
    'statement',
    'subjects',
    'speaker',
    'job_title',
    'state',
    'party_affiliation',
    'credit_history_count',
    'false_count',
    'half_true_count',
    'mostly_true_count',
    'pants_on_fire_count',
    'context',
]

# Positional dtype map: columns 0-7 and 13 are read as pandas 'string',
# columns 8-12 as the nullable unsigned integer type 'UInt64'
dtypes = {position: 'string' for position in range(8)}
dtypes.update({position: 'UInt64' for position in range(8, 13)})
dtypes[13] = 'string'

# Read the train / test / validation splits with identical parser settings
statements_train_data = pd.read_csv(
    'data/statements/train.tsv',
    names=column_names,
    header=None,
    dtype=dtypes,
    sep='\t',
)
statements_test_data = pd.read_csv(
    'data/statements/test.tsv',
    names=column_names,
    header=None,
    dtype=dtypes,
    sep='\t',
)
statements_validate_data = pd.read_csv(
    'data/statements/valid.tsv',
    names=column_names,
    header=None,
    dtype=dtypes,
    sep='\t',
)

In [21]:
"""
---- CLEANING TEXT DATA ----
The following steps will be taken to clean data
- all non-alphabetical characters (other than whitespace and hyphens) will be removed
- all text will be made lowercase
- words hyphenated across a line break will be rejoined
"""

# Regex function to clean strings
def regex_cleaner(text):
    """Normalise a piece of text before it is vectorised.

    The text is lowercased, every character other than ``a``-``z``,
    whitespace and ``-`` is removed (digits and punctuation included), and a
    hyphen immediately followed by a newline is dropped so that a word split
    across a line break is rejoined.

    Parameters
    ----------
    text : str or any
        Value to clean. Anything that is not a ``str`` (for example a
        missing-value marker) is passed through untouched.

    Returns
    -------
    str or any
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    # Explicit type guard rather than a bare ``except``: non-string cells are
    # still returned as-is, but genuine errors (and KeyboardInterrupt) are no
    # longer silently swallowed.
    if not isinstance(text, str):
        return text

    # Make all text lowercase
    lowered = text.lower()
    # Keep only letters, whitespace and hyphens (digits are stripped as well)
    letters_only = re.sub(r'[^a-z\s\-]', '', lowered)
    # Rejoin words that were hyphenated across a line break
    return re.sub(r'(\-\n)', '', letters_only)

# Normalise the statement text of every split (train / test / validation)
for split_frame in (statements_train_data, statements_test_data, statements_validate_data):
    split_frame['statement'] = split_frame['statement'].apply(regex_cleaner)

# Integer codes for the six truthfulness labels
label_dict = {
    'barely-true': 0,
    'false': 1,
    'half-true': 2,
    'mostly-true': 3,
    'pants-fire': 4,
    'true': 5,
}


def label_changer(ele):
    """Return the integer code for ``ele``, or ``None`` when it is not a key of ``label_dict``."""
    return label_dict.get(ele)


# Swap the textual label for its integer code in every split
for split_frame in (statements_train_data, statements_test_data, statements_validate_data):
    split_frame['label'] = split_frame['label'].apply(label_changer)


# Load the fake / real article sets for BuzzFeed and PolitiFact
buzz_feed_fake_data = pd.read_csv(
    'data/fake_news_articles/BuzzFeed_fake_news_content.csv'
)
buzz_feed_real_data = pd.read_csv(
    'data/fake_news_articles/BuzzFeed_real_news_content.csv'
)
politi_fact_news_fake_data = pd.read_csv(
    'data/fake_news_articles/PolitiFact_fake_news_content.csv'
)
politi_fact_news_real_data = pd.read_csv(
    'data/fake_news_articles/PolitiFact_real_news_content.csv'
)

# Run regex_cleaner over the 'text' column of each article frame
for article_frame in (
    buzz_feed_fake_data,
    buzz_feed_real_data,
    politi_fact_news_fake_data,
    politi_fact_news_real_data,
):
    article_frame['text'] = article_frame['text'].apply(regex_cleaner)

In [22]:
# ---- CONVERT DATA TO BAG OF WORDS ----

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

In [28]:
# TF-IDF over unigrams and bigrams, with accent stripping set to 'ascii'
vectorizer = TfidfVectorizer(ngram_range=(1, 2), strip_accents='ascii')

# Fit on the 'text' column of the real BuzzFeed articles and keep the transformed matrix
corpus = buzz_feed_real_data['text']
X = vectorizer.fit_transform(corpus)

In [31]:
# Densify the TF-IDF matrix and wrap the resulting array as a PyTorch tensor
test = torch.from_numpy(X.toarray())

In [32]:
# Show the tensor; as the last expression of the cell it is rendered as the cell output
test

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)