In [1]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
import collections
import itertools
from collections import Counter
import tldextract
import numpy as np
#snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
# web
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import time
import json

ABSTAIN = -1
FAKE = 0
REAL = 1

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sefilipi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
data = pd.read_csv(r'..\data\FNID-dataset\dataset\fake news detection(FakeNewsNet)\fnn_train.csv')
data.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label_fnn
0,3106,2011-01-25T06:00:00-05:00,Joe Wilkinson,A national organization says Georgia has one o...,['http://www.ajc.com/news/georgia-politics-ele...,['A coalition of government watchdog groups la...,A coalition of government watchdog groups last...,fake
1,5655,2012-04-02T11:42:20-04:00,Rick Scott,"Says Barack Obama's health care law ""will be t...",['http://www.youtube.com/watch?v=TaC0mKApf9Q&f...,['As Supreme Court justices embarked on three ...,As Supreme Court justices embarked on three da...,fake
2,3506,2011-04-01T09:49:05-04:00,J.D. Alexander,Says the Southwest Florida Water Management Di...,['http://www.tampabay.com/news/politics/gubern...,"[""Here's a new one: The Senate budget committe...",Here's a new one: The Senate budget committee ...,fake


In [3]:
data.shape

(15212, 8)

### Converting the label to numbers, to use it for the validation

In [4]:
data["label_numeric"] = data.apply(lambda row: 1 if row["label_fnn"]=='real' else 0, axis=1)

### Initializing the sentiment analysis package, to use later

In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

### Retriving the labels or valuable information from each site

In [6]:
# contacts a url, downloads the website's content and parses it.  
def get_parsed_html(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    parsed_html = BeautifulSoup(webpage)
    return parsed_html

### www.politifact.com

In [7]:
def get_poitifact_image_alt(url):
    result = "abstain"
    try:
        parsed_html = get_parsed_html(url)
        div = parsed_html.body.find('div', attrs={'class':'m-statement__meter'})
        result = div.find("img", attrs={'class':'c-image__original'})["alt"]
        time.sleep(3)
    except Exception as e:
        print(e)
    return result

### www.snopes.com

In [8]:
def get_snopes_image_alt(url):
    result = "abstain"
    try:
        parsed_html = get_parsed_html(url)
        div = parsed_html.body.find('div', attrs={'class':'media rating'})
        result = div.find("img")["alt"]
    except Exception as e:
        print(e)
    return result

### www.factcheck.org

In [9]:
def get_factcheck_first_paragraph(url):
    result = "abstain"
    try:
        parsed_html = get_parsed_html(url)
        div = parsed_html.body.find('div', attrs={'class':'entry-content'})
        # if the first paragraph starts with 'Q:' and the second with 'A:' than it is a Q & A style; 
        # take the second paragraph
        # otherwise take the first.
        parag = div.find_all("p")
        if(parag[0].text[0:3] == 'Q: ' and parag[1].text[0:3] == 'A: '):           
            return parag[1].text
        return parag[0].text
    except Exception as e:
        print(e)
    return result

### www.factcheck.afp.com

In [10]:
def get_factcheck_afp_title(url):
    result = "abstain"
    try:
        parsed_html = get_parsed_html(url)
        h3 = parsed_html.body.find('h3')
        return h3.text
    except Exception as e:
        print(e)
    return result

### www.twitter.com

In [11]:
def extract_twitter_name(url):
    start = url.find('https')
    sub = url[20+start:len(url)] # removing 'https://twitter.com/'
    index = sub.find('/')
    if(index == -1):
        return sub
    else:
        return sub[:index]

### Retrieving urls of fact checking sites

In [12]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import time

In [13]:
fact_checking_sites = {
    "www.politifact.com" : get_poitifact_image_alt,
    "www.snopes.com": get_snopes_image_alt,
    "www.twitter.com":  extract_twitter_name,
    "www.factcheck.org": get_factcheck_first_paragraph,
    "factcheck.afp.com": get_factcheck_afp_title,
    "www.washingtonpost.com/news/fact-checker/": None,
    "www.realclearpolitics.com": None,
    "www.glennbeck.com": None,
}

In [14]:
def sources_as_list(source, domain):
    urls = source[1:-1].split(',')
    u = []
    for url in urls:
        if domain in url:
            u.append(url)
    return u

In [15]:
# Initialize the new columns
for site in fact_checking_sites: 
      data[site] = None
data_size = data.shape[0]
data_size

15212

In [16]:
fact_checking_sites_results = {
    "www.politifact.com" : [None] * data_size,
    "www.snopes.com": [None] * data_size,
    "www.twitter.com":  [None] * data_size,
    "www.factcheck.org": [None] * data_size,
    "factcheck.afp.com": [None] * data_size,
    "www.washingtonpost.com/news/fact-checker/": [None] * data_size,
    "www.realclearpolitics.com": [None] * data_size,
    "www.glennbeck.com": [None] * data_size,
}

In [None]:
# Iterate through the records
# and looks through the sources for each fact-checking site
#
# Commented out because it takes hours to run (the sites will throttle too many requests)
# the results are presented below.
''' 
with open("factchecking_results.txt", "a") as results:
    for i, row in data.iterrows():
        for site in fact_checking_sites: 
            sources = sources_as_list(row["sources"], site)
            if len(sources) != 0:
                #print("{}".format(i))
                labels = ""
                for source in sources:
                    handler = fact_checking_sites[site]
                    if handler:
                        #print("Handling: {} ++++++++++++++++++++++++++".format(site))
                        source = str(source).strip()[1:-1]
                        if(len(labels) > 0):
                            labels += ", "+handler(str(source))
                        else:
                            labels += handler(str(source))
                        #print("Result: {} ++++++++++++++++++++++++++".format(labels))
                    else:
                        if(len(labels) > 0):
                            #print("Handling: {} ++++++++++++++++++++++++++".format(site))
                            labels += ", "+ source
                        else:
                            labels += source
                    #print("Result: {} ++++++++++++++++++++++++++".format(labels))
                fact_checking_sites_results[site][i] =labels
                print("{} | {} | {}".format(i, site, labels))
                results.write("{} | {} | {}\n".format(i, site, labels))
'''

In [17]:
for site in fact_checking_sites: 
      data[site] = fact_checking_sites_results[site]

In [18]:
### ALTERNATIVE TO THE TWO CELL ABOVE, IF LOADING FROM THE FILE
apiResultsFile = open("apiResults.txt", "r", encoding='utf-8')
for line in apiResultsFile:
    try:
        sr = line.split("|")
        row = int(sr[0].strip())
        col = sr[1].strip()
        data.at[row,col] = sr[2]
    except Exception as e:
        print(e)
apiResultsFile.close()

## Crowdsourcing - reading the results from the rated files, and adding them to the dataset

### www.glennbeck.com

In [19]:
glenbeck_ratings = pd.read_csv(r"..\data\glennbeck_ratings.csv");

for i, row in glenbeck_ratings.iterrows():
    data.loc[data["id"] == row["id"],["www.glennbeck.com"]] = row["www.glennbeck.com"]

### www.realclearpolitics.com/

In [20]:
rp_ratings = pd.read_csv(r"..\data\realclearpolitics_ratings.csv");

for i, row in rp_ratings.iterrows():
    data.loc[data["id"] == row["id"],["www.realclearpolitics.com"]] = row["www.realclearpolitics.com"]

### www.washingtonpost.com/news/fact-checker/

In [21]:
wp_ratings = pd.read_csv(r"..\data\washingtonpost_ratings.csv");

for i, row in wp_ratings.iterrows():
    data.loc[data["id"] == row["id"],["www.washingtonpost.com/news/fact-checker/"]] = row["www.washingtonpost.com/news/fact-checker/"]

# Learning the labels with Snorkel

In [23]:
@labeling_function()
def label_snopes(row):
    label = row["www.snopes.com"]
    if label is not None:
        label = str(row["www.snopes.com"])
        if ('real' in label):
            return REAL
        else: 
            return FAKE
    else: 
        return ABSTAIN

In [24]:
@labeling_function()
def label_wp(row):
    label = row["www.washingtonpost.com/news/fact-checker/"]
    if label is not None:
        label = str(row["www.washingtonpost.com/news/fact-checker/"])
        if ('real' in label):
            return REAL
        else: 
            return FAKE
    else: 
        return ABSTAIN

In [25]:
@labeling_function()
def label_rp(row):
    label = row["www.realclearpolitics.com"]
    if label is not None:
        label = str(row["www.realclearpolitics.com"])
        if ('real' in label):
            return REAL
        else: 
            return FAKE
    else: 
        return ABSTAIN

In [26]:
truth_o_meter = {
    "true": 4,
    "mostly-true": 3,
    "half-true": 2,
    "barely-true": 1,
    "mostly-false": -1,
    "false": -2,
    "pants-fire": -3    
}
@labeling_function()
def label_politifact(row):
    total_score = 0
    labels = row["www.politifact.com"]
    #print(labels)
    if(labels):
        labels = str(row["www.politifact.com"]).split(',')
        # The last label has the newline character
        if(len(labels) > 0):
            labels[-1] = labels[-1][:-2]
        for label in labels:
            #print(label)
            label = label.strip()
            if(label in truth_o_meter):
                total_score += truth_o_meter[label]                
    #print("score: {} ".format(total_score))          
    if(total_score > 0):
        return REAL
    if(total_score < 0): 
        return FAKE
    
    return ABSTAIN

In [27]:
def facktcheck_sentiment(row, columnName):
    label = str(row[columnName])
    score = 0
    if(label):
        claims = label[1:-1].split(',')
        for claim in claims:
            #print(claim)
            sentiment = sid.polarity_scores(claim)
            #print(sentiment)
            if(sentiment["neg"] > sentiment["pos"]):
                score -=1
            elif(sentiment["pos"] > sentiment["neg"]):
                score +=1
        if(score > 0):
            return REAL
        elif (score < 0):
            return FAKE
        else:
            return ABSTAIN
    return ABSTAIN

In [28]:
@labeling_function()
def facktcheckqa_sentiment(row):
    return facktcheck_sentiment(row, "www.factcheck.org")

In [29]:
@labeling_function()
def facktcheckafpqa_sentiment(row):
    return facktcheck_sentiment(row, "factcheck.afp.com")

### Transfer Learning from the liar dataset

In [30]:
# Load the Liar dataset
liar = pd.read_csv(r'..\data\FNID-dataset\dataset\fake news detection(LIAR)\liar_train.csv')
liar.head(3)

Unnamed: 0,id,date,speaker,statement,sources,paragraph_based_content,fullText_based_content,label-liar
0,18178,2020-03-18T13:26:42-04:00,Instagram posts,"""COVID-19 started because we eat animals.""",['https://www.cdc.gov/coronavirus/2019-ncov/ca...,['Vegan Instagram users are pinning the 2019 c...,Vegan Instagram users are pinning the 2019 cor...,barely-true
1,3350,2011-03-04T09:12:59-05:00,Glenn Beck,Says Michelle Obama has 43 people on her staff...,['http://www.glennbeck.com/2011/02/25/while-wo...,['Glenn Beck rekindled a falsehood about the s...,Glenn Beck rekindled a falsehood about the siz...,pants-fire
2,14343,2017-07-21T11:52:44-04:00,Mike Pence,"Says President Donald Trump ""has signed more l...",['https://nrf.com/events/retail-advocates-summ...,['Vice President Mike Pence says that when it ...,Vice President Mike Pence says that when it co...,half-true


In [31]:
# check the unique labels
labels = liar["label-liar"].unique()
labels

array(['barely-true', 'pants-fire', 'half-true', 'mostly-true', 'true',
       'false'], dtype=object)

In [32]:
counts = {}
# true speakers
counts_true = collections.Counter(liar[(liar["label-liar"]=="mostly-true") | (liar["label-liar"]=="true")]["speaker"])
counts_true = dict(counts_true.most_common())
# false speakers
counts_false = collections.Counter(liar[(liar["label-liar"]=="false" )| (liar["label-liar"]=="pants-fire")]["speaker"])
counts_false = dict(counts_false.most_common())

In [33]:
false_percent = {}
for k, v in counts_false.items():
    total = v
    if k in counts_true:
        total += counts_true[k]
    false_percent[k] = v/total

In [34]:
true_percent = {}
for k, v in counts_true.items():
    total = v
    if k in counts_false:
        total += counts_false[k]
    true_percent[k] = v/total

In [35]:
@labeling_function()
def speaker(row):
    speaker = row["speaker"]
    if(speaker in true_percent and true_percent[speaker] > 0.6):
        return REAL
    if(speaker in false_percent and false_percent[speaker] > 0.6):
        return FAKE
    return ABSTAIN

## Training the snorkel model

In [36]:
data = data.sample(frac = 1, random_state=1)
df_train = data[:12170]
df_valid = data[12170:]

lfs = [
        label_rp,
        label_wp, 
        label_snopes,
        label_politifact,
        facktcheckqa_sentiment,
        facktcheckafpqa_sentiment,
        speaker
      ]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████████████████████████████████████████████████████████████████████| 12170/12170 [00:01<00:00, 8529.45it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
label_rp,0,"[0, 1]",0.00797,0.006984,0.002219
label_wp,1,"[0, 1]",0.00986,0.008874,0.003122
label_snopes,2,[0],0.027691,0.026952,0.004108
label_politifact,3,"[0, 1]",0.244618,0.184717,0.071076
facktcheckqa_sentiment,4,"[0, 1]",0.020707,0.019967,0.010682
facktcheckafpqa_sentiment,5,"[0, 1]",0.000822,0.000822,0.000493
speaker,6,"[0, 1]",0.721282,0.21627,0.075678


In [1]:
majority_model = MajorityLabelVoter()
preds_train_majority = majority_model.predict(L=L_train)
L_valid = applier.apply(df=df_valid)

Y_valid = df_valid["label_numeric"].values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

NameError: name 'MajorityLabelVoter' is not defined

In [38]:
label_model = LabelModel()
label_model.fit(L_train=L_train, n_epochs=100, log_freq=100, seed=123)
preds_train_label = label_model.predict(L=L_train)
preds_valid_label = label_model.predict(L=L_valid)
L_valid = applier.apply(df_valid)

Y_valid = df_valid["label_numeric"].values
f1_micro = label_model.score(L_valid, Y_valid, metrics=["f1_micro"])
accuracy = label_model.score(L_valid, Y_valid, metrics=["accuracy"])
recall = label_model.score(L_valid, Y_valid, metrics=["recall"])
precision = label_model.score(L_valid, Y_valid, metrics=["precision"])

print("{} {} {} {}".format(f1_micro, accuracy, recall, precision))

AttributeError: 'Graph' object has no attribute 'node'

In [39]:
Y_valid = df_valid["label_numeric"].values
f1_micro = majority_model.score(L_valid, Y_valid, metrics=["f1_micro"])
accuracy = majority_model.score(L_valid, Y_valid, metrics=["accuracy"])
recall = majority_model.score(L_valid, Y_valid, metrics=["recall"])
precision = majority_model.score(L_valid, Y_valid, metrics=["precision"])

print("{} {} {} {}".format(f1_micro, accuracy, recall, precision))



{'f1_micro': 0.7560199909132213} {'accuracy': 0.7560199909132213} {'recall': 0.8211091234347049} {'precision': 0.7314741035856573}


In [134]:
snorkel_predictions = np.concatenate((preds_train_label,preds_valid_label))
snorkel_predictions.shape

NameError: name 'preds_train_label' is not defined

In [255]:
data["snorkel_labels"] =snorkel_predictions

In [256]:
data.to_csv("data_nlp.csv")