## Deception Detection

In [1]:
import pandas as pd
import numpy as np
import os
import pickle as pkl

Few stuffs: 

The label: **"is_deception"**

1 --> yes, that's deception

0 --> No, not deception. 

In [101]:
def process_dataset_and_save_pkl(df, text="text", label="label", 
                                 group="None", subgroup="None", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100):
    
    '''This function will create the 10% data for test cases. 
    ****Important**** It is only for those datasets, that doesnt specify a test or validation set
    
    text: The name of the text field. 
    label: The name of the label field
    group: Name of the dataset it belongs to
    subgroup: If there's a multiple source in the dataset, it accounts for that
    '''
    
    
    df_train = df.sample(frac=0.9, random_state=random_state)
    df_test = df.drop(df_train.index)
    df_train["group"] = df_test["group"] = group
    df_train["subgroup"] = df_test["subgroup"]  = subgroup
    df_train.rename(columns={text: "text", label: "is_deception"}, inplace=True)
    df_test.rename(columns={text: "text", label: "is_deception"}, inplace=True)
    Name_train = saved_data_folder + "train_" + group + "_" + subgroup + ".pkl"
    Name_test = saved_data_folder + "test_" + group + "_" + subgroup + ".pkl"
    df_train.to_pickle(Name_train)
    df_test.to_pickle(Name_test)
    return df_train, df_test   

In [87]:
def rename_and_save_pkl(df_train, df_test, text="text", label="label", 
                                 group="None", subgroup="None", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/"):
    
    '''
    This function will take in the preprocessed train and test data, rename their name and save the pickle.
    ****Improtant**** If you donot have dataset-defined train/val/test data, donot use it.
    
    df_train: The train data provided with the dataset
    df_test: The test data provided with the dataset
    text: The name of the text field. 
    label: The name of the label field
    group: Name of the dataset it belongs to
    subgroup: If there's a multiple source in the dataset, it accounts for that
    '''
    
    df_train["group"] = df_test["group"] = group
    df_train["subgroup"] = df_test["subgroup"]  = subgroup
    df_train.rename(columns={text: "text", label: "is_deception"}, inplace=True)
    df_test.rename(columns={text: "text", label: "is_deception"}, inplace=True)
    Name_train = saved_data_folder + "train_" + group + "_" + subgroup + ".pkl"
    Name_test = saved_data_folder + "test_" + group + "_" + subgroup + ".pkl"
    df_train.to_pickle(Name_train)
    df_test.to_pickle(Name_test)
    return df_train, df_test   

## Dataset category 1.1: AAAI--COVID-19

In [70]:
path = "/disk2/sadat/FakeNewsData/COVID19/"
os.listdir(path)

['Constraint_English_Train - Sheet1.csv',
 'Constraint_English_Val.csv',
 'COVID Fake News Data.csv']

In [81]:
AAAI_COVID_train = pd.read_csv(path + "Constraint_English_Train - Sheet1.csv")
AAAI_COVID_train.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [82]:
AAAI_COVID_val = pd.read_csv(path + "Constraint_English_Val.csv")
AAAI_COVID_val.head()

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


In [83]:
print("AAAI_COVID_train shape is: " + str(AAAI_COVID_train.shape))
print("AAAI_COVID_val shape is: " + str(AAAI_COVID_val.shape))

AAAI_COVID_train shape is: (6420, 3)
AAAI_COVID_val shape is: (2140, 3)


### We are going to use the val as test set

In [84]:
AAAI_COVID_train.label = AAAI_COVID_train.label.apply(lambda x:1 if x=="fake" else 0)
AAAI_COVID_val.label = AAAI_COVID_val.label.apply(lambda x:1 if x=="fake" else 0)

In [88]:
train, val = rename_and_save_pkl(AAAI_COVID_train, AAAI_COVID_val, text="tweet", label="label", 
                                 group="COVID", subgroup="AAAI", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/")

## Dataset category 1.2: zenodo--COVID-19

In [93]:
zenodo = pd.read_csv(path + "COVID Fake News Data.csv")

In [94]:
zenodo.outcome.value_counts()

0    9727
1     474
Name: outcome, dtype: int64

In [95]:
zenodo.head()

Unnamed: 0,headlines,outcome
0,A post claims compulsory vacination violates t...,0
1,A photo claims that this person is a doctor wh...,0
2,Post about a video claims that it is a protest...,0
3,All deaths by respiratory failure and pneumoni...,0
4,The dean of the College of Biologists of Euska...,0


In [96]:
zenodo["outcome"] = zenodo.outcome.apply(lambda x:1 if x==0 else 0)

In [102]:
zenodo_train, zenodo_test = process_dataset_and_save_pkl(zenodo, text="headlines", label="outcome", 
                                 group="COVID", subgroup="zenodo", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

## Dataset category 2.1: FakeNewsNet--politifact


In [104]:
path = "/disk2/sadat/FakeNewsData/FakeNewsNet/"

In [112]:
def read_process_FakeNewsNet_data(path, source=None):
    source_real = pd.read_csv(path + source + "_real.csv")
    source_real["label"] = 0
    source_fake = pd.read_csv(path + source + "_fake.csv")
    source_fake["label"] = 1
    df = pd.concat([source_real, source_fake], axis=0)
    df.reset_index(drop=True, inplace=True)
    return df

In [113]:
politifact = read_process_FakeNewsNet_data(path, source="politifact")

In [114]:
politifact.label.value_counts()

0    624
1    432
Name: label, dtype: int64

In [115]:
politifact.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact14984,http://www.nfib-sbet.org/,National Federation of Independent Business,967132259869487105\t967164368768196609\t967215...,0
1,politifact12944,http://www.cq.com/doc/newsmakertranscripts-494...,comments in Fayetteville NC,942953459\t8980098198\t16253717352\t1668513250...,0
2,politifact333,https://web.archive.org/web/20080204072132/htt...,"Romney makes pitch, hoping to close deal : Ele...",,0
3,politifact4358,https://web.archive.org/web/20110811143753/htt...,Democratic Leaders Say House Democrats Are Uni...,,0
4,politifact779,https://web.archive.org/web/20070820164107/htt...,"Budget of the United States Government, FY 2008",89804710374154240\t91270460595109888\t96039619...,0


In [116]:
poli_train, poli_test = process_dataset_and_save_pkl(politifact, text="title", label="label", 
                                 group="FakeNewsNet", subgroup="Politifact", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

## Dataset category 2.2: FakeNewsNet--Gossipcop


In [118]:
gossipcop = read_process_FakeNewsNet_data(path, source="gossipcop")

In [119]:
gossipcop.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,gossipcop-882573,https://www.brides.com/story/teen-mom-jenelle-...,Teen Mom Star Jenelle Evans' Wedding Dress Is ...,912371411146149888\t912371528343408641\t912372...,0
1,gossipcop-875924,https://www.dailymail.co.uk/tvshowbiz/article-...,Kylie Jenner refusing to discuss Tyga on Life ...,901989917546426369\t901989992074969089\t901990...,0
2,gossipcop-894416,https://en.wikipedia.org/wiki/Quinn_Perkins,Quinn Perkins,931263637246881792\t931265332022579201\t931265...,0
3,gossipcop-857248,https://www.refinery29.com/en-us/2018/03/19192...,I Tried Kim Kardashian's Butt Workout & Am For...,868114761723936769\t868122567910936576\t868128...,0
4,gossipcop-884684,https://www.cnn.com/2017/10/04/entertainment/c...,Celine Dion donates concert proceeds to Vegas ...,915528047004209152\t915529285171122176\t915530...,0


In [121]:
gossipcop.shape

(22140, 5)

In [122]:
gossipcop.label.value_counts()

0    16817
1     5323
Name: label, dtype: int64

In [120]:
gos_train, gos_test = process_dataset_and_save_pkl(gossipcop, text="title", label="label", 
                                 group="FakeNewsNet", subgroup="gossipcop", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

## Dataset category 3.1: EMNLP_2017: newsfiles


In [150]:
path = "/disk2/sadat/FakeNewsData/FakeNewsNet/"

In [151]:
newsfiles_full = pd.read_csv("/disk2/sadat/FakeNewsData/newsfiles/pubreleasednewsfiles/full.csv", names=["label", "text"], header=None)

In [152]:
newsfiles_full.shape

(38859, 2)

In [153]:
newsfiles_full.head()

Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [154]:
newsfiles_full.label.value_counts()

3    17870
1    14047
2     6942
Name: label, dtype: int64

Newsfile has the following labels:


1 -> Satire

2 -> Hoax

3 -> Propaganda


Since, the satire is entirely created for entiretainment, we will regard it as Non-deceptive. For the hoax and Propaganda, these are strongly deceptive content. However, this time, we will keep the original labels as well for references.

In [155]:
def newsfile_label_conversion(label):
    if label==1:
        return 0 #satire is non-deceptive
    else:
        return 1

In [156]:
newsfiles_full["is_deception"] = newsfiles_full.label.apply(lambda x:newsfile_label_conversion(x))

In [157]:
newsfiles_full.is_deception.value_counts()

1    24812
0    14047
Name: is_deception, dtype: int64

In [158]:
nf_train, nf_test = process_dataset_and_save_pkl(newsfiles_full, text="text", label="is_deception", 
                                 group="EMNLP2017", subgroup="Newsfiles", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

## Dataset category 3.2: EMNLP_2017: politifact


In [40]:
path = "/media/sadat/SadatExtHDD/FakeNewsData/politifact_data/politifact_data"

In [162]:
poli_EMNLP_train = pd.read_csv("/disk2/sadat/FakeNewsData/politifact_data/politifact_data/train.csv", delimiter='\t')
poli_EMNLP_dev = pd.read_csv("/disk2/sadat/FakeNewsData/politifact_data/politifact_data/dev.csv", delimiter='\t')

In [163]:
print("poli_EMNLP_train shape is " + str(poli_EMNLP_train.shape))
print("poli_EMNLP_dev shape is " + str(poli_EMNLP_dev.shape))

poli_EMNLP_train shape is (2576, 3)
poli_EMNLP_dev shape is (712, 3)


In [164]:
poli_EMNLP_dev.head()

Unnamed: 0,Speaker,Statement,Rating
0,Peggy Noonan,(President) Reagan's second inaugural was move...,1
1,Florida Consumer Action Network,"For what we spend in just one week in Iraq, 80...",1
2,Roger Williams,The non-partisan Congressional Budget Office c...,3
3,American Civil Liberties Union of Florida,"13,000 Floridians are able to vote -- but thei...",4
4,Mike Dovilla,Mike Dovillas big ideas like expanding the Ohi...,2


0 -> True

1 -> Mostly True

2 -> Half True

3 -> Mostly False

4 -> False

5 -> Pants on fire False


There can be debates of which one of them is deceptive and which one are not. However, we go by simple measurements--> If the content is atleast HALF TRUE, it is not deceptive. 

In [166]:
# We will encode them with either True or False

poli_EMNLP_train["label"] = poli_EMNLP_train["Rating"].apply(lambda x:0 if x<=2 else 1)
poli_EMNLP_dev["label"] = poli_EMNLP_dev["Rating"].apply(lambda x:0 if x<=2 else 1)

In [167]:
poli_train, poli_test = rename_and_save_pkl(poli_EMNLP_train, poli_EMNLP_dev, text="Statement", label="label", 
                                 group="EMNLP2017", subgroup="Politifact", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/")

## Dataset category 04: LIAR


In [183]:
Liar_train = pd.read_csv("/disk2/sadat/FakeNewsData/liar_dataset/train.tsv",
                         sep="\t",
                         header=None,
                         names=["ID", "label", "statement", "subject", "speaker", "speaker's job title", "state",
                               "party_affiliation", "crdt_his_barely_true", "crdt_his_false", "crdt_his_half_true",
                               "crdt_his_mostly_true", "pants_on_fire_count", "ctx"])

In [184]:
Liar_valid = pd.read_csv("/disk2/sadat/FakeNewsData/liar_dataset/valid.tsv",
                         sep="\t",
                         header=None,
                         names=["ID", "label", "statement", "subject", "speaker", "speaker's job title", "state",
                               "party_affiliation", "crdt_his_barely_true", "crdt_his_false", "crdt_his_half_true",
                               "crdt_his_mostly_true", "pants_on_fire_count", "ctx"])

In [193]:
Liar_test = pd.read_csv("/disk2/sadat/FakeNewsData/liar_dataset/test.tsv",
                         sep="\t",
                         header=None,
                         names=["ID", "label", "statement", "subject", "speaker", "speaker's job title", "state",
                               "party_affiliation", "crdt_his_barely_true", "crdt_his_false", "crdt_his_half_true",
                               "crdt_his_mostly_true", "pants_on_fire_count", "ctx"])

As we have a seperate test set, we will concatenate train and validation.

In [185]:
Liar = pd.concat([Liar_train, Liar_valid], axis=0)

In [186]:
Liar.reset_index(drop=True, inplace=True)

In [187]:
Liar.shape

(11524, 14)

In [188]:
Liar.head(2)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job title,state,party_affiliation,crdt_his_barely_true,crdt_his_false,crdt_his_half_true,crdt_his_mostly_true,pants_on_fire_count,ctx
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.


In [189]:
Liar.label.value_counts()

half-true      2362
false          2258
mostly-true    2213
barely-true    1891
true           1845
pants-fire      955
Name: label, dtype: int64

In [190]:
def encode_label_to_numeric(label):
    if label=="true" or label=="half-true" or label=="mostly-true":
        return 0
    elif label=="false" or label=="barely-true" or label=="pants-fire":
        return 1
    else:
        return -1

In [191]:
Liar["is_deception"] = Liar.label.apply(lambda x:encode_label_to_numeric(x))

In [194]:
Liar_test["is_deception"] = Liar.label.apply(lambda x:encode_label_to_numeric(x))

In [192]:
Liar.is_deception.value_counts()

0    6420
1    5104
Name: is_deception, dtype: int64

In [195]:
Liar_test.is_deception.value_counts()

0    716
1    551
Name: is_deception, dtype: int64

In [254]:
Liar_train, Liar_test = rename_and_save_pkl(Liar, Liar_test, text="statement", label="is_deception", 
                                 group="LIAR", subgroup="None", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/")

In [255]:
Liar_train.head()

Unnamed: 0,ID,label,text,subject,speaker,speaker's job title,state,party_affiliation,crdt_his_barely_true,crdt_his_false,crdt_his_half_true,crdt_his_mostly_true,pants_on_fire_count,ctx,is_deception,group,subgroup
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,1,LIAR,
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,0,LIAR,
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,0,LIAR,
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,1,LIAR,
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,0,LIAR,


## Dataset category 05: Hyperpartisan News


In [198]:
path = "/disk2/sadat/FakeNewsData/Hyperpartisan_news_2019_semeval/data.csv"
Hyp = pd.read_csv(path)

In [199]:
Hyp.head()

Unnamed: 0.1,Unnamed: 0,date,title,internal,external,article_text,hyperpartisan,labeled-by,url
0,0,2017-09-10,Kucinich: Reclaiming the money power,4,['https://farm8.static.flickr.com/7020/6551534...,from flickr.com: money {mid numberplaceholder}...,True,article,https://www.opednews.com/articles/Kucinich-Rec...
1,1,2017-10-12,Trump Just Woke Up & Viciously Attacked Puerto...,0,['http://www.cnn.com/2017/03/16/politics/trump...,donald trump ran on many braggadocios and larg...,True,article,http://bipartisanreport.com/2017/10/12/trump-j...
2,2,2017-10-11,"Liberals wailing about gun control, but what a...",0,[],photo by justin images in response to joyce ne...,True,article,https://www.reviewjournal.com/opinion/letters/...
3,3,2017-09-24,Laremy Tunsil joins NFL players in kneeling du...,0,['https://twitter.com/UncleChaps/status/911927...,after colin kaepernick rightly chose to kneel ...,True,article,https://www.redcuprebellion.com/2017/9/24/1635...
4,4,2017-10-12,It's 1968 All Over Again,0,['http://www.nationalreview.com/redirect/amazo...,"almost a half century ago, in numberplaceholde...",False,article,https://www.realclearpolitics.com/articles/201...


In [200]:
Hyp.shape

(645, 9)

In [201]:
## Encoding: True means, it is a hyperpartisan text, which means, it is deceptive. False otherwise
Hyp['label'] = Hyp['hyperpartisan'].apply(lambda x:1 if x==True else 0)

As per our previous analysis, we decide to concatenate the title and full text for our analysis

In [208]:
Hyp["text"] = Hyp["title"] + " " + Hyp["article_text"]

In [210]:
Hyp.label.value_counts()

0    407
1    238
Name: label, dtype: int64

In [211]:
Hyp_train, Hyp_test = process_dataset_and_save_pkl(Hyp, text="text", label="label", 
                                 group="PAN_Hyperpartisan_News", subgroup="Trained_by_article", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

## Dataset category 06: PHEME  


In [227]:
rumours = pd.read_pickle("/disk2/sadat/FakeNewsData/Metadata/RUMOURS.pickle")
rumours["label"] = 1
nonrumours = pd.read_pickle("/disk2/sadat/FakeNewsData/Metadata/NON_RUMOURS.pickle")
nonrumours["label"] = 0

In [240]:
PHEME = pd.concat([rumours, nonrumours], 0)
PHEME = PHEME.sample(frac=1, random_state=100).reset_index(drop=True)

In [241]:
PHEME.shape

(6425, 28)

In [242]:
PHEME.head()

Unnamed: 0,contributors,truncated,text,in_reply_to_status_id,id,favorite_count,source,retweeted,coordinates,entities,...,lang,created_at,in_reply_to_status_id_str,place,Event,possibly_sensitive,extended_entities,filter_level,metadata,label
0,,False,"After the awful attack on #CharlieHebdo, Franc...",,553490241445965824,116,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,,"{'user_mentions': [], 'symbols': [], 'trends':...",...,en,2015-01-09 09:55:31+00:00,,,charliehebdo-all-rnr-threads,False,,low,,0
1,,False,BREAKING: @ottawapolice confirm one death on P...,,524949073154301952,53,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,,"{'symbols': [], 'user_mentions': [{'id': 23937...",...,en,2014-10-22 15:43:06+00:00,,,ottawashooting-all-rnr-threads,,,,,1
2,,False,Converts are DISTURBED!!\nCo-Pilot of #GermanW...,,581295858898989056,11,"<a href=""http://twitter.com/download/android"" ...",False,,"{'user_mentions': [], 'symbols': [], 'trends':...",...,en,2015-03-27 03:25:07+00:00,,,germanwings-crash-all-rnr-threads,False,,low,,1
3,,False,"Charlie Hebdo, the magazine that dared to poke...",,552815443959091200,29,"<a href=""https://about.twitter.com/products/tw...",False,,"{'symbols': [], 'user_mentions': [], 'hashtags...",...,en,2015-01-07 13:14:07+00:00,,,charliehebdo-all-rnr-threads,False,{'media': [{'expanded_url': 'http://twitter.co...,,,0
4,,False,White smoke has been spotted above the Kremlin...,,576504635738951680,155,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",False,,"{'symbols': [], 'user_mentions': [], 'hashtags...",...,en,2015-03-13 22:06:30+00:00,,,putinmissing-all-rnr-threads,False,{'media': [{'expanded_url': 'http://twitter.co...,,,0


In [243]:
PHEME.label.value_counts()

0    4023
1    2402
Name: label, dtype: int64

In [250]:
PHEME_train, PHEME_test = process_dataset_and_save_pkl(PHEME, text="text", label="label", 
                                 group="PHEME", subgroup="None", 
                                 saved_data_folder="/disk2/sadat/FakeNewsData/Holistic_v2/", random_state=100)

In [251]:
PHEME_train.shape

(5782, 30)

In [252]:
PHEME_test.shape

(643, 30)