In [2]:
import pandas as pd

In [3]:
# Ez a kódrészlet a LIAR adathalmazt dolgozza fel, amely hamis hírek osztályozására szolgál.
# Az adatok három különálló TSV fájlból kerülnek beolvasásra: teszt, validációs és training adatok.
# Ezeket a DataFrame-eket egyesítjük egyetlen táblázattá.

liar_df_test = pd.read_csv('data/liar_test.tsv',sep = '\t', header = None)
liar_df_valid = pd.read_csv('data/liar_valid.tsv',sep = '\t', header = None)
liar_df_train = pd.read_csv('data/liar_train.tsv',sep = '\t', header = None)

# Concatenate the two dataframes
liar_df = pd.concat([liar_df_test, liar_df_train, liar_df_valid], ignore_index=True)

rename_mapping = {
    liar_df.columns[0]: 'title',
    liar_df.columns[2]: 'text',
    liar_df.columns[1]: 'label'
}

liar_df = liar_df.rename(columns = rename_mapping)[['title', 'text', 'label']]
liar_df['text'].isna().sum()

0

In [4]:
liar_df = liar_df[["text","label"]]

liar_df = liar_df.loc[liar_df['label'].isin(['true','false'])]
liar_df

Unnamed: 0,text,label
0,Building a wall on the U.S.-Mexico border will...,true
1,Wisconsin is on pace to double the number of l...,false
2,Says John McCain has done nothing to help the ...,false
5,Over the past five years the federal governmen...,true
6,Says that Tennessee law requires that schools ...,true
...,...,...
12776,We've excluded lobbyists from policymaking jobs.,false
12778,Says George Flinn has been a no show at schedu...,false
12782,"I supported (the surge), I argued for it. I'm ...",false
12788,"John McCain and George Bush have ""absolutely n...",true


In [5]:
#WELFake dataset feldolgozása

welfake_df = pd.read_csv('data/WELFake_Dataset.csv')

welfake_df = welfake_df[['text', 'label']]

welfake_df['text'].isna().sum()



welfake_df

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...
72129,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [None]:
#BS detector dataset feldolgozása
#Itt csak a 'junksci','bs','fake' sorokat használjuk 'fake'-ként

bs_detector_df = pd.read_csv('data/bs_detector.csv')

rename_mapping = {
    'type': 'label'
}

bs_detector_df = bs_detector_df.rename(columns = rename_mapping)[['text', 'label']]

bs_detector_df['text'].isna().sum()

bs_detector_df

In [24]:
bs_detector_df = bs_detector_df.loc[bs_detector_df['label'].isin(['junksci','bs','fake'])]

bs_detector_df

Unnamed: 0,text,label
57,18 SHARE The Amish in America have committed t...,fake
58,64 SHARE President Obama has signed an Executi...,fake
59,United States Marine Field McConnell Plum Cit...,bs
60,"So ,you have Rothschild banksters and British ...",bs
61,Here is the problem . The USA constitution sta...,bs
...,...,...
12994,It DOES allow you to put a dog face on top of ...,bs
12995,Wait till you see what happens to the valuatio...,bs
12996,I'm waiting for the one that puts a pussy on m...,bs
12997,$4 Billion even after they are known to be kee...,bs


In [28]:
concat_df = pd.concat([liar_df,welfake_df, bs_detector_df], ignore_index = True)

concat_df = concat_df.drop_duplicates().reset_index()[['text', 'label']]

concat_df['text'].isna().sum()

np.int64(1)

In [29]:
label_mapping = {
    'true': '1',
    'false': '0',
    'fake': '0',
    'bs': '0',
    'hate': '0',
    'junksci': '0',
    1: '1',
    0: '0'
}

concat_df['label'] = concat_df['label'].map(label_mapping)

In [30]:
concat_df

Unnamed: 0,text,label
0,Building a wall on the U.S.-Mexico border will...,1
1,Wisconsin is on pace to double the number of l...,0
2,Says John McCain has done nothing to help the ...,0
3,Over the past five years the federal governmen...,1
4,Says that Tennessee law requires that schools ...,1
...,...,...
78410,It DOES allow you to put a dog face on top of ...,0
78411,Wait till you see what happens to the valuatio...,0
78412,I'm waiting for the one that puts a pussy on m...,0
78413,$4 Billion even after they are known to be kee...,0


In [31]:

import re

def clean_text(text):
    text = str(text)
    text = text.lower()  # Lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

concat_df['cleaned_text'] =concat_df['text'].apply(clean_text)

concat_df

Unnamed: 0,text,label,cleaned_text
0,Building a wall on the U.S.-Mexico border will...,1,building a wall on the u s mexico border will ...
1,Wisconsin is on pace to double the number of l...,0,wisconsin is on pace to double the number of l...
2,Says John McCain has done nothing to help the ...,0,says john mccain has done nothing to help the ...
3,Over the past five years the federal governmen...,1,over the past five years the federal governmen...
4,Says that Tennessee law requires that schools ...,1,says that tennessee law requires that schools ...
...,...,...,...
78410,It DOES allow you to put a dog face on top of ...,0,it does allow you to put a dog face on top of ...
78411,Wait till you see what happens to the valuatio...,0,wait till you see what happens to the valuatio...
78412,I'm waiting for the one that puts a pussy on m...,0,i m waiting for the one that puts a pussy on m...
78413,$4 Billion even after they are known to be kee...,0,4 billion even after they are known to be keep...


In [32]:
concat_df[['text','label']].to_csv('prepared_data/cleaned_fake_news_dataset.csv', index=False)