In [1]:
from fastai.torch_basics import *
from fastai.data.all import *

In [2]:
import spacy,html
from spacy.symbols import ORTH

In [38]:
import emoji

In [3]:
# Special marker tokens, unpacked in order from one space-separated literal.
(UNK, PAD, BOS, EOS, FLD,
 TK_REP, TK_WREP, TK_UP, TK_MAJ) = "xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj".split()

In [4]:
_re_spec = re.compile(r'([/#\\])')

def spec_add_spaces(t):
    "Pad every `/`, `#` and backslash in `t` with one space on each side"
    def _pad(m):
        return f' {m.group(1)} '
    return _re_spec.sub(_pad, t)

In [5]:
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Collapse every run of two or more spaces in `t` into a single space"
    collapsed = re.sub(_re_space, ' ', t)
    return collapsed

In [6]:
_re_rep = re.compile(r'(\S)(\1{2,})')

def replace_rep(t):
    "Collapse a non-space character repeated 3+ times in a row: cccc -- TK_REP 4 c"
    def _collapse(m):
        # group 1 is the first occurrence, group 2 all the following copies
        char, extra = m.group(1), m.group(2)
        count = len(extra) + 1
        return f' {TK_REP} {count} {char} '
    return _re_rep.sub(_collapse, t)

In [7]:
# Matches a word repeated three or more times in a row: group 1 is the word,
# group 2 the middle repetitions (each with its trailing whitespace), and
# group 3 whatever follows the last repetition (whitespace, a non-word char, or end).
_re_wrep = re.compile(r'(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)')

In [8]:
def replace_wrep(t):
    "Collapse a word repeated 3+ times in a row: word word word word -- TK_WREP 4 word"
    def _collapse(m):
        word, middle, tail = m.group(1, 2, 3)
        # the first and last occurrences sit outside `middle`, hence the +2
        total = len(middle.split()) + 2
        return f' {TK_WREP} {total} {word} {tail}'
    return _re_wrep.sub(_collapse, t)

In [9]:
def fix_html(x):
    "Clean up various messy escape leftovers seen in documents, then unescape HTML entities"
    # Applied strictly in this order; each rule sees the output of the previous ones.
    substitutions = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', UNK), (' @.@ ', '.'), (' @-@ ', '-'), ('...', ' …'),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    return html.unescape(x)

In [10]:
_re_all_caps = re.compile(r'(\s|^)([A-Z]+[^a-z\s]*)(?=(\s|$))')
def replace_all_caps(t):
    "Lowercase tokens that start uppercase and contain no lowercase letter, prefixing `TK_UP` when longer than one char."
    def _lower_caps(m):
        # group 1 is the leading whitespace (or empty at start), group 2 the token
        lead, word = m.group(1), m.group(2)
        marker = ''
        if len(word) > 1:
            marker = f'{TK_UP} '
        return f"{lead}{marker}{word.lower()}"
    return _re_all_caps.sub(_lower_caps, t)

In [11]:
_re_maj = re.compile(r'(\s|^)([A-Z][^A-Z\s]*)(?=(\s|$))')
def replace_maj(t):
    "Lowercase tokens whose only uppercase letter is the first one, prefixing `TK_MAJ` when longer than one char."
    def _lower_maj(m):
        # group 1 is the leading whitespace (or empty at start), group 2 the token
        lead, word = m.group(1), m.group(2)
        marker = ''
        if len(word) > 1:
            marker = f'{TK_MAJ} '
        return f"{lead}{marker}{word.lower()}"
    return _re_maj.sub(_lower_maj, t)

In [12]:
def lowercase(t, add_bos=True, add_eos=False):
    "Lowercase and strip `t`, optionally prefixing `BOS` and/or suffixing `EOS`"
    result = t.lower().strip()
    if add_bos:
        result = f'{BOS} ' + result
    if add_eos:
        result = result + f' {EOS}'
    return result

In [13]:
def replace_space(t):
    "Swap each space inside a token for the `▁` character so the text can be split/joined safely"
    return '▁'.join(t.split(' '))

In [39]:
def preProcessString(t):
    """Turn one raw tweet into the marker-annotated, lowercased text.

    `t` is coerced with `str` first and the coerced value is what the rules
    receive, so a non-string cell value is processed as its string form instead
    of raising inside the first regex rule. The rules run in the order listed
    below; emoji are then rewritten by `emoji.demojize`.

    Returns the processed string, which starts with `BOS` because `lowercase`
    is called with its defaults.
    """
    text = str(t)
    # NOTE(review): `fix_html` runs after `spec_add_spaces`, so patterns it looks
    # for that contain `#`, `/` or a backslash (e.g. '#39;', '<br />') have already
    # been split apart by then — confirm this order is intended.
    rules = (
        spec_add_spaces,
        rm_useless_spaces,
        replace_rep,
        replace_wrep,
        fix_html,
        replace_all_caps,
        replace_maj,
        lowercase,
    )
    for rule in rules:
        text = rule(text)
    return emoji.demojize(text)

In [40]:
# Load the raw (not yet preprocessed) training tweets from the working directory.
df = pd.read_csv("./TrainWithoutPreProcess.csv")

In [41]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today.,real
1,2,States reported 1121 deaths a small rise from last Tuesday. Southern states reported 640 of those deaths. https://t.co/YASGRTT4ux,real
2,3,Politically Correct Woman (Almost) Uses Pandemic as Excuse Not to Reuse Plastic Bag https://t.co/thF8GuNFPe #coronavirus #nashville,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testing laboratories in India and as on 25th August 2020 36827520 tests have been done : @ProfBhargava DG @ICMRDELHI #StaySafe #IndiaWillWin https://t.co/Yh3ZxknnhZ,real
4,5,Populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than California or Texas: AL AR ID KS KY LA MS NV and SC. https://t.co/1pYW6cWRaS,real


In [42]:
# Encode the string classes as integers, then normalise every tweet.
# Labels missing from the mapping come out of `.map` as NaN.
encode_label = dict(real=0, fake=1)
df['label'] = df['label'].map(encode_label)
df['tweet'] = df['tweet'].apply(preProcessString)

In [43]:
# Preview the training frame again, now with processed tweets and integer labels.
df.head()

Unnamed: 0,id,tweet,label
0,1,xxbos xxmaj the xxup cdc currently reports 99031 deaths. xxmaj in general the discrepancies in death counts between different sources are small and explicable. xxmaj the death toll stands at roughly 1 xxrep 5 0 people today.,0
1,2,xxbos xxmaj states reported 1121 deaths a small rise from last xxmaj tuesday. xxmaj southern states reported 640 of those deaths. https: / / t.co / yasgrtt4ux,0
2,3,xxbos xxmaj politically xxmaj correct xxmaj woman (almost) xxmaj uses xxmaj pandemic as xxmaj excuse xxmaj not to xxmaj reuse xxmaj plastic xxmaj bag https: / / t.co / thf8gunfpe # coronavirus # nashville,1
3,4,xxbos # indiafightscorona: xxmaj we have 1524 # xxup covid testing laboratories in xxmaj india and as on 25th xxmaj august 2020 36827520 tests have been done : @profbhargava xxup dg @icmrdelhi # staysafe # indiawillwin https: / / t.co / yh3zxknnhz,0
4,5,xxbos xxmaj populous states can generate large case counts but if you look at the new cases per million today 9 smaller states are showing more cases per million than xxmaj california or xxmaj texas: xxup al xxup ar xxup id xxup ks xxup ky xxup la xxup ms xxup nv and xxup sc. https: / / t.co / 1pyw6cwras,0


In [44]:
# Persist the processed training set. The row index is written as well (pandas' default).
# NOTE(review): the frame already carries an "Unnamed: 0" column from the input file,
# so the saved CSV gains one more unnamed index column — confirm downstream readers expect that.
df.to_csv('./finalTrainFilePreProcessed.csv', index=True)

In [45]:
# Load the raw (not yet preprocessed) validation tweets from the working directory.
dfVal = pd.read_csv('./validationWithoutPreProcess.csv')

In [46]:
# Preview the first rows of the raw validation frame.
dfVal.head()

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising that no muslim was affected by #Coronavirus #COVD19 in the country,fake
1,2,11 out of 13 people (from the Diamond Princess Cruise ship) who had intially tested negative in tests in Japan were later confirmed to be positive in the United States.,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus And Can Be Treated With Aspirin",fake
3,4,Mike Pence in RNC speech praises Donald Trump’s COVID-19 “seamless” partnership with governors and leaves out the president's state feuds: https://t.co/qJ6hSewtgB #RNC2020 https://t.co/OFoeRZDfyY,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #COVID19 data and government announcement. Get more on the #coronavirus data here👇 https://t.co/jvGZlSbFjH https://t.co/PygSKXesBg,real


In [47]:
# Encode the string classes as integers, then normalise every validation tweet.
# Labels missing from the mapping come out of `.map` as NaN.
encode_label = dict(real=0, fake=1)
dfVal['label'] = dfVal['label'].map(encode_label)
dfVal['tweet'] = dfVal['tweet'].apply(preProcessString)

In [48]:
# Preview the validation frame again, now with processed tweets and integer labels.
dfVal.head()

Unnamed: 0,id,tweet,label
0,1,xxbos xxmaj chinese converting to xxmaj islam after realising that no muslim was affected by # xxmaj coronavirus # xxup covd19 in the country,1
1,2,xxbos 11 out of 13 people (from the xxmaj diamond xxmaj princess xxmaj cruise ship) who had intially tested negative in tests in xxmaj japan were later confirmed to be positive in the xxmaj united xxmaj states.,1
2,3,"xxbos xxup covid-19 xxmaj is xxmaj caused xxmaj by a xxmaj bacterium, xxmaj not xxmaj virus xxmaj and xxmaj can xxmaj be xxmaj treated xxmaj with xxmaj aspirin",1
3,4,xxbos xxmaj mike xxmaj pence in xxup rnc speech praises xxmaj donald xxmaj trump’s xxup covid-19 “seamless” partnership with governors and leaves out the president's state feuds: https: / / t.co / qj6hsewtgb # xxup rnc2020 https: / / t.co / ofoerzdfyy,1
4,5,xxbos 6 / 10 xxmaj sky's @edconwaysky explains the latest # xxup covid19 data and government announcement. xxmaj get more on the # coronavirus data here:backhand_index_pointing_down: https: / / t.co / jvgzlsbfjh https: / / t.co / pygskxesbg,0


In [49]:
# Persist the processed validation set. The row index is written as well (pandas' default).
# NOTE(review): the frame already carries an "Unnamed: 0" column from the input file,
# so the saved CSV gains one more unnamed index column — confirm downstream readers expect that.
dfVal.to_csv('./finalValidationFilePreProcessed.csv', index=True)