# Input Text Clean-up

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
%cp -av "/content/drive/MyDrive/NLP Project/Dataset.7z" /content/

'/content/drive/MyDrive/NLP Project/Dataset.7z' -> '/content/Dataset.7z'


In [35]:
!pip install pyunpack
!pip install patool
#%mkdir snapdata
from pyunpack import Archive
Archive('/content/Dataset.7z').extractall('/content/')



In [36]:
import pandas as pd
import os
import numpy as np
#import plotly.express as px
import random
import nltk
from nltk.corpus import stopwords

SEED = 54
random.seed(SEED)

In [37]:
import nltk
from nltk.corpus import stopwords        
from nltk.stem import PorterStemmer      
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('punkt')      # Punkt Sentence Tokenizer (https://www.kite.com/python/docs/nltk.punkt)
nltk.download('stopwords')  # Stopwords (https://gist.github.com/sebleier/554280) 
nltk.download('wordnet')    # Wordnet NLTk (https://www.nltk.org/howto/wordnet.html)
!nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
/bin/bash: -c: line 0: syntax error near unexpected token `'omw-1.4''
/bin/bash: -c: line 0: `nltk.download('omw-1.4')'


In [38]:
!pip install fuzzywuzzy
import re

from fuzzywuzzy import process as fuzzy_proc
master_entities=[]
master_entities_text=[]
#code to load all standard entities for fuzzy matching & correction
with open('/content/MasterEntities.csv', 'r') as rf:
  for i in rf:
    master_entities.append(i.strip())
#master_entities=master_entities[1:] #skipped header

#code to load all standard entities for fuzzy matching & correction
with open('/content/MasterEntities_Text.csv', 'r') as rf1:
  for i in rf1:
    master_entities_text.append(i.strip())


def entity_correction(row, tgt_type):
  corrected_entity=[]
  for i in row[tgt_type]:
    match_score=fuzzy_proc.extractOne(i, master_entities)
    if match_score[1]>90:
      corrected_entity.append(match_score[0])
    else:
      corrected_entity.append(i)
  return corrected_entity

def entity_clean(entity_list):
  corrected_entity_list=[]
  for text in entity_list:
    text = re.sub('[^a-zA-Z0-9 ]', ' ', text)
    text = str.lower(text)
    #print(text)
    #lz = WordNetLemmatizer()
    #text.lemmatize(text) 
    corrected_entity_list.append(text)
  return corrected_entity_list

def text_correction(text_list):
  corrected_text_list=[]
  for i in text_list:
    match_score=fuzzy_proc.extractOne(i, master_entities_text)
    if match_score[1]>90:
      corrected_text_list.append(match_score[0])
    else:
      corrected_text_list.append(i)
  return corrected_text_list



In [39]:
import pandas as pd
from collections import OrderedDict
import re
import copy

# Prepare target variable
def clean_text(text):
    #text = text.replace('\r', ' ').replace('\n', ' ')
    text = re.sub('[^a-zA-Z0-9 ]', ' ', text)
    text = str.lower(text)
    text=word_tokenize(text)
    #print(text)
    #stop_words = set(stopwords.words('english'))
    #text = [t for t in text if not t in stop_words]
    #print(text)
    lz = WordNetLemmatizer()
    text = [lz.lemmatize(t) for t in text]
    before_list = copy.deepcopy(text)
    text = text_correction(text)
    after_list = copy.deepcopy(text)
    diff_set = set(after_list)-set(before_list)

    if len(diff_set)>0:
      print("Diff",diff_set)
      #print("Before",before_list)
      #print("After",after_list)

    text = ' '.join(text)
    #print(text)
    text = text.title()

    #print(text)
    return text

# Prepare target variable
def prepare_target(classval, classtype):
    target = ""
    if len(classval) > 0:
        target = f"[B{classtype}] " + " [SEP] ".join(classval) + f" [E{classtype}]"

    return target

def prepare_entiry(classval,classtype):
    target = ""
    if len(classval) > 0:
      if (classtype=='H'):
        target = ['B-HERO']*len(classval)
        #target = [0] * len(classval)
      if (classtype=='VL'):
        target = ['B-VIL']*len(classval)
        #target = [1] * len(classval)
      if (classtype=='VC'):
        target = ['B-VIC']*len(classval)
        #target = [2] * len(classval)
      if (classtype=='O'):
        target = ['B-OTH']*len(classval)
        #target = [3] * len(classval)
    else:
      target=[]

    return target

def mergeclasses(class_data: list):
    #random.shuffle(class_data)
    target = ""
    for _class in class_data:
        target+=_class + " "
    
    return target.strip()

def uniques(xs):
    return list(OrderedDict().fromkeys(xi for x in xs for xi in x))

def get_dataset(filename):
    covid_path = "Dataset/covid/annotations"
    politics_path = "Dataset/politics/annotations"

    covid_anno = pd.read_json(f"./{covid_path}/{filename}.jsonl", lines = True).reset_index(drop=True)
    politics_anno = pd.read_json(f"./{politics_path}/{filename}.jsonl", lines = True).reset_index(drop=True)

    covid_entities = covid_anno.apply(lambda x: x.hero + x.villain + x.victim + x.other, axis=1)
    politics_entities = politics_anno.apply(lambda x: x.hero + x.villain + x.victim + x.other, axis=1)


    final_ds = pd.concat([covid_anno, politics_anno], axis =0, ignore_index=True)

    #final_ds = final_ds[0:100]

    #final_ds['OCR_cleaned'] = (final_ds['OCR'].pipe(hero.clean))
    #final_ds['OCR_cleaned'] = final_ds['OCR']
    final_ds['OCR_cleaned'] = final_ds.apply(lambda x: clean_text(x.OCR), axis=1)
    
    final_ds["_Hero_Entity"] = final_ds.apply(lambda x: prepare_entiry(x.hero,"H"), axis=1)
    final_ds["_Villain_Entity"] = final_ds.apply(lambda x: prepare_entiry(x.villain,"VL"), axis=1) 
    final_ds["_Victim_Entity"] = final_ds.apply(lambda x: prepare_entiry(x.victim,"VC"), axis=1)
    final_ds["_Other_Entity"] = final_ds.apply(lambda x: prepare_entiry(x.other,"O"), axis=1)

    final_ds['villain_c']=final_ds.apply(lambda row: entity_clean(row.villain), axis=1)
    final_ds['victim_c']=final_ds.apply(lambda row: entity_clean(row.victim), axis=1)
    final_ds['hero_c']=final_ds.apply(lambda row: entity_clean(row.hero), axis=1)
    final_ds['other_c']=final_ds.apply(lambda row: entity_clean(row.other), axis=1)

    final_ds['villain_c1']=final_ds.apply(lambda row: entity_correction(row, 'villain_c'), axis=1)
    final_ds['victim_c1']=final_ds.apply(lambda row: entity_correction(row, 'victim_c'), axis=1)
    final_ds['hero_c1']=final_ds.apply(lambda row: entity_correction(row, 'hero_c'), axis=1)
    final_ds['other_c1']=final_ds.apply(lambda row: entity_correction(row, 'other_c'), axis=1)

    #--df['E']=(df.B+df.C).map(set).map(list)
    final_ds["Target"] = (final_ds.hero+final_ds.villain+final_ds.victim+final_ds.other).map(list)
    final_ds["Target_c"] = (final_ds.hero_c1+final_ds.villain_c1+final_ds.victim_c1+final_ds.other_c1).map(list)
    #print(set(final_ds.Target_c) - set(final_ds.Target))

    final_ds["Target_Entity"] = (final_ds._Hero_Entity+final_ds._Villain_Entity+final_ds._Victim_Entity+final_ds._Other_Entity).map(list)
    
    #--final_ds["Target_Entity"] = final_ds[['_Hero_Entity', '_Villain_Entity', '_Victim_Entity', '_Other_Entity']].apply(uniques, axis=1)
    #--final_ds["Target"] = final_ds[['hero', 'villain', 'victim', 'other']].apply(uniques, axis=1)
    #--final_ds["Target_Entity"] = final_ds[['_Hero_Entity', '_Villain_Entity', '_Victim_Entity', '_Other_Entity']].apply(uniques, axis=1)
    '''
    #final_ds["Target"] = final_ds.apply(lambda x: mergeclasses([x["hero"],x["villain"],x["victim"],x["other"]]), axis = 1)
    #final_ds["Target_Entity"] = final_ds.apply(lambda x: mergeclasses([x["_Hero_Entity"],x["_Villain_Entity"],x["_Victim_Entity"],x["_Other_Entity"]]), axis = 1)
    
    # = final_ds.apply(lambda x: prepare_target(x.hero, x.villain, x.victim, x.other), axis=1)
    final_ds["_Hero"] = final_ds.apply(lambda x: prepare_target(x.hero, "H"), axis=1)
    final_ds["_Villain"] = final_ds.apply(lambda x: prepare_target(x.villain, "VIL"), axis=1)
    final_ds["_Victim"] = final_ds.apply(lambda x: prepare_target(x.victim, "VIC"), axis=1)
    final_ds["_Other"] = final_ds.apply(lambda x: prepare_target(x.other, "OTH"), axis=1)
    final_ds["Target"] = final_ds.apply(lambda x: mergeclasses([x["hero"],x["villain"],x["victim"],x["other"]]), axis = 1)
    final_ds["All_Entities"] = final_ds.apply(lambda x: x.hero + x.villain + x.victim + x.other, axis=1)
    #final_ds["Target"] = final_ds["Target"].apply(lambda x: x.strip())
    '''
    # Remove column name 'A'
    final_ds = final_ds.drop(['image','hero','villain','victim','other','_Hero_Entity','_Villain_Entity','_Victim_Entity','_Other_Entity'], axis = 1)
    return final_ds

In [40]:
text = "hello ?>< how are u..i am memes fantastic?"
clean_text(text)

'Hello How Are U I Am Meme Fantastic'

In [41]:
!pwd

/content


In [42]:
train = get_dataset("train")
valid = get_dataset("val")

Diff {'sanders'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'vladimir'}
Diff {'american'}
Diff {'sanders'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'coronavirus'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'trump'}
Diff {'coronavirus'}
Diff {'virus'}
Diff {'barack', 'sanders'}
Diff {'coronavirus'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'grand'}
Diff {'american'}
Diff {'american'}
Diff {'covid'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'coronavirus'}
Diff {'vaccine'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'coronavirus'}
Diff {'covid'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'american'}
Diff {'vaccine'}
Diff {'american'}
Diff {'virus'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'trump'}
Diff {'coronavirus'}
Diff {'coronavirus'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'a



Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'coronavirus'}
Diff {'coronavirus'}
Diff {'american'}
Diff {'harris'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'trump'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'libertarian'}
Diff {'sanders'}
Diff {'republican', 'harris'}
Diff {'republican'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'sanders'}
Diff {'american'}
Diff {'american'}
Diff {'barack'}
Diff {'american'}
Diff {'republican'}
Diff {'american'}
Diff {'american'}
Diff {'american'}
Diff {'american'}




In [43]:
train
train[train.apply(lambda row: all(i in row.Target_c for i in row.Target), axis=1)==False]

Unnamed: 0,OCR,OCR_cleaned,villain_c,victim_c,hero_c,other_c,villain_c1,victim_c1,hero_c1,other_c1,Target,Target_c,Target_Entity
36,This is Hob. He voted for Trump. This is Bob's...,This Is Hob He Voted For Trump This Is Bob S F...,[],[],[],"[donald trump, bill clinton, hilary clinton, h...",[],[],[],"[donald trump, bill clinton, hillary clinton, ...","[donald trump, bill clinton, hilary clinton, h...","[donald trump, bill clinton, hillary clinton, ...","[B-OTH, B-OTH, B-OTH, B-OTH]"
40,"re Photos\nIF YOU LOVE AMERICA,\nYOU'RE GENERA...",Re Photo If You Love American You Re Generally...,[liberal],[],[conservative],"[charlie kirk, america, usa]",[liberal],[],[conservative],"[charlie kirk, american, usa]","[conservative, liberal, charlie kirk, america,...","[conservative, liberal, charlie kirk, american...","[B-HERO, B-VIL, B-OTH, B-OTH, B-OTH]"
43,JUST HAD THE COVID-19\nVACCINE\nFEELING GREAT!...,Just Had The Covid 19 Vaccine Feeling Great Ma...,[covid19 vaccine],[],[],"[covid 19 vaccine, covid vaccine]",[covid vaccine],[],[],"[covid vaccine, covid vaccine]","[covid19 vaccine, covid 19 vaccine, covid vacc...","[covid vaccine, covid vaccine, covid vaccine]","[B-VIL, B-OTH, B-OTH]"
46,Me when I get my covid vaccine\nMemeZila.com\n,Me When I Get My Covid Vaccine Memezila Com,[],[],[],"[covid19 vaccine, covid vaccine]",[],[],[],"[covid vaccine, covid vaccine]","[covid19 vaccine, covid vaccine]","[covid vaccine, covid vaccine]","[B-OTH, B-OTH]"
47,HECHEVS\nTHE WHITE HC\nWASHINGTON\nNIK NEWS SP...,Hechevs The White Hc Washington Nik News Speci...,[],[],[],"[donald trump, dr anthony fauci, white house,...",[],[],[],"[donald trump, dr anthony fauci, white house,...","[donald trump, dr. anthony fauci, white house,...","[donald trump, dr anthony fauci, white house,...","[B-OTH, B-OTH, B-OTH, B-OTH]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5522,LET US BIULD THE ENTERPRISE\nWEAREALL\nOFUCKED\n,Let U Biuld The Enterprise Weareall Ofucked,[],[],[],"[kim jong un, barack obama, kim jong un]",[],[],[],"[kim jong un, barack obama, kim jong un]","[kim jong un, barack obama, kim jong-un]","[kim jong un, barack obama, kim jong un]","[B-OTH, B-OTH, B-OTH]"
5526,HITLER BLAMED JEWSFOR\nTHE WOES OF\nGERMANY\nR...,Hitler Blamed Jewsfor The Woe Of Germany Remem...,"[donald trump, adolf hitler]","[jews, mexicans]",[],"[germany, america, mexican]","[donald trump, adolf hitler]","[jews, mexicans]",[],"[germany, american, mexican]","[donald trump, adolf hitler, jews, mexicans, g...","[donald trump, adolf hitler, jews, mexicans, g...","[B-VIL, B-VIL, B-VIC, B-VIC, B-OTH, B-OTH, B-OTH]"
5538,"Because of me, the Republican party\nlost to a...",Because Of Me The Republican Party Lost To A B...,[],[black people],[],"[americans, george bush, george walker bush, r...",[],[black people],[],"[american, george bush, george walker bush, re...","[black people, americans, george bush, george ...","[black people, american, george bush, george w...","[B-VIC, B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]"
5545,J.B. PRITZER'S NET\nWORTH: $3.5 BILLION\nTHERE...,J B Pritzer S Net Worth 3 5 Billion There S No...,[j b pritzer],[],[],"[socialist, j b pritzer, libertarians]",[j b pritzer],[],[],"[socialist, j b pritzer, libertarian]","[j.b. pritzer, socialist, j. b. pritzer, liber...","[j b pritzer, socialist, j b pritzer, liber...","[B-VIL, B-OTH, B-OTH, B-OTH]"


In [44]:
train = train.drop(['OCR','villain_c','victim_c','hero_c','other_c','villain_c1','victim_c1','hero_c1','other_c1','Target'], axis = 1)
valid = valid.drop(['OCR','villain_c','victim_c','hero_c','other_c','villain_c1','victim_c1','hero_c1','other_c1','Target'], axis = 1)
train.head()

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity
0,Bernie Or Elizabeth Be Informed Compare Them O...,"[bernie sanders, elizabeth warren]","[B-OTH, B-OTH]"
1,Extending The Brexit Deadline Until October 31...,[uk government],[B-VIL]
2,Kwai Gkwa 0964 Nnevvy Applause To Thai From Ho...,"[thais, hong kong]","[B-HERO, B-OTH]"
3,So I Order This Foce Mask To Protect Ogainst F...,"[china, face mask, made in china, coronavirus]","[B-VIL, B-OTH, B-OTH, B-OTH]"
4,Best Candidate For Ja 2020 Joe Biden Kamala Ha...,"[joe biden, bernie sanders, kamala harris, tik...","[B-HERO, B-OTH, B-OTH, B-OTH]"


In [45]:
valid

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity
0,Herman Caino Othehermancain Mask Will Not Be M...,"[herman cain, donald trump, covid, coronavirus...","[B-VIL, B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]"
1,Let S Go To China To Save Them From I M The Ma...,"[batman, china, coronavirus]","[B-OTH, B-OTH, B-OTH]"
2,Who Would Win Thanos One Hantavirus Boy Sorry ...,"[hantavirus, thanos, hanta virus, thanos vs ha...","[B-OTH, B-OTH, B-OTH, B-OTH]"
3,Dotors Working 24X7 Relentlessly To Save Life ...,"[goicho saib, goan aunties, goan uncles, docto...","[B-HERO, B-VIL, B-VIL, B-VIC, B-OTH, B-OTH, B-..."
4,Occupy Danocrats A At 12 Dreakng Barack Cbaria...,"[joe biden, barack obama]","[B-HERO, B-OTH]"
...,...,...,...
645,Totaly Agree The Republican Party Is Not A Pol...,"[republican party, richard belzer, political p...","[B-VIL, B-OTH, B-OTH]"
646,Thank You Lord Esus President Trump Never Unde...,"[donald trump, people, jesus, george carlin, t...","[B-VIL, B-VIL, B-OTH, B-OTH, B-OTH]"
647,Republican Democrat Always Blame Democrat Alwa...,"[republican, democrat, political puppets, repu...","[B-VIL, B-VIL, B-VIL, B-OTH, B-OTH]"
648,Ta Banana Rep Srun By An Os,"[donald trump, banana, orange]","[B-OTH, B-OTH, B-OTH]"


In [46]:
train.to_csv('/content/train_dataframe.csv', index = False, header=True)
valid.to_csv('/content/valid_dataframe.csv', index = False, header=True)

# BERT

In [47]:
!pip install transformers



In [48]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first')


In [49]:
example = "Bernie Sanders Or Elizabeth Warren Be Informed Compare Them On The Issues That Matter Issue Who Makes The CoronaVirus China HANTAVIRUS People World Virus NewYear Donald Trumph Hate Barak Obama"
example=str.lower(example)
print(example)

ner_results = nlp(example)
ner_results
#for x in ner_results:
#  print(x['entity'],x['word'])

bernie sanders or elizabeth warren be informed compare them on the issues that matter issue who makes the coronavirus china hantavirus people world virus newyear donald trumph hate barak obama


[]

In [50]:
train.head()
train.shape

(5552, 3)

In [51]:
#train = train[0:10]
#string = string.replace('\r', '').replace('\n', '')

train["BERT_Target"] = ""
train["BERT_Target_Entity"] = ""

for i in range(train.shape[0]):
    Bert_Entities = []
    Bert_Entity_Names = []

    text=str(train['OCR_cleaned'][i])
    if len(text)>0:
      out = nlp(text)
      if len(out)>0:
        for x in out:
          Bert_Entities.append(x['entity_group'])
          Bert_Entity_Names.append(x['word'])
      train["BERT_Target"][i] = Bert_Entities
      train["BERT_Target_Entity"][i] = Bert_Entity_Names

train

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity
0,Bernie Or Elizabeth Be Informed Compare Them O...,"[bernie sanders, elizabeth warren]","[B-OTH, B-OTH]","[PER, PER, MISC]","[Bernie, Elizabeth, Dankest]"
1,Extending The Brexit Deadline Until October 31...,[uk government],[B-VIL],[MISC],[Brexit]
2,Kwai Gkwa 0964 Nnevvy Applause To Thai From Ho...,"[thais, hong kong]","[B-HERO, B-OTH]","[ORG, MISC, LOC, MISC]","[Kwai Gkwa, Thai, Hong Kong, Thai]"
3,So I Order This Foce Mask To Protect Ogainst F...,"[china, face mask, made in china, coronavirus]","[B-VIL, B-OTH, B-OTH, B-OTH]","[PER, ORG, LOC]","[Ogainst, Corond, China]"
4,Best Candidate For Ja 2020 Joe Biden Kamala Ha...,"[joe biden, bernie sanders, kamala harris, tik...","[B-HERO, B-OTH, B-OTH, B-OTH]","[LOC, PER, PER, PER, PER]","[Ja, Joe Biden, Kamala Harris, Bernie Sanders,..."
...,...,...,...,...,...
5547,Trump Could Shoot Someone On The Senate Floor ...,"[donald trump, senate floor, republican]","[B-VIL, B-OTH, B-OTH]","[PER, ORG, LOC, MISC]","[Trump, Senate, Floor, Republican]"
5548,Many People Ask Me Why All My School Record Ar...,"[school, university, joe biden]","[B-OTH, B-OTH, B-OTH]",[],[]
5549,My Be Friend My Mother Consclence My Therapist...,"[msnbc, bernie sanders, democratic party, joe ...","[B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]","[PER, MISC]","[Ciny B, Democratic]"
5550,The N Word Pas Signed And Approved By Beak Obana,[barack obama],[B-OTH],"[ORG, PER]","[Word, Beak Obana]"


In [53]:
x = train
x

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity
0,Bernie Or Elizabeth Be Informed Compare Them O...,"[bernie sanders, elizabeth warren]","[B-OTH, B-OTH]","[PER, PER, MISC]","[Bernie, Elizabeth, Dankest]"
1,Extending The Brexit Deadline Until October 31...,[uk government],[B-VIL],[MISC],[Brexit]
2,Kwai Gkwa 0964 Nnevvy Applause To Thai From Ho...,"[thais, hong kong]","[B-HERO, B-OTH]","[ORG, MISC, LOC, MISC]","[Kwai Gkwa, Thai, Hong Kong, Thai]"
3,So I Order This Foce Mask To Protect Ogainst F...,"[china, face mask, made in china, coronavirus]","[B-VIL, B-OTH, B-OTH, B-OTH]","[PER, ORG, LOC]","[Ogainst, Corond, China]"
4,Best Candidate For Ja 2020 Joe Biden Kamala Ha...,"[joe biden, bernie sanders, kamala harris, tik...","[B-HERO, B-OTH, B-OTH, B-OTH]","[LOC, PER, PER, PER, PER]","[Ja, Joe Biden, Kamala Harris, Bernie Sanders,..."
...,...,...,...,...,...
5547,Trump Could Shoot Someone On The Senate Floor ...,"[donald trump, senate floor, republican]","[B-VIL, B-OTH, B-OTH]","[PER, ORG, LOC, MISC]","[Trump, Senate, Floor, Republican]"
5548,Many People Ask Me Why All My School Record Ar...,"[school, university, joe biden]","[B-OTH, B-OTH, B-OTH]",[],[]
5549,My Be Friend My Mother Consclence My Therapist...,"[msnbc, bernie sanders, democratic party, joe ...","[B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]","[PER, MISC]","[Ciny B, Democratic]"
5550,The N Word Pas Signed And Approved By Beak Obana,[barack obama],[B-OTH],"[ORG, PER]","[Word, Beak Obana]"


In [54]:
x.to_csv('/content/bert_dataframe_first_final.csv', index = False, header=True)

In [55]:
#!pip install fuzzywuzzy
from fuzzywuzzy import fuzz


x["in_count"] = 0
x["match_count"] = 0

total_row_matches = 0
for i in range(0,x.shape[0]):
  a = x["Target_c"][i]
  b = x["BERT_Target_Entity"][i]
  in_count = len(a)
  x["in_count"][i] = in_count
  match_count = 0
  #print(a,b)
  for c in a:
    for d in b:
      match_score=fuzz.ratio(str.lower(c).strip(), str.lower(d).strip())
      if match_score >=50:
        #print(c,d)
        match_count+=1
        break
    x["match_count"][i] = match_count
  if match_count>0:
    total_row_matches+=1
  
print(total_row_matches)

'''
    match_score=fuzzy_proc.extractOne(i, master_entities)
    if match_score[1]>90:
      corrected_entity.append(match_score[0])
    else:
      corrected_entity.append(i)
  return corrected_entity
'''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


3744


'\n    match_score=fuzzy_proc.extractOne(i, master_entities)\n    if match_score[1]>90:\n      corrected_entity.append(match_score[0])\n    else:\n      corrected_entity.append(i)\n  return corrected_entity\n'

In [56]:
x.head()
total = x["in_count"].sum()
matched = x["match_count"].sum()
print("%match",float(matched/total))

%match 0.48041566746602715


In [None]:
#BERT different aggregation strategy results
#'first' = 48%
#'average' = 43%
#'max' = 43%
#'simple' = 44%

In [57]:
x.head()

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity,in_count,match_count
0,Bernie Or Elizabeth Be Informed Compare Them O...,"[bernie sanders, elizabeth warren]","[B-OTH, B-OTH]","[PER, PER, MISC]","[Bernie, Elizabeth, Dankest]",2,2
1,Extending The Brexit Deadline Until October 31...,[uk government],[B-VIL],[MISC],[Brexit],1,0
2,Kwai Gkwa 0964 Nnevvy Applause To Thai From Ho...,"[thais, hong kong]","[B-HERO, B-OTH]","[ORG, MISC, LOC, MISC]","[Kwai Gkwa, Thai, Hong Kong, Thai]",2,2
3,So I Order This Foce Mask To Protect Ogainst F...,"[china, face mask, made in china, coronavirus]","[B-VIL, B-OTH, B-OTH, B-OTH]","[PER, ORG, LOC]","[Ogainst, Corond, China]",4,3
4,Best Candidate For Ja 2020 Joe Biden Kamala Ha...,"[joe biden, bernie sanders, kamala harris, tik...","[B-HERO, B-OTH, B-OTH, B-OTH]","[LOC, PER, PER, PER, PER]","[Ja, Joe Biden, Kamala Harris, Bernie Sanders,...",4,4


In [58]:
def entity_clean(text):
  text = re.sub('[^a-zA-Z0-9 ]', ' ', text)
  text = str.lower(text).title()
  #text = str.lower(text)
  return text

In [60]:
file1 = open('myfile.txt', 'w')

import re
#df = df[0:10]

x["Org_Target"] = ""
x["Org_Target_Entity"] = ""

for i in range(x.shape[0]):
    Bert_Entities = []
    Bert_Entity_Names = []
    text = str(x["Target_c"][i])
    text = entity_clean(text)
    #print(text)
    if len(text.strip())>0:
      out = nlp(text)
      if len(out)>0:
        for z in out:
          Bert_Entities.append(z['entity_group'])
          Bert_Entity_Names.append(z['word'])
          #print(text,Bert_Entities,Bert_Entity_Names)
      x["Org_Target"][i] = Bert_Entities
      x["Org_Target_Entity"][i] = Bert_Entity_Names
      t = str(text+','+str(Bert_Entities)+','+str(Bert_Entity_Names)+'\n')
      file1.write(t)
file1.close()

x.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity,in_count,match_count,Org_Target,Org_Target_Entity
0,Bernie Or Elizabeth Be Informed Compare Them O...,"[bernie sanders, elizabeth warren]","[B-OTH, B-OTH]","[PER, PER, MISC]","[Bernie, Elizabeth, Dankest]",2,2,"[PER, PER]","[Bernie Sanders, Elizabeth Warren]"
1,Extending The Brexit Deadline Until October 31...,[uk government],[B-VIL],[MISC],[Brexit],1,0,[ORG],[Uk Government]
2,Kwai Gkwa 0964 Nnevvy Applause To Thai From Ho...,"[thais, hong kong]","[B-HERO, B-OTH]","[ORG, MISC, LOC, MISC]","[Kwai Gkwa, Thai, Hong Kong, Thai]",2,2,"[MISC, LOC]","[Thais, Hong Kong]"
3,So I Order This Foce Mask To Protect Ogainst F...,"[china, face mask, made in china, coronavirus]","[B-VIL, B-OTH, B-OTH, B-OTH]","[PER, ORG, LOC]","[Ogainst, Corond, China]",4,3,"[LOC, MISC, LOC]","[China, Face Mask, China]"
4,Best Candidate For Ja 2020 Joe Biden Kamala Ha...,"[joe biden, bernie sanders, kamala harris, tik...","[B-HERO, B-OTH, B-OTH, B-OTH]","[LOC, PER, PER, PER, PER]","[Ja, Joe Biden, Kamala Harris, Bernie Sanders,...",4,4,"[PER, PER, PER, PER]","[Joe Biden, Bernie Sanders, Kamala Harris, Tik..."


In [69]:
valid.head()
#valid.shape

Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity,in_count,match_count,Org_Target,Org_Target_Entity
0,Herman Caino Othehermancain Mask Will Not Be M...,"[herman cain, donald trump, covid, coronavirus...","[B-VIL, B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]","[PER, ORG, PER]","[Herman Caino, Othehermancain, Trump]",6,0,"[PER, PER, MISC, MISC]","[Herman Cain, Donald Trump, Covid Coronavirus ..."
1,Let S Go To China To Save Them From I M The Ma...,"[batman, china, coronavirus]","[B-OTH, B-OTH, B-OTH]","[ORG, LOC, LOC, ORG]","[Let S, China, China, Coronav]",3,0,[MISC],[Batman China Coronavirus]
2,Who Would Win Thanos One Hantavirus Boy Sorry ...,"[hantavirus, thanos, hanta virus, thanos vs ha...","[B-OTH, B-OTH, B-OTH, B-OTH]","[ORG, MISC]","[Thanos, Hantavirus]",4,1,"[MISC, MISC, MISC, MISC]","[Hantavirus, Thanos Hanta Virus, Thanos, Hanta..."
3,Dotors Working 24X7 Relentlessly To Save Life ...,"[goicho saib, goan aunties, goan uncles, docto...","[B-HERO, B-VIL, B-VIL, B-VIC, B-OTH, B-OTH, B-...","[PER, MISC, MISC, PER, PER, PER, ORG]","[Dotors, Coronavirus, Goan, Goa, Goicho, Saib ...",9,1,"[PER, MISC, MISC, MISC, PER, MISC, MISC, MISC]","[Goicho, Saib, Goan, Goan, Goencho Saib, Goan,..."
4,Occupy Danocrats A At 12 Dreakng Barack Cbaria...,"[joe biden, barack obama]","[B-HERO, B-OTH]","[ORG, PER, PER, PER, ORG, PER, PER, PER, LOC, ...","[Occupy Danocrats, Barack Cbaria Andoruas, Jos...",2,1,"[PER, PER]","[Joe Biden, Barack Obama]"


In [71]:
#train = train[0:10]
#string = string.replace('\r', '').replace('\n', '')

valid["BERT_Target"] = ""
valid["BERT_Target_Entity"] = ""

for i in range(valid.shape[0]):
    Bert_Entities = []
    Bert_Entity_Names = []

    text=str(valid['OCR_cleaned'][i])
    if len(text)>0:
      out = nlp(text)
      if len(out)>0:
        for x in out:
          Bert_Entities.append(x['entity_group'])
          Bert_Entity_Names.append(x['word'])
      valid["BERT_Target"][i] = Bert_Entities
      valid["BERT_Target_Entity"][i] = Bert_Entity_Names

valid

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity,in_count,match_count,Org_Target,Org_Target_Entity
0,Herman Caino Othehermancain Mask Will Not Be M...,"[herman cain, donald trump, covid, coronavirus...","[B-VIL, B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]","[PER, ORG, PER]","[Herman Caino, Othehermancain, Trump]",6,0,"[PER, PER, MISC, MISC]","[Herman Cain, Donald Trump, Covid Coronavirus ..."
1,Let S Go To China To Save Them From I M The Ma...,"[batman, china, coronavirus]","[B-OTH, B-OTH, B-OTH]","[ORG, LOC, LOC, ORG]","[Let S, China, China, Coronav]",3,0,[MISC],[Batman China Coronavirus]
2,Who Would Win Thanos One Hantavirus Boy Sorry ...,"[hantavirus, thanos, hanta virus, thanos vs ha...","[B-OTH, B-OTH, B-OTH, B-OTH]","[ORG, MISC]","[Thanos, Hantavirus]",4,1,"[MISC, MISC, MISC, MISC]","[Hantavirus, Thanos Hanta Virus, Thanos, Hanta..."
3,Dotors Working 24X7 Relentlessly To Save Life ...,"[goicho saib, goan aunties, goan uncles, docto...","[B-HERO, B-VIL, B-VIL, B-VIC, B-OTH, B-OTH, B-...","[PER, MISC, MISC, PER, PER, PER, ORG]","[Dotors, Coronavirus, Goan, Goa, Goicho, Saib ...",9,1,"[PER, MISC, MISC, MISC, PER, MISC, MISC, MISC]","[Goicho, Saib, Goan, Goan, Goencho Saib, Goan,..."
4,Occupy Danocrats A At 12 Dreakng Barack Cbaria...,"[joe biden, barack obama]","[B-HERO, B-OTH]","[ORG, PER, PER, PER, ORG, PER, PER, PER, LOC, ...","[Occupy Danocrats, Barack Cbaria Andoruas, Jos...",2,1,"[PER, PER]","[Joe Biden, Barack Obama]"
...,...,...,...,...,...,...,...,...,...
645,Totaly Agree The Republican Party Is Not A Pol...,"[republican party, richard belzer, political p...","[B-VIL, B-OTH, B-OTH]","[ORG, PER]","[Republican Party, Richard Belzer]",3,0,"[ORG, ORG]","[Republican Party, Richard Belzer Political Pa..."
646,Thank You Lord Esus President Trump Never Unde...,"[donald trump, people, jesus, george carlin, t...","[B-VIL, B-VIL, B-OTH, B-OTH, B-OTH]","[PER, PER, PER, MISC]","[Esus, Trump, George Carlin, Democrat]",5,0,"[PER, PER, PER]","[Donald Trump, Jesus George Carlin, Trump]"
647,Republican Democrat Always Blame Democrat Alwa...,"[republican, democrat, political puppets, repu...","[B-VIL, B-VIL, B-VIL, B-OTH, B-OTH]","[MISC, MISC, MISC, MISC]","[Republican, Democrat, Democrat, Republican]",5,0,"[MISC, ORG, MISC]","[Republican Democrat, Political Puppets, Repub..."
648,Ta Banana Rep Srun By An Os,"[donald trump, banana, orange]","[B-OTH, B-OTH, B-OTH]","[MISC, PER]","[Ta Banana Rep Srun, An Os]",3,1,"[PER, MISC]","[Donald Trump, Orange]"


In [72]:
x = valid
x.to_csv('/content/bert_dataframe_first_final_valid.csv', index = False, header=True)

In [73]:
#!pip install fuzzywuzzy
from fuzzywuzzy import fuzz


x["in_count"] = 0
x["match_count"] = 0

total_row_matches = 0
for i in range(0,x.shape[0]):
  a = x["Target_c"][i]
  b = x["BERT_Target_Entity"][i]
  in_count = len(a)
  x["in_count"][i] = in_count
  match_count = 0
  #print(a,b)
  for c in a:
    for d in b:
      match_score=fuzz.ratio(str.lower(c).strip(), str.lower(d).strip())
      if match_score >=50:
        #print(c,d)
        match_count+=1
        break
    x["match_count"][i] = match_count
  if match_count>0:
    total_row_matches+=1
  
print(total_row_matches)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


449


In [74]:
x.head()
total = x["in_count"].sum()
matched = x["match_count"].sum()
print("%match",float(matched/total))

%match 0.49202513291445144


In [75]:
def entity_clean(text):
  text = re.sub('[^a-zA-Z0-9 ]', ' ', text)
  text = str.lower(text).title()
  #text = str.lower(text)
  return text

In [76]:
file1 = open('myfile_valid.txt', 'w')

import re
#df = df[0:10]

x["Org_Target"] = ""
x["Org_Target_Entity"] = ""

for i in range(x.shape[0]):
    Bert_Entities = []
    Bert_Entity_Names = []
    text = str(x["Target_c"][i])
    text = entity_clean(text)
    #print(text)
    if len(text.strip())>0:
      out = nlp(text)
      if len(out)>0:
        for z in out:
          Bert_Entities.append(z['entity_group'])
          Bert_Entity_Names.append(z['word'])
          #print(text,Bert_Entities,Bert_Entity_Names)
      x["Org_Target"][i] = Bert_Entities
      x["Org_Target_Entity"][i] = Bert_Entity_Names
      t = str(text+','+str(Bert_Entities)+','+str(Bert_Entity_Names)+'\n')
      file1.write(t)
file1.close()

x.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,OCR_cleaned,Target_c,Target_Entity,BERT_Target,BERT_Target_Entity,in_count,match_count,Org_Target,Org_Target_Entity
0,Herman Caino Othehermancain Mask Will Not Be M...,"[herman cain, donald trump, covid, coronavirus...","[B-VIL, B-OTH, B-OTH, B-OTH, B-OTH, B-OTH]","[PER, ORG, PER]","[Herman Caino, Othehermancain, Trump]",6,3,"[PER, PER, MISC, MISC]","[Herman Cain, Donald Trump, Covid Coronavirus ..."
1,Let S Go To China To Save Them From I M The Ma...,"[batman, china, coronavirus]","[B-OTH, B-OTH, B-OTH]","[ORG, LOC, LOC, ORG]","[Let S, China, China, Coronav]",3,2,[MISC],[Batman China Coronavirus]
2,Who Would Win Thanos One Hantavirus Boy Sorry ...,"[hantavirus, thanos, hanta virus, thanos vs ha...","[B-OTH, B-OTH, B-OTH, B-OTH]","[ORG, MISC]","[Thanos, Hantavirus]",4,4,"[MISC, MISC, MISC, MISC]","[Hantavirus, Thanos Hanta Virus, Thanos, Hanta..."
3,Dotors Working 24X7 Relentlessly To Save Life ...,"[goicho saib, goan aunties, goan uncles, docto...","[B-HERO, B-VIL, B-VIL, B-VIC, B-OTH, B-OTH, B-...","[PER, MISC, MISC, PER, PER, PER, ORG]","[Dotors, Coronavirus, Goan, Goa, Goicho, Saib ...",9,8,"[PER, MISC, MISC, MISC, PER, MISC, MISC, MISC]","[Goicho, Saib, Goan, Goan, Goencho Saib, Goan,..."
4,Occupy Danocrats A At 12 Dreakng Barack Cbaria...,"[joe biden, barack obama]","[B-HERO, B-OTH]","[ORG, PER, PER, PER, ORG, PER, PER, PER, LOC, ...","[Occupy Danocrats, Barack Cbaria Andoruas, Jos...",2,2,"[PER, PER]","[Joe Biden, Barack Obama]"
