In [415]:
import pandas as pd
import numpy as np
import re 

In [416]:
# Load the extracted entities from disk and preview the first rows.
# Presumably one row per entity found on a page (page_index / entity_index /
# entity_group / word) — confirm against the upstream extraction step.
df = pd.read_csv("df.csv")
df.head()

Unnamed: 0,page_index,entity_index,entity_group,word
0,0,0.0,STATE,immigration status
1,0,1.0,PERSON,others
2,0,2.0,FORM,uk visas and immigration
3,0,3.0,LOCATION,uk
4,0,4.0,FORM,document


In [420]:
len(df)

151996

In [418]:
# List the `n` most frequent entity strings, most common first.
n = 20
word_frequencies = df["word"].value_counts()
word_frequencies.head(n).index.tolist()

['uk',
 'people',
 'users',
 'employer',
 'child',
 'person',
 'government',
 'hmrc',
 'england',
 'employee',
 'someone',
 'children',
 'company',
 'employees',
 'business',
 'partner',
 'country',
 'employers',
 'national insurance',
 'court']

In [419]:
# Keep only rows whose `word` occurs MORE than 5 times in the whole frame,
# i.e. words seen 5 times or fewer are dropped (the previous comment said
# "less than 5", which did not match the `> 5` test actually applied).
# On the run recorded in this notebook this takes 151,996 rows down to 129,051;
# the original note quoted "about 30,000" rows for the economic-recovery data.
#
# Counting once with `value_counts` and looking each row up with `map` is a
# vectorised hash lookup, so this no longer needs the 1-2 minutes that the
# earlier `DataFrame.replace`-with-a-huge-mapping approach took.
# Rows whose `word` is missing get no count (NaN) and are therefore dropped,
# exactly as before.
word_counts = df["word"].value_counts()
is_frequent_word = df["word"].map(word_counts).gt(5)
df_reduced = df[is_frequent_word]

In [None]:
df_reduced

In [421]:
len(df_reduced)

129051

In [422]:
# List every entity group left after the frequency filter, most common first.
entity_group_counts = df_reduced["entity_group"].value_counts()
entity_group_counts.index.tolist()

['PERSON',
 'ORGANIZATION',
 'LOCATION',
 'FINANCE',
 'DATE',
 'FORM',
 'EVENT',
 'CONTACT',
 'STATE',
 'MISC']

In [423]:
# DATE entities are dropped because they are often independent of the year,
# so they could link articles that are not actually related (an unwanted bias).
# Original note: this removes roughly another 8,000 rows.
is_date = df_reduced["entity_group"].eq("DATE")
date_row_labels = is_date[is_date].index
df_reduced_dates = df_reduced.drop(index=date_row_labels)

In [None]:
df_reduced_dates

In [424]:
# FORM entities are dropped because they are often poorly formatted.
# Original note: this removes roughly another 4,000 rows.
is_form = df_reduced_dates["entity_group"].eq("FORM")
form_row_labels = is_form[is_form].index
df_reduced_dates_forms = df_reduced_dates.drop(index=form_row_labels)

In [None]:
df_reduced_dates_forms

In [425]:
# FINANCE entities are dropped for the same reason as DATE entities.
# Original note: this removes roughly another 20,000 rows.
is_finance = df_reduced_dates_forms["entity_group"].eq("FINANCE")
finance_row_labels = is_finance[is_finance].index
df_reduced_dff = df_reduced_dates_forms.drop(index=finance_row_labels)

In [None]:
df_reduced_dff

In [426]:
# Build `word_cleaned`: lower-case every word, then strip whatever the junk
# pattern matches.
# NOTE(review): in Python's `re`, `{}` is not a valid quantifier and is matched
# literally, so this pattern only fires on "<any char>{" followed by one or more
# "}" and one character from the bracket set — it looks like it rarely (if ever)
# matches. The bracket set also contains a mis-encoded character sequence.
# Confirm the intended pattern before relying on this step.
lowercased_words = df_reduced_dff["word"].str.lower()
df_reduced_dff["word_cleaned"] = lowercased_words.map(
    lambda text: re.sub(r".{}+[‚Äö√§√¥(?/)]", "", text)
)

In [427]:
# Report how many distinct cleaned words remain.
distinct_cleaned_words = df_reduced_dff["word_cleaned"].unique()
print(f"Number of unique words: {len(distinct_cleaned_words)}")

Number of unique words: 1650


In [428]:
# Confirm that DATE, FORM and FINANCE no longer appear among the entity groups.
remaining_group_counts = df_reduced_dff["entity_group"].value_counts()
remaining_group_counts.index.tolist()

['PERSON', 'ORGANIZATION', 'LOCATION', 'EVENT', 'CONTACT', 'STATE', 'MISC']

In [429]:
# Distinct values of the raw `word` column (not `word_cleaned`), in order of
# first appearance.
words = df_reduced_dff["word"].unique()

In [None]:
# Save the unique words to words.csv for manual inspection; the row index is
# not written, and the single unnamed column gets the default header.
pd.DataFrame(words).to_csv("words.csv", index=False)

In [430]:
# Read the saved word list back in and preview it.
df_words = pd.read_csv("words.csv")
df_words.head()

Unnamed: 0,0
0,immigration status
1,others
2,uk
3,employer
4,landlord


In [431]:
# Give the single column a meaningful name in place of the default header.
df_words.columns = ["words"]

In [432]:
# Two types of junk words get through:
#   1. Words with an unwanted "." at the end (or other text run together with
#      them) --> these are replaced with the correct form in a later cell.
#   2. Words that are total junk and should just be removed, e.g. Äö√
#      --> these are listed in `remove` below.

# Exact strings to discard. `isin` is an exact, case-sensitive match, so each
# entry must equal the stored word character for character.
remove = ["18", "16", "#NAME?", ".", "2018", "2015", "-", "citizen ‚Äô s advice bureau", "30", "2", "/", "post -", "non -", "2019", "2014pdf", "1", "4", "2022", "2021", ",", "2012", "3", "10", "‚Äö√Ñ√¥", "www", "partneryour child", "full -" ,"volunteersnational association"]

# Print the number of unique words before and after removing the junk entries.
print(len(df_words))
df_words_removed = df_words[~df_words.words.isin(remove)]
print(len(df_words_removed))

1650
1623


In [433]:
# Map junk spellings found in `word_cleaned` onto their canonical form:
# trailing full stops, words run together with the following text, and
# mis-encoded apostrophes. Values that are not keys of the mapping are left
# untouched, and replacements are not chained (each original value is looked
# up once).
# NOTE(review): `word_cleaned` is lower-cased when it is created, so keys that
# contain uppercase characters (the mis-encoded apostrophe sequences) can only
# match if the stored text really contains those exact characters — confirm
# these entries take effect.
word_replacements = {
    "northern ireland.": "northern ireland",
    "northern irelandthere": "northern ireland",
    "northern irelandthe": "northern ireland",
    "organisation.": "organisation",
    "country.": "country",
    "customers.": "customers",
    "community actionwebsitenational council for voluntary organisations": "national council for voluntary organisations",
    "uk.": "uk",
    "workforce.": "workforce",
    "area.": "area",
    "employeesyou": "employees",
    "armed forces.": "armed forces",
    "employer.": "employer",
    "employers.": "employers",
    "employment tribunal.": "employment tribunal",
    "council.": "council",
    "charity.": "charity",
    "ireland.": "ireland",
    "govuk": "gov. uk",
    "staff.": "staff",
    "switzerland.": "switzerland",
    "england.": "england",
    "travel.": "travel",
    "people.": "people",
    "children.": "children",
    "police.": "police",
    "families.": "families",
    "home.": "home",
    "scotland.": "scotland",
    "government.": "government",
    "prison.": "prison",
    "child.": "child",
    "local council.": "local council",
    "court.": "court",
    ". gov. uk": "gov. uk",
    "company.": "company",
    "spain.": "spain",
    "countryyou": "country",
    "government digital service": "gds",
    "provider.": "provider",
    "areas.": "areas",
    "information commissioner ‚Äö√Ñ√¥": "information commissioner",
    "child maintenance servicethe child maintenance service": "child maintenance service",
    "child maintenance serviceto": "child maintenance service",
    "wales.": "wales",
    "magistrates ‚Äö√Ñ√¥ court": "magistrates court",
    "visa application centre.": "visa application centre",
    "eu.": "eu",
    "gov.": "gov",
    "agency.": "agency",
    "local authority.": "local authority",
    "companies house.": "companies house",
    "public.": "public",
    "school.": "school",
    "overseas.": "overseas",
    "businesses.": "businesses",
    "household.": "household",
    "magistrates ‚Äö√Ñ√¥ courts": "magistrates courts",
    "citizen ‚Äö√Ñ√¥ s advice bureau": "citizens advice bureau",
    "thefinancial ombudsman service": "financial ombudsman service",
    "population.": "population",
    "tenants.": "tenants",
    "ofqual /": "ofqual",
    "nhs.": "nhs",
    "local authorities.": "local authorities",
    "partneryou": "partner",
    "northern irelandyou": "northern ireland",
    "thehome office": "home office",
}

# Assign the result back to the column rather than calling
# `.replace(..., inplace=True)` on `df_reduced_dff["word_cleaned"]`: an inplace
# method on a column selection is chained assignment, which raises a warning on
# recent pandas and silently leaves the DataFrame unchanged under Copy-on-Write.
df_reduced_dff["word_cleaned"] = df_reduced_dff["word_cleaned"].replace(word_replacements)

In [None]:
df_reduced_dff

In [434]:
# Discard every row whose cleaned word is one of the junk strings in `remove`,
# printing the row count before and after.
# NOTE(review): `word_cleaned` is lower-cased, so entries of `remove` that
# contain uppercase characters (e.g. "#NAME?") cannot match here — confirm
# whether the list should be normalised the same way.
is_junk_word = df_reduced_dff["word_cleaned"].isin(remove)
print(len(df_reduced_dff))
df_words_clean1 = df_reduced_dff[~is_junk_word]
print(len(df_words_clean1))

102753
102547


In [435]:
# Re-count the distinct cleaned words now that junk rows are gone.
words_clean_2 = df_words_clean1["word_cleaned"].unique()
words_clean_2.size

1563

In [None]:
# Save the cleaned unique words to words_clean_2.csv without the row index.
pd.DataFrame(words_clean_2).to_csv("words_clean_2.csv", index=False)

In [437]:
# Reload the cleaned word list; this rebinds `df_words`, replacing the frame
# read from words.csv earlier.
df_words = pd.read_csv("words_clean_2.csv")
df_words

In [438]:
# Name the single column "words" so it can be selected by name below.
df_words.columns = ["words"]

In [None]:
df_words_clean1

In [439]:
# Lookup check with one word: page_index of every row whose cleaned word is
# exactly "immigration status".
mentions_immigration_status = df_words_clean1["word_cleaned"].eq("immigration status")
page_indexes_test = df_words_clean1.loc[mentions_immigration_status, "page_index"]

In [None]:
page_indexes_test

In [440]:
# Lookup check with two words: page_index of every row whose cleaned word is
# either of the listed entities.
lookup_words = ["immigration status", "northern ireland"]
matches_lookup_words = df_words_clean1["word_cleaned"].isin(lookup_words)
page_indexes_test_2 = df_words_clean1.loc[matches_lookup_words, "page_index"]

In [None]:
page_indexes_test_2

In [None]:
df_words

In [441]:
# Plain Python list of the cleaned unique words.
common_words = df_words.words.to_list()

In [None]:
common_words

In [442]:
# page_index of every row whose cleaned word appears in `common_words`.
is_common_word = df_words_clean1["word_cleaned"].isin(common_words)
page_indexes_common = df_words_clean1.loc[is_common_word, "page_index"]

In [None]:
page_indexes_common

In [444]:
# Display the page_index of rows whose raw `word` is in `common_words`.
# NOTE(review): this matches on `word`, whereas `common_words` is built from
# `word_cleaned` — confirm which column was intended.
raw_word_is_common = df_words_clean1["word"].isin(common_words)
df_words_clean1.loc[raw_word_is_common, "page_index"]

In [None]:
df_words_clean1

In [445]:
from sklearn.preprocessing import OneHotEncoder

In [446]:
# Twenty most frequent cleaned words, highest count first.
cleaned_word_counts = df_words_clean1["word_cleaned"].value_counts()
cleaned_word_counts.sort_values(ascending=False).head(20)


uk            6133
people        2752
users         2213
employer      2186
child         1994
person        1952
government    1760
hmrc          1597
england       1557
employee      1485
someone       1484
children      1442
company       1248
employees     1212
partner       1090
country       1083
business      1046
employers     1037
staff          914
court          885
Name: word_cleaned, dtype: int64

In [448]:
# One-hot encode the cleaned words: one indicator column per distinct word,
# one row per row of `df_words_clean1` (the row labels are carried over).
# NOTE(review): the column prefix is "page_index" although the encoded values
# are words, and rows are per entity rather than per page — confirm this is
# the layout the consumer of the CSV expects.
y = pd.get_dummies(df_words_clean1["word_cleaned"], prefix="page_index")

In [None]:
# Write the one-hot matrix to disk; the row index is included as the first column.
y.to_csv("one-hot-encoding-2.csv")