<a href="https://colab.research.google.com/github/skevin-dev/NLP-FELLOWSHIP/blob/week2/Cleaning_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so the notebook can read the files stored under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [51]:
# !pip install textract

In [52]:
# pip install clean-text


In [4]:
import textract
import os
import re 
import string 
from cleantext import clean



In [5]:
# Folder on the mounted Drive that holds the source PDFs
# (the spaces inside the folder names are part of the path).
path = '/content/gdrive/MyDrive/NLP Fellowship /week1 /data'
# Work from the data folder so files can be opened by bare file name.
os.chdir(path)
# List the folder contents to confirm the PDFs are visible.
!ls

'Hansard Report - Thursday, 13th October 2022 (P).pdf'
'Hansard Report - Thursday 6th October 2022 (P).pdf'
'Hansard Report - Tuesday, 11th October 2022 (P).pdf'
'Hansard Report - Wednesday, 12th October 2022 (A).pdf'
'Hansard Report - Wednesday, 12th October 2022 (P)_0.pdf'
 text_files


In [53]:
# Extract the content of one Hansard PDF; the result of textract.process is
# decoded as UTF-8 to get a str.
text = textract.process(
    "Hansard Report - Thursday, 13th October 2022 (P).pdf"
).decode("utf-8")

# Cleaning Text

This is the first step in the pipeline. In the real world, text is rarely clean. The text extracted above contains many newlines, blank spaces, punctuation marks, URLs and more. These are not helpful in classification tasks, so we need to remove them. 

## Removing the empty lines
For the text to be processed well, we need to have the corpus split by only one new line. This will result in one sentence per line

In [7]:
# Keep only the lines that still contain something after stripping whitespace.
processed_text = [line for line in text.splitlines() if line.strip()]

print(processed_text[:20])

['REPUBLIC OF KENYA ', 'THIRTEENTH PARLIAMENT ', 'NATIONAL ASSEMBLY ', 'THE HANSARD ', 'VOL. I NO. 10 ', '13th October 2022                       NATIONAL ASSEMBLY DEBATES                                               1 ', 'THE HANSARD ', 'Thursday, 13th October 2022 ', 'The House met at 2.30 p.m. ', '[The Hon. Speaker (Hon. Moses Wetang’ula) in the Chair] ', 'PRAYERS ', 'Hon. Speaker: We have quorum and so we can proceed.  ', 'NOTICES OF MOTIONS ', 'Hon: Speaker: The Member for Kwanza, Hon. Ferdinand Wanyonyi. ', 'PUBLICATION OF REGULATIONS ON PRIVATE ', 'LAND USE AND MANAGEMENT ', 'Hon. Ferdinand Wanyonyi (Kwanza, FORD-K): Hon. Speaker, I beg to give Notice ', 'of the following Motion:  ', 'THAT, aware that Article 68(c)(i) of the Constitution of Kenya provides ', 'for the minimum and maximum holding acreages with respect to private land; ']


In [8]:
# Number of non-empty lines kept from the extracted text.
print(len(processed_text))

3349


## Same case
When preparing text for a classification task, all of the text needs to be in the same case. We are trying to keep the number of distinct words as small as possible.

In [9]:
# String comparison is case-sensitive: only identically cased strings are equal.
for left, right in (('Text', 'text'), ("text", "text"), ("TEXT", "TEXT")):
  print(left == right)

False
True
True


In [10]:
# Lower-case every line so words that differ only in casing become identical.
case_processed = [line.lower() for line in processed_text]

print(case_processed[:20])

['republic of kenya ', 'thirteenth parliament ', 'national assembly ', 'the hansard ', 'vol. i no. 10 ', '13th october 2022                       national assembly debates                                               1 ', 'the hansard ', 'thursday, 13th october 2022 ', 'the house met at 2.30 p.m. ', '[the hon. speaker (hon. moses wetang’ula) in the chair] ', 'prayers ', 'hon. speaker: we have quorum and so we can proceed.  ', 'notices of motions ', 'hon: speaker: the member for kwanza, hon. ferdinand wanyonyi. ', 'publication of regulations on private ', 'land use and management ', 'hon. ferdinand wanyonyi (kwanza, ford-k): hon. speaker, i beg to give notice ', 'of the following motion:  ', 'that, aware that article 68(c)(i) of the constitution of kenya provides ', 'for the minimum and maximum holding acreages with respect to private land; ']


## Remove duplicate lines
When scraping data, there is a high probability of the text containing duplicate lines or even whole duplicate articles.

In [11]:
# A set keeps a single copy of each value, so the repeated items collapse.
set_example = {1, 1, 2, 3, 4, 4, 5}
set_example

{1, 2, 3, 4, 5}

In [12]:
# A bare set() would drop duplicates but not keep the original line order, so
# de-duplicate through dict keys, which preserve first-seen order.
no_dups_text = list(dict.fromkeys(case_processed))
# Every distinct line that was seen.
checked_text = set(no_dups_text)

print(no_dups_text[:10])


['republic of kenya ', 'thirteenth parliament ', 'national assembly ', 'the hansard ', 'vol. i no. 10 ', '13th october 2022                       national assembly debates                                               1 ', 'thursday, 13th october 2022 ', 'the house met at 2.30 p.m. ', '[the hon. speaker (hon. moses wetang’ula) in the chair] ', 'prayers ']


In [13]:
# Number of lines left after dropping duplicates.
print(len(no_dups_text))

3188


## Remove the unwanted whitespaces
The document has unwanted whitespace in between the sentences.

In [14]:
# Collapse every run of whitespace to a single space and trim both ends of each line.
text_wo_whitespaces = list(map(lambda line: " ".join(line.split()), no_dups_text))

In [15]:
# Show the first ten whitespace-normalised lines.
text_wo_whitespaces[:10]

['republic of kenya',
 'thirteenth parliament',
 'national assembly',
 'the hansard',
 'vol. i no. 10',
 '13th october 2022 national assembly debates 1',
 'thursday, 13th october 2022',
 'the house met at 2.30 p.m.',
 '[the hon. speaker (hon. moses wetang’ula) in the chair]',
 'prayers']

## Remove URLS and Email address
We do not need urls and email addresses

In [16]:
# Find e-mail addresses and URLs in the cleaned lines.

# A small sample list, reused by later cells to demonstrate the cleaning steps.
shyaka = ['send the letter to shyakakevin1@gmail.com ', 'this is nlp ~~fellowship \U0001f602', 'find!! the updloaded video?\U0001f601 on this link: https://www.youtube.com/watch?v=oorVWW9ywG0&list=RDMM&index=4']

# Patterns for e-mail addresses and http(s) URLs.
# The TLD class is [A-Za-z], not [A-Z|a-z]: inside a character class "|" is a
# literal pipe, not alternation.
regexE = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
regexL = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

emails = []
links = []

# Search inside each whole line instead of full-matching space-separated
# tokens, so an address followed by punctuation (e.g. "name@mail.com.") or
# wrapped in brackets is still found. Neither pattern has a capturing group,
# so re.findall returns the complete matched strings.
for line in text_wo_whitespaces:
  emails.extend(re.findall(regexE, line))
  links.extend(re.findall(regexL, line))




In [17]:
# Show what was found: (list of e-mail addresses, list of links).
emails,links

([], [])

In [18]:
# Remove e-mail addresses and URLs from the sample lines, in case the corpus has any.
# Inside a character class "|" is a literal pipe, so the TLD class is written
# [A-Za-z] rather than [A-Z|a-z].
regexALL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Every match is replaced by the empty string; the surrounding text is kept as is.
w = [re.sub(regexALL, "", sample) for sample in shyaka]
w

['send the letter to  ',
 'this is nlp ~~fellowship 😂',
 'find!! the updloaded video?😁 on this link: ']

There are no emails or links in our text.

## Removing punctuations
Punctuations are tricky especially in African languages. We cannot use the punctuation functions in string class. We need to create our own custom list of punctuations.

['[]','\\',':',';','']

In [19]:
# Strip every character listed in string.punctuation from each line.
# The translation table is built once and reused for all lines.
# Note: marks outside string.punctuation (such as the curly apostrophe ’) are kept.
_punct_table = str.maketrans("", "", string.punctuation)
no_punct = [line.translate(_punct_table) for line in text_wo_whitespaces]
no_punct[-20:]

['thank you so much for allowing me this opportunity',
 'the temporary speaker hon david ochieng’ thank you so much member for',
 'kesses',
 'that marks the end of debate on this motion and like i said i apologise to members',
 'who were not able to speak next time they will have an opportunity to do so the mover of',
 'this motion would be asked to reply to it next time',
 'hon members pursuant to the provisions of standing order no283 regarding the',
 'calendar of the assembly and resolutions of the house on wednesday 12th october 2022 the',
 'speaker notifies that upon the rise of the house at the appointed time today regular sittings',
 'will resume on tuesday 25th october 2022 at 230 pm',
 'adjournment',
 'the temporary speaker hon david ochieng’ there being no other business and',
 'the time being 911 pm this house stands adjourned until tuesday 25th october 2022 at',
 '230 pm',
 'the house rose at 911 pm',
 '13th october 2022 national assembly debates 66',
 'published by',
 'cle

In [20]:
# The same last twenty lines before punctuation removal, for comparison.
text_wo_whitespaces[-20:]

['thank you so much for allowing me this opportunity.',
 'the temporary speaker (hon. david ochieng’): thank you so much, member for',
 'kesses.',
 'that marks the end of debate on this motion; and like i said, i apologise to members',
 'who were not able to speak. next time, they will have an opportunity to do so. the mover of',
 'this motion would be asked to reply to it next time.',
 'hon. members pursuant to the provisions of standing order no.28(3), regarding the',
 'calendar of the assembly, and resolutions of the house on wednesday 12th october 2022, the',
 'speaker notifies that upon the rise of the house at the appointed time today, regular sittings',
 'will resume on tuesday, 25th october 2022 at 2:30 p.m.',
 'adjournment',
 'the temporary speaker (hon. david ochieng’): there being no other business, and',
 'the time being 9.11 p.m., this house, stands adjourned until tuesday, 25th october 2022, at',
 '2.30 p.m.',
 'the house rose at 9.11 p.m.',
 '13th october 2022 national a

## Remove Emoticons
The internet has brought the rise of emojis. If you are processing data from social media, you will need to remove them. Emojis are represented as Unicode code points, e.g. U+1F600.

In [21]:
# Sample lines containing an e-mail address, emojis and a URL.
shyaka

['send the letter to shyakakevin1@gmail.com ',
 'this is nlp ~~fellowship 😂',
 'find!! the updloaded video?😁 on this link: https://www.youtube.com/watch?v=oorVWW9ywG0&list=RDMM&index=4']

In [22]:
# Strip emojis from the sample lines with clean-text.
# NOTE: as the output below shows, clean() with these arguments also
# lower-cases the text and trims trailing whitespace.
no_emojis_ = []
for sample in shyaka:
  no_emojis_.append(clean(sample, no_emoji=True))
no_emojis_

['send the letter to shyakakevin1@gmail.com',
 'this is nlp ~~fellowship',
 'find!! the updloaded video? on this link: https://www.youtube.com/watch?v=oorvww9ywg0&list=rdmm&index=4']

In [23]:
# Code point ranges treated as emoji, as (first, last) pairs.
_EMOJI_RANGES = (
    ("\U0001F600", "\U0001F64F"),  # emoticons
    ("\U0001F300", "\U0001F5FF"),  # symbols & pictographs
    ("\U0001F680", "\U0001F6FF"),  # transport & map symbols
    ("\U0001F1E0", "\U0001F1FF"),  # flags (iOS)
)

# One character class covering all ranges; "+" removes a run of emojis in one go.
emoji_pattern = re.compile(
    "[" + "".join(f"{first}-{last}" for first, last in _EMOJI_RANGES) + "]+",
    flags=re.UNICODE,
)

# Unlike clean(), this only deletes the emoji characters and leaves case and
# surrounding whitespace untouched.
no_emojis = [emoji_pattern.sub(r'', sample) for sample in shyaka]

no_emojis

['send the letter to shyakakevin1@gmail.com ',
 'this is nlp ~~fellowship ',
 'find!! the updloaded video? on this link: https://www.youtube.com/watch?v=oorVWW9ywG0&list=RDMM&index=4']

## Remove sensitive data
There are some privacy considerations when dealing with data. Some data is sensitive, such as phone numbers, names, card details and many more, e.g. +44XX XXXXX.

In [24]:
## ENTER CODE HERE


# In-class Practicals
Finish writing the code above. Save the data in a txt file. It will be used for the next class


# Assignment 
Create a function that will run all the steps in one iteration. Run the code on the csv file used in the last assignment. 

In [41]:
# Small hand-written sample covering the cases the cleaner must handle: a
# duplicated line, emojis, an e-mail address, a phone number, a URL, mixed
# case and stray punctuation.
text_= '''
This is nlp fellowship which might be good \U0001f602 for those interested in machine'learning!!!. 
This is nlp fellowship which might be good \U0001f602 for those interested in machine'learning!!!. 
got a question ?, All question must be SUBMITTED  to this email: shyakakevin1@gmail.com. interesting \U0001f602 ??? or yeah ~~~ and this phone +4487928272. 
we have monthly hackaton for 250$ each groups and mr. LADIES and GENTLEMAN, we invite to visit this link https://huzalabs.medium.com/nlp-fellowship-2022-africa-boosting-skills-in-machine-translation-and-ai-chatbots-7fe9886fdceb
for our medium. we have PROF.MCsharry on the board>> a nd our CEO da @vy to make thing@#!s cleae@$rl. 
'''

In [50]:
def cleaning_text(text):
  """Clean a raw text document line by line.

  The steps run in this order: drop blank lines, lower-case, drop duplicate
  lines (keeping the first occurrence), remove e-mail addresses and URLs,
  remove ASCII punctuation, remove emojis, remove digit runs / phone numbers,
  and finally collapse whitespace.

  Args
  ----
  text(str): text to be cleaned

  Returns
  -------
  final_text(list): one cleaned string per kept line, in the original order.
    A line can end up as an empty string if everything on it was removed.
  """

  # split the text into lines and drop the empty / whitespace-only ones
  uncleaned_text = [line for line in text.splitlines() if line.strip()]

  # same case
  same_case = [line.lower() for line in uncleaned_text]

  # remove duplicate lines: dict keys keep first-seen order and give a
  # constant-time membership test, instead of rescanning a list for every line
  no_dups_line = list(dict.fromkeys(same_case))

  # remove emails and urls
  # (the TLD class is [A-Za-z]; inside a character class "|" is a literal pipe)
  email_link_pattern = re.compile(
      r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  )
  no_email_link = [email_link_pattern.sub("", line) for line in no_dups_line]

  # remove punctuations; the translation table is built once for all lines
  punct_table = str.maketrans("", "", string.punctuation)
  no_punct = [line.translate(punct_table) for line in no_email_link]

  # remove emojis
  no_emojis = [clean(line, no_emoji=True) for line in no_punct]

  # remove sensitive data
  # "+" is part of string.punctuation and has normally been stripped above, so
  # in practice the first alternative removes every run of digits.
  sensitive_data_pattern = re.compile(r'\d{1,}|\+\d+\s\d+\s\d+|\+\d+\s\d+')
  no_sensitive_data = [sensitive_data_pattern.sub("", line) for line in no_emojis]

  # remove white spaces: collapse runs to one space and trim both ends
  final_text = [" ".join(line.split()) for line in no_sensitive_data]

  return final_text

# Write text into a .txt file
cleaned_data =  cleaning_text(text)
path = '/content/gdrive/MyDrive/NLP Fellowship /week 2 /cleaned_data/'
# path already ends with "/", so join the file name with os.path.join rather
# than adding another "/". The with-block closes the file when the loop ends,
# so buffered lines are flushed and the handle is not leaked.
with open(os.path.join(path, 'cleaned_data.txt'), 'w+', encoding='utf-8') as y:
  for x in cleaned_data:
    # one cleaned line per row
    y.write(x+' \n')