<a href="https://colab.research.google.com/github/shashithenuwara/IRWA_Project/blob/project/IRWA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. **Data Collection**

In [None]:
#From Google drive
#Mounts the user's Google Drive at /content/drive so that later cells can read
#files through ordinary filesystem paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#From local device
from google.colab import files

# Uploading the PDFs from local device
# NOTE(review): `uploaded` is not referenced by any other cell visible in this
# notebook (the extraction cell reads from pdf_directory on Drive) - confirm
# whether this alternative input path is still needed.
uploaded = files.upload()

2. **Data Preprocessing**

2.1 Text Extraction

In [None]:
#Installing required libraries for PDF text extraction, tokenization, normalization, and other NLP tasks
!pip install PyPDF2 pdfplumber spacy nltk
!python -m spacy download en_core_web_sm

In [None]:
import os
import pdfplumber

#Path to the dataset
#Folder (inside the mounted Google Drive) that is scanned for PDF files by the
#extraction loop below.
pdf_directory = '/content/drive/My Drive/DataSets'

#Extracting text from PDFs
def extract_text_from_pdf (pdf_path):
  """Return the text of every page of the PDF at `pdf_path` as one string.

  Args:
    pdf_path: Path of the PDF file, passed to pdfplumber.open().

  Returns:
    str: The text of all pages that produced any text, joined with newlines.
    An empty string if no page produced text.
  """
  with pdfplumber.open(pdf_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]

  #A page without extractable text (e.g. a scanned image) may yield None or "";
  #skipping those avoids a TypeError from str + None. Joining with a newline
  #keeps the last word of one page from fusing with the first word of the next.
  return "\n".join(page_text for page_text in page_texts if page_text)

#Looping to extract text from all PDFs
#sorted() gives a reproducible document order (os.listdir returns entries in
#arbitrary order), and lower() lets the extension check also match names such
#as "REPORT.PDF".
pdf_texts = []
for filename in sorted(os.listdir(pdf_directory)):
  if filename.lower().endswith(".pdf"):
    pdf_path = os.path.join(pdf_directory, filename)
    text = extract_text_from_pdf(pdf_path)
    pdf_texts.append(text)

2.2 Data Cleaning

In [None]:
#To remove HTML tags
!pip install beautifulsoup4

In [None]:
import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Strip markup, unusual characters, extra whitespace and repeated sentences.

    Args:
        text: Document text to clean.

    Returns:
        str: The cleaned text. Sentences (as delimited by '. ') that occur more
        than once are kept only at their first position.
    """
    #Drop any HTML tags, keeping only the text content
    without_tags = BeautifulSoup(text, "html.parser").get_text()

    #Keep letters, digits, whitespace and the basic punctuation , . ! ? ' -
    allowed_only = re.sub(r"[^a-zA-Z0-9\s,.!?'-]", '', without_tags)

    #Collapse every whitespace run into one space and trim both ends
    single_spaced = re.sub(r'\s+', ' ', allowed_only).strip()

    #Keep only the first occurrence of each sentence, preserving order
    already_seen = set()
    kept_sentences = []
    for sentence in single_spaced.split('. '):
        if sentence in already_seen:
            continue
        already_seen.add(sentence)
        kept_sentences.append(sentence)

    return '. '.join(kept_sentences)

In [None]:
#Applying the cleaning function to all documents
#pdf_texts is a list with one string per PDF (it has no .items()), so iterate
#it directly; cleaned_texts keeps the same order as pdf_texts.
cleaned_texts = [clean_text(text) for text in pdf_texts]

In [None]:
#Print cleaned text from one file
#Shows a preview of the first cleaned document rather than looking up a
#hard-coded 'example.pdf' key, which only works if a file with exactly that
#name was loaded. Works whether cleaned_texts is a dict or a list.
_cleaned_docs = list(cleaned_texts.values()) if isinstance(cleaned_texts, dict) else list(cleaned_texts)
if _cleaned_docs:
    #Limit the output so a long document does not flood the notebook
    print(_cleaned_docs[0][:1000])
else:
    print("No cleaned documents available.")

2.3 Normalization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
#Recent NLTK releases load the word_tokenize models from the 'punkt_tab'
#resource; on older releases that do not know this id the call is expected to
#just report that the resource was not found.
nltk.download('punkt_tab')
nltk.download('stopwords')

#Tokenization and normalization function
def preprocess_text(text):
  """Lowercase and tokenize `text`, keeping alphabetic non-stop-word tokens.

  Args:
    text: Document text (str).

  Returns:
    list[str]: Lowercased tokens from word_tokenize that consist only of
    alphabetic characters and are not in NLTK's English stop-word list.
  """
  #Converting to lowercase and tokenizing
  tokens = word_tokenize(text.lower())

  #Removing punctuation marks and stop words
  stop_words = set(stopwords.words('english'))
  return [word for word in tokens if word.isalpha() and word not in stop_words]


#Preprocessing each extracted text
#This statement has to sit at cell level: when it was indented after the
#function's return it never executed, so preprocessed_texts was undefined for
#the cells that consume it.
preprocessed_texts = [preprocess_text(text) for text in pdf_texts]

2.4 Entity Recognition

In [None]:
import spacy

#Loading spaCy's small English model
#The resulting `nlp` pipeline object is a module-level name used by perform_ner below.
nlp = spacy.load('en_core_web_sm')

#Function for NER
def perform_ner(text):
  """Run the loaded spaCy pipeline on `text` and collect its named entities.

  Args:
    text: The string to analyse.

  Returns:
    list[tuple[str, str]]: One (entity text, entity label) pair per entity in
    the processed document, in document order.
  """
  found = []
  for span in nlp(text).ents:
    found.append((span.text, span.label_))
  return found

#Performing NER on each text
#Each document's tokens are re-joined with single spaces before being passed to perform_ner.
#NOTE(review): preprocessed_texts is lowercased with stop words and punctuation
#removed; entity recognition quality may suffer on such input - consider
#whether NER should run on the original text instead.
entities_per_doc = list(map(perform_ner, map(" ".join, preprocessed_texts)))

2.5 Stemming/Lemmatization

In [None]:
#Using PorterStemmer
from nltk.stem import PorterStemmer

#Initializing stemmer
#A single PorterStemmer instance, used as a module-level name by stem_tokens below.
stemmer = PorterStemmer()

#Function to stem tokens
def stem_tokens (tokens):
  """Return the stem of every token, in the same order.

  Args:
    tokens: Iterable of token strings.

  Returns:
    list: The result of stemmer.stem() for each token.
  """
  stems = []
  for word in tokens:
    stems.append(stemmer.stem(word))
  return stems

#Applying stemming to each document
#One list of stems per entry of preprocessed_texts, in the same order
stemmed_texts = list(map(stem_tokens, preprocessed_texts))

In [None]:
# Importing spacy for lemmatization
import spacy

# Loading the spacy model
# NOTE(review): the entity-recognition cell earlier in this notebook already
# assigns `nlp` from the same 'en_core_web_sm' model; this line loads it again
# and rebinds the name.
nlp = spacy.load('en_core_web_sm')

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Return the lemmas of the alphabetic tokens spaCy finds in `tokens`.

    The tokens are joined with single spaces and run through the loaded
    pipeline; spaCy tokens whose `is_alpha` flag is false are dropped.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list[str]: The `lemma_` of each remaining spaCy token, in order.
    """
    parsed = nlp(" ".join(tokens))
    lemmas = []
    for item in parsed:
        if item.is_alpha:
            lemmas.append(item.lemma_)
    return lemmas

# Applying lemmatization to each document
# One list of lemmas per entry of preprocessed_texts, in the same order
lemmatized_texts = list(map(lemmatize_tokens, preprocessed_texts))