<a href="https://colab.research.google.com/github/shashithenuwara/IRWA_Project/blob/project/upload_from_pc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Data Collection**

In [None]:
#From local device
from google.colab import files

# Uploading the PDFs from local device.
# The returned mapping is keyed by the names of the uploaded files.
uploaded = files.upload()

# Saving the names of uploaded files as a list. A bare dict_keys view cannot be
# indexed (pdf_files[0] raises TypeError); a list can, and it keeps the
# mapping's key order.
pdf_files = list(uploaded.keys())

**2. Data Preprocessing**

2.1 Text Extraction

In [None]:
# Installing required libraries for PDF text extraction (pdfplumber),
# tokenization and normalization (nltk), and other NLP tasks (spacy).
# NOTE(review): PyPDF2 is installed here but is not imported by any cell visible
# in this part of the notebook — confirm it is still needed.
!pip install PyPDF2 pdfplumber spacy nltk
# Fetch the 'en_core_web_sm' model that the later spacy.load(...) calls expect.
!python -m spacy download en_core_web_sm

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts in document order, separated by newlines. A page
        for which no text is extracted contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without a text layer (e.g. a scanned image) can yield None or
        # "" from extract_text(); `or ""` keeps the join from raising TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining with "\n" stops the last word of one page from fusing with the
    # first word of the next, which plain concatenation did.
    return "\n".join(page_texts)

# Loop through each uploaded PDF
extracted_texts = {}
for pdf_file in pdf_files:
    # Save the extracted text for each PDF file, keyed by its file name
    extracted_texts[pdf_file] = extract_text_from_pdf(pdf_file)

# Display extracted text for the first file (or save it to a file).
# The first key is taken from a list of the dict's keys rather than from
# `pdf_files[0]`: `pdf_files` may be a dict_keys view, which cannot be indexed.
first_file = list(extracted_texts.keys())[0]
print(extracted_texts[first_file])

In [None]:
from pathlib import Path

for pdf_file, text in extracted_texts.items():
    # Save the extracted text to a .txt file with the same name as the PDF.
    # with_suffix() swaps only the final extension, whatever its case. The
    # previous str.replace('.pdf', '.txt') left a name such as "REPORT.PDF"
    # unchanged, so the text was written over the uploaded PDF itself, and it
    # also rewrote any '.pdf' occurring in the middle of a name.
    txt_path = Path(pdf_file).with_suffix('.txt')
    # Explicit UTF-8 so non-ASCII characters do not depend on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

2.2 Data Cleaning

In [None]:
# To remove HTML tags: installs beautifulsoup4, which provides the `bs4`
# package that the next cell imports BeautifulSoup from.
!pip install beautifulsoup4

In [None]:
import re
from bs4 import BeautifulSoup
from google.colab import files
import pdfplumber

# Step 3: Cleaning function to process text
def clean_text(text):
    """Strip markup, unusual characters, extra whitespace and repeated sentences.

    Args:
        text (str): Raw text, e.g. as extracted from a PDF.

    Returns:
        str: The text with HTML tags removed, every character other than ASCII
        letters, digits, whitespace and ``, . ! ? ' -`` dropped, whitespace
        runs collapsed to single spaces, and repeated sentences (split on
        ``'. '``) reduced to their first occurrence.
    """
    # Removing HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Removing special characters and punctuation (except basic punctuation)
    text = re.sub(r"[^a-zA-Z0-9\s,.!?'-]", '', text)

    # Removing extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Removing duplicate sentences. The final period is set aside before
    # splitting: otherwise the last sentence keeps its '.', never compares
    # equal to an earlier copy of itself, and survives as a duplicate.
    has_final_period = text.endswith('.')
    body = text[:-1] if has_final_period else text
    unique_sentences = list(dict.fromkeys(body.split('. ')))
    cleaned_text = '. '.join(unique_sentences)
    if has_final_period:
        cleaned_text += '.'

    return cleaned_text

In [None]:
# Step 4: Apply the cleaning function to the extracted text.
# The extraction cell stores its results in `extracted_texts`; the name
# `pdf_texts` used here previously is never defined and raised NameError.
cleaned_texts = {filename: clean_text(text) for filename, text in extracted_texts.items()}

# Step 5: Print cleaned text from one file (e.g., the first uploaded file)
first_file = list(cleaned_texts.keys())[0]  # Get the name of the first file
print(cleaned_texts[first_file])

2.3 Normalization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download required NLTK packages.
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases (3.9 and
# later) look it up as 'punkt_tab', older ones as 'punkt', so both are
# requested; nltk.download reports an unavailable package instead of raising,
# so the extra request is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Tokenize and normalize one document
def preprocess_text(text):
    """Lower-case and tokenize `text`, keeping only alphabetic non-stop-words.

    Args:
        text (str): Text to normalize.

    Returns:
        list[str]: Tokens of the lower-cased text, in order, restricted to
        purely alphabetic tokens that are not English stop words.
    """
    english_stops = set(stopwords.words('english'))
    return [
        tok
        for tok in word_tokenize(text.lower())
        # isalpha() discards punctuation and numeric tokens
        if tok.isalpha() and tok not in english_stops
    ]

# `extracted_texts` is the dictionary built by the earlier text extraction step
# (the name `pdf_texts` used here previously is never defined -> NameError).
# Preprocess each extracted text and store in a new dictionary
preprocessed_texts = {filename: preprocess_text(text) for filename, text in extracted_texts.items()}

# Print preprocessed tokens from the first file for verification
first_file = list(preprocessed_texts.keys())[0]  # Get the name of the first file
print(f"Preprocessed tokens for {first_file}:")
print(preprocessed_texts[first_file])

2.4 Entity Recognition

In [None]:
import spacy

# Load spaCy's small English model once for this cell; `nlp` is the pipeline
# object that perform_ner() below calls on each text.
nlp = spacy.load('en_core_web_sm')

# Named Entity Recognition (NER) helper
def perform_ner(text):
    """Run the loaded spaCy pipeline on `text` and list its named entities.

    Args:
        text (str): Text to analyse.

    Returns:
        list[tuple[str, str]]: One ``(entity text, entity label)`` pair per
        entity, in the order they appear in ``doc.ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Perform NER on the extracted text of each document.
# The token lists in `preprocessed_texts` are lower-cased and have stop words
# and punctuation removed, which strips the capitalisation and sentence context
# that spaCy's statistical NER relies on; the original text is used instead so
# those cues reach the model.
entities_per_doc = {filename: perform_ner(text) for filename, text in extracted_texts.items()}

# Print the named entities from the first file as an example
first_file = list(entities_per_doc.keys())[0]  # Get the name of the first file
print(f"Named entities for {first_file}:")
print(entities_per_doc[first_file])

2.5 Stemming/Lemmatization

In [None]:
# Using PorterStemmer for Stemming
from nltk.stem import PorterStemmer
import spacy

# Initializing the PorterStemmer once at notebook level; stem_tokens() below
# reuses this single instance for every token.
stemmer = PorterStemmer()

# Stemming helper
def stem_tokens(tokens):
    """Return the stem of each token, in the same order.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: ``stemmer.stem(token)`` for every token in `tokens`.
    """
    return list(map(stemmer.stem, tokens))

# Stem every document's tokens; results are keyed by the same filenames
# as preprocessed_texts.
stemmed_texts = {
    name: stem_tokens(doc_tokens)
    for name, doc_tokens in preprocessed_texts.items()
}

# Show the stemmed tokens of the first document as a sample
first_file = list(preprocessed_texts)[0]
print(f"Stemmed tokens for {first_file}:", stemmed_texts[first_file], sep="\n")

In [None]:
# spaCy for Lemmatization: nothing is imported in this cell; `spacy` comes from
# the `import spacy` statements in the earlier cells.
# Loading the spaCy model. The NER cell loads the same 'en_core_web_sm' model;
# this line rebinds `nlp` to a newly loaded instance of it.
nlp = spacy.load('en_core_web_sm')

# Lemmatization helper
def lemmatize_tokens(tokens):
    """Lemmatize a token list with the loaded spaCy pipeline.

    The tokens are joined with single spaces and re-parsed by ``nlp``, so the
    output follows spaCy's own tokenization of that string.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list[str]: ``lemma_`` of every spaCy token whose ``is_alpha`` is true.
    """
    doc = nlp(" ".join(tokens))
    lemmas = []
    for tok in doc:
        if tok.is_alpha:
            lemmas.append(tok.lemma_)
    return lemmas

# Lemmatize every document's tokens; results are keyed by the same filenames
# as preprocessed_texts.
lemmatized_texts = {
    name: lemmatize_tokens(doc_tokens)
    for name, doc_tokens in preprocessed_texts.items()
}

# Show the lemmatized tokens of the sample document (`first_file` was set in
# the stemming cell)
print(f"Lemmatized tokens for {first_file}:", lemmatized_texts[first_file], sep="\n")