Data Collection and preprocessing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_amazon_reviews(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.amazon.com/'
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        reviews = soup.find_all('div', class_='a-section review aok-relative')

        data = []
        for review in reviews[:100]:
            text = review.find('span', class_='review-text').text.strip()
            data.append({'Review': text})
            print(text)

        return data
    else:
        print('Failed to fetch data')
        return None

def save_to_csv(data, file_path):
    with open(file_path, mode='a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['Review'])
        writer.writeheader()
        for item in data:
            writer.writerow(item)

if __name__ == '__main__':
    url = 'https://www.amazon.in/Apple-Midnight-Aluminium-Fitness-Resistant/product-reviews/B0BDKSLQ1X/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
    reviews = scrape_amazon_reviews(url)
    if reviews:
        save_to_csv(reviews, 'amazon_reviews.csv')
        print('Reviews saved to amazon_reviews.csv')


Following is the code to merge multiple csv files (in case reviews are stored in multiple files)

In [None]:
import csv

def merge_csv_files(file1, file2, output_file):
    with open(file1, 'r', newline='') as csv_file1:
        csv_reader1 = csv.reader(csv_file1)
        data1 = list(csv_reader1)

    with open(file2, 'r', newline='') as csv_file2:
        csv_reader2 = csv.reader(csv_file2)
        data2 = list(csv_reader2)

    merged_data = data1 + data2

    with open(output_file, 'w', newline='') as merged_file:
        csv_writer = csv.writer(merged_file)
        csv_writer.writerows(merged_data)

if __name__ == '__main__':
    file1 = '/content/sample_data/amazon_reviews (2).csv'
    file2 = '/content/sample_data/amazon_reviews.csv'
    output_file = 'merged_file.csv'

    merge_csv_files(file1, file2, output_file)
    print(f'Merged files {file1} and {file2} into {output_file}')


Following is the code to clean text data.
It does the following:
(1) Remove noise, such as special characters and punctuations.

(2) Remove numbers.

(3) Remove stopwords by using the stopwords list.

(4) Lowercase all texts

(5) Stemming.

(6) Lemmatization.

In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Acquiring the NLTK assets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Loading the dataset and ensuring its integrity
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/merged_file.csv')
print("Data successfully loaded.")
print("Total rows:", len(df))

# Define the stop words and initialize the stemmer and the lemmatizer
stop_words_list = set(stopwords.words('english'))
stemmer_engine = PorterStemmer()
lemmatizer_tool = WordNetLemmatizer()

# This is the text cleaning function
def sanitize_text(text):
    # Eliminating punctuations and numbers
    text = ''.join([character for character in text if character not in string.punctuation and not character.isdigit()])
    # Tokenizing the text
    tokens = word_tokenize(text)
    # Removing stop words
    tokens = [word for word in tokens if word.lower() not in stop_words_list]
    # Converting to lowercase
    tokens = [word.lower() for word in tokens]
    # Lemmatizing
    lemmatized_tokens = [lemmatizer_tool.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

# Applying text cleaning to the Review column
df['Cleaned_Review'] = df['Review'].apply(sanitize_text)

# Save the cleaned data to a new CSV file
cleaned_file_path = '/content/drive/MyDrive/Aspect_Based_Summarization/clean_text.csv'
df.to_csv(cleaned_file_path, index=False)

print("Data frame successfully saved to:", cleaned_file_path)


Following is the code to extract text data from XML file present in between a specific tag. Here the code extracts data present in between the 'text' tag

In [None]:
import xml.etree.ElementTree as ET
import csv

# Here we parse the XML file
tree = ET.parse('/content/drive/MyDrive/Aspect_Based_Summarization/Laptop_Train_v2.xml') #Path to the XML file
root = tree.getroot()

# Opening a CSV file for writing
with open('output.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Text'])  # Write header

    # Extracting the data between <text> </text> tags and write to CSV file
    for elem in root.iter('text'):
        text = elem.text.strip() if elem.text else ''  # Removing the leading or trailing whitespaces
        writer.writerow([text])

print("Data extracted and saved to 'output.csv' file.")


Data extracted and saved to 'output.csv' file.


In [None]:
import pandas as pd
df = pd.read_csv('/content/output.csv')#Path to the new file containing the text data


Code to customize the number of reviews present in the file which can be used for sample run

In [None]:
import pandas as pd

# Reading the CSV file
df = pd.read_csv('/content/output_1000.csv')

# Specify the number of reviews to be kept in the bracket
df_subset = df.head(100)

# Writing the subset to a new CSV file
df_subset.to_csv('output_100.csv', index=False)

print("First 1001 rows saved to 'output_1000.csv'")


First 1001 rows saved to 'output_1000.csv'


Starting the first phase of the project: Named Entity Recognition
We have used 4 pre-trained unsupervised models to extract Named Entities From the acquired reviews dataset.

BERT-NER

In [None]:
!pip install transformers




In [None]:
#Importing necessary libraries
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification
import torch
import csv

# Loading the dataset
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv')
print("Dataset Loaded")

# Loading the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming 2 labels for binary classification
print("BERT Tokenizer loaded")


# Function to extract named entities from text
def extract_named_entities(text):
    tokens = tokenizer.tokenize(text)
    tokenized_texts = []
    segment_ids = []
    tokenized_texts.append("[CLS]")
    segment_ids.append(0)
    for token in tokens:
        tokenized_texts.append(token)
        segment_ids.append(0)
    tokenized_texts.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokenized_texts)
    input_ids = input_ids[:512]  # Truncate to maximum length
    segment_ids = segment_ids[:512]  # Truncate to maximum length

    inputs = torch.tensor([input_ids])
    segments = torch.tensor([segment_ids])
    outputs = model(inputs, token_type_ids=segments)
    predictions = torch.argmax(outputs.logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(inputs.squeeze().tolist())
    named_entities = [(token, model.config.id2label[prediction]) for token, prediction in zip(tokens, predictions.squeeze().tolist())]
    return named_entities

# Output file path
output_file_path = '/content/drive/MyDrive/Aspect_Based_Summarization/BERT_CSV.csv'
print("CSV file saved")

# Processes each text and stores the named entities in the CSV file
with open(output_file_path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Token', 'Entity'])
    for text in df['Text']:
        named_entities = extract_named_entities(text)
        for token, entity in named_entities:
            writer.writerow([token, entity])
        writer.writerow([])  # Separate documents by empty row

print(f"Named entities extracted and stored in '{output_file_path}'.")


Dataset Loaded


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT Tokenizer loaded
CSV file saved
Named entities extracted and stored in '/content/drive/MyDrive/Aspect_Based_Summarization/BERT_CSV.csv'.


In [None]:
from transformers import BertTokenizer, BertForTokenClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Importing necessary libraries
import pandas as pd
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Loading the dataset
#df = pd.read_csv('your_dataset.csv')

# Loading the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Assuming 2 labels for binary classification

# Following is the function to extract named entities from text
def extract_named_entities(text):
    tokens = tokenizer.tokenize(text)
    tokenized_texts = []
    segment_ids = []
    tokenized_texts.append("[CLS]")
    segment_ids.append(0)
    for token in tokens:
        tokenized_texts.append(token)
        segment_ids.append(0)
    tokenized_texts.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokenized_texts)
    input_ids = input_ids[:512]  # Truncate to maximum length
    segment_ids = segment_ids[:512]  # Truncate to maximum length

    inputs = torch.tensor([input_ids])
    segments = torch.tensor([segment_ids])
    outputs = model(inputs, token_type_ids=segments)
    predictions = torch.argmax(outputs.logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(inputs.squeeze().tolist())
    named_entities = [(token, model.config.id2label[prediction]) for token, prediction in zip(tokens, predictions.squeeze().tolist())]
    return named_entities

# Output file path
output_file_path = 'named_entities.txt'

# Process each text and store named entities in the file
with open(output_file_path, 'w') as file:
    for text in df['Text']:
        named_entities = extract_named_entities(text)
        for token, entity in named_entities:
            file.write(f"{token}\t{entity}\n")
        file.write("\n")  # Separate documents by newline

print(f"Named entities extracted and stored in '{output_file_path}'.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'df' is not defined

Using Flair NER to extract named entities

In [None]:
pip install flair




In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/output_100.csv')

# Preprocess the data (if needed)
data = df['Text'].tolist()


In [None]:
from flair.models import SequenceTagger

# Loading the pre-trained NER model
tagger = SequenceTagger.load('ner')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/432M [00:00<?, ?B/s]

2024-04-01 18:13:33,282 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


Working code to extract Named Entities

In [None]:
import pandas as pd

# Loading the dataset
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv')
print("Dataset Loaded")

# Preprocess the data (if needed)
data = df['Text'].tolist()

from flair.models import SequenceTagger

# Loading the pre-trained NER model
tagger = SequenceTagger.load('ner')

# Process each review and extract named entities
from flair.data import Sentence
named_entities_list = []
for review in data:
    sentence = Sentence(review)
    tagger.predict(sentence)
    named_entities = [(entity.text, entity.labels[0].value) for entity in sentence.get_spans('ner')]
    named_entities_list.extend([(review, entity[0], entity[1]) for entity in named_entities])

# Convert the list of named entities to a DataFrame
ne_df = pd.DataFrame(named_entities_list, columns=['Review', 'Named Entity', 'Entity Type'])

# Save the result to a new CSV file
ne_df.to_csv('/content/drive/MyDrive/Aspect_Based_Summarization/Flair_NER.csv', index=False)
print("File successfully saved")

2024-04-01 18:55:20,724 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


Modifying code to extract a min of 3 entities

In [None]:
# Process each review and extract named entities
named_entities_list = []
for review in data:
    num_entities = 0
    sentence = Sentence(review)
    tagger.predict(sentence)
    named_entities = [(entity.text, entity.labels[0].value) for entity in sentence.get_spans('ner')]
    for entity in named_entities:
        named_entities_list.append((review, entity[0], entity[1]))
        num_entities += 1
        if num_entities >= 3:
            break

# Convert the list of named entities to a DataFrame
ne_df = pd.DataFrame(named_entities_list, columns=['Review', 'Named Entity', 'Entity Type'])

# Save the result to a new CSV file
ne_df.to_csv('/content/extracted_named_entities_Flair.csv', index=False)


Hugging Face NER

In [None]:
import pandas as pd
from transformers import pipeline

# Load the NER pipeline
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english")

# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv')

# Create a list to store the extracted entities
entity_list = []

# Iterate over rows and extract entities
for index, row in df.iterrows():
    text = row['Text']
    entities = ner(text)
    for entity in entities:
        entity_list.append({
            'Entity': entity['entity'],
            'Text': entity['word'],
            'Row': index
        })

# Create a new DataFrame from the entity list
entity_df = pd.DataFrame(entity_list)

# Save the entity DataFrame to a new CSV file
entity_df.to_csv('/content/drive/MyDrive/Aspect_Based_Summarization/HuggingFace_NER.csv', index=False)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


spaCY NER

In [None]:
import spacy
import pandas as pd

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Read the CSV file
df = pd.read_csv("/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv")

# Specify the column containing the text data
text_column = "Text"

# Function to extract entities from text
def extract_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    # Ensure a minimum of 4 entities are extracted
    while len(entities) < 4:
        entities.append("")
    return ", ".join(entities[:4])

# Apply the function to the text column
df["Entities"] = df[text_column].apply(extract_entities)

# Save the result to a new CSV file
df.to_csv("/content/drive/MyDrive/Aspect_Based_Summarization/spaCY_NER.csv", index=False)


Extracting entitiy columns from the generated csv files.

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("/content/sample_data/Flair_NER_OP.csv")

# Specify the column you want to store
column_name = "Named Entity"

# Store the column data to a new file
df[column_name].to_csv("new_file.csv", index=False)


Merging all the csv files with enities

In [None]:
import pandas as pd

# Read the three CSV files
df1 = pd.read_csv("/content/new_file.csv")
df2 = pd.read_csv("/content/new_file1.csv")
df3 = pd.read_csv("/content/new_file2.csv")
# Concatenate the data from all three columns into a single column
merged_data = pd.concat([df1.stack(), df2.stack(), df3.stack()]).reset_index(drop=True)

# Create a new DataFrame with the merged data in a single column
merged_df = pd.DataFrame({"Merged_Column": merged_data})

# Save the new DataFrame to a new CSV file
merged_df.to_csv("Entities.csv", index=False)


Code to rank the entity on frequency

In [None]:
import csv
from collections import Counter

# Assuming the CSV file has a column named 'Named_Entity' containing the named entities
csv_file_path = '/content/Entities.csv'

# Read the CSV file and extract the named entities
named_entities_list = []
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        named_entities_list.append(row['Merged_Column'])

# Count the occurrences of each named entity
entity_counts = Counter(named_entities_list)

# Sort the entities based on their count in descending order
sorted_entities = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)

# Print the ranked list of named entities
for rank, (entity, count) in enumerate(sorted_entities, start=1):
    print(f"Rank {rank}: {entity} (Count: {count})")


Rank 1: , , ,  (Count: 579)
Rank 2: Mac (Count: 71)
Rank 3: Apple (Count: 56)
Rank 4: HP (Count: 51)
Rank 5: Dell (Count: 36)
Rank 6: ##shi (Count: 31)
Rank 7: ##ba (Count: 30)
Rank 8: ##B (Count: 30)
Rank 9: Windows (Count: 28)
Rank 10: To (Count: 28)
Rank 11: ##ook (Count: 25)
Rank 12: Toshiba (Count: 19)
Rank 13: Vista (Count: 19)
Rank 14: PC (Count: 19)
Rank 15: i (Count: 17)
Rank 16: Pro (Count: 17)
Rank 17: Sony (Count: 14)
Rank 18: Microsoft (Count: 13)
Rank 19: Toshiba, , ,  (Count: 13)
Rank 20: Apple, , ,  (Count: 12)
Rank 21: MacBook Pro (Count: 10)
Rank 22: Acer (Count: 10)
Rank 23: Ace (Count: 10)
Rank 24: ##r (Count: 10)
Rank 25: Mac, , ,  (Count: 10)
Rank 26: ##book (Count: 9)
Rank 27: 7 (Count: 9)
Rank 28: Best (Count: 8)
Rank 29: ##M (Count: 8)
Rank 30: one, , ,  (Count: 8)
Rank 31: X (Count: 7)
Rank 32: Buy (Count: 7)
Rank 33: ##P (Count: 7)
Rank 34: G (Count: 7)
Rank 35: first, , ,  (Count: 7)
Rank 36: Best Buy (Count: 6)
Rank 37: iTunes (Count: 6)
Rank 38: Intel (Cou

In [None]:
import csv
from collections import Counter

# Assuming the CSV file has a column named 'Named_Entity' containing the named entities
csv_file_path = '/content/Entities.csv'
output_csv_file_path = '/content/sample_data/Rank.csv'

# Read the CSV file and extract the named entities
named_entities_list = []
with open(csv_file_path, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        named_entities_list.append(row['Merged_Column'])

# Count the occurrences of each named entity
entity_counts = Counter(named_entities_list)

# Sort the entities based on their count in descending order
sorted_entities = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)

# Write the ranked list of named entities to a new CSV file
with open(output_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Named_Entity', 'Count'])
    for rank, (entity, count) in enumerate(sorted_entities, start=1):
        writer.writerow([entity, count])

print(f"Ranked named entities saved to {output_csv_file_path}")


Ranked named entities saved to /content/sample_data/Rank.csv


Phase 2 Starts Now. Step 1: Storing vector embeddings of each review

In [None]:
pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552449 sha256=b905579b7ea3af77c53469507da21ccbee71362c7402f32c495c70fddcd8a065
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

Retrieving related reviews by aspects

Retrieving related reviews by aspects and summarizing them using t5 transformer

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from transformers import pipeline

# Load the CSV file
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv')

# Extract texts from the "Text" column
texts = df['Text'].tolist()

# Generate embeddings for the texts
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embeddings_model.encode(texts)

# Create AnnoyIndex
emb_dim = len(embeddings[0])
annoy_index = AnnoyIndex(emb_dim, 'angular')  # 'angular' for cosine similarity

# Add embeddings to AnnoyIndex
for i, emb in enumerate(embeddings):
    annoy_index.add_item(i, emb)

# Build the index
annoy_index.build(10)  # 10 trees

# Save the index to disk
annoy_index.save('annoy_index.ann')

# Load the index from disk
annoy_index_loaded = AnnoyIndex(emb_dim, 'angular')
annoy_index_loaded.load('annoy_index.ann')

# Example query
query = "photo"

# Perform initial retrieval
query_emb = embeddings_model.encode([query])[0]
indexes = annoy_index_loaded.get_nns_by_vector(query_emb, 20)  # Get 20 nearest neighbors

# Extract relevant features for reranking
features = []
for idx in indexes:
    emb = embeddings[idx]
    similarity_score = np.dot(emb, query_emb)  # Cosine similarity as a feature
    length_ratio = len(texts[idx]) / len(query)  # Length ratio as a feature
    features.append([similarity_score, length_ratio])

# Normalize the features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Reranking using a RandomForestRegressor (you can use any other model)
regressor = RandomForestRegressor()
regressor.fit(normalized_features, range(len(indexes)))  # Fit the model to predict the ranking

# Predict the ranking
reranked_indexes = regressor.predict(normalized_features)
reranked_indexes = sorted(range(len(reranked_indexes)), key=lambda k: reranked_indexes[k])

# Get the reranked texts
reranked_texts = [texts[indexes[idx]] for idx in reranked_indexes]
print("Reranked texts:", reranked_texts)
# Generate a summary for the top relevant reviews
summarizer = pipeline("summarization", model="t5-small", framework="pt")
summary = summarizer(reranked_texts[:5], max_length=50, min_length=30, do_sample=False)[0]['summary_text']
print("Query:", query)
print("Generated summary:", summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Reranked texts: ['Images are crisp and clean.', 'Theres a built in camera with special effects- for video and photography.', 'The image is great, and the soud is excelent.', 'I was taught to use Photoshop and was amazed.', 'Images can be multi-selected and viewed swiftly or in slideshow mode.', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', 'wonderful features.', 'Amazing Quality!', 'Amazing machine.', 'Great product.', 'Does a great job with video shot on a Canon 5D MKII.', 'PLEASE MAKE THESE!', 'High price tag, however.', 'Where you click and hold and drag it picture, link, etc to where you want it.', 'what an elegant, wonderful machine.', "It's face and depanable."]


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Your max_length is set to 50, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 50, but your input_length is only 19. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 50, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 50, but your input_length is only 13. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_leng

Query: photo
Generated summary: images are crisp and clean, crisp and crisp . images are clean, clean and a bit of a slick image .


Retrieving related reviews by aspects and summarizing them using BERT Transformer

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from transformers import pipeline
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the CSV file
df = pd.read_csv('/content/drive/MyDrive/Aspect_Based_Summarization/output_1000.csv')

# Extract texts from the "Text" column
texts = df['Text'].tolist()

# Generate embeddings for the texts
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embeddings_model.encode(texts)

# Create AnnoyIndex
emb_dim = len(embeddings[0])
annoy_index = AnnoyIndex(emb_dim, 'angular')  # 'angular' for cosine similarity

# Add embeddings to AnnoyIndex
for i, emb in enumerate(embeddings):
    annoy_index.add_item(i, emb)

# Build the index
annoy_index.build(10)  # 10 trees

# Save the index to disk
annoy_index.save('annoy_index.ann')

# Load the index from disk
annoy_index_loaded = AnnoyIndex(emb_dim, 'angular')
annoy_index_loaded.load('annoy_index.ann')

# Example query
query = "time"

# Perform initial retrieval
query_emb = embeddings_model.encode([query])[0]
indexes = annoy_index_loaded.get_nns_by_vector(query_emb, 20)  # Get 20 nearest neighbors

# Extract relevant features for reranking
features = []
for idx in indexes:
    emb = embeddings[idx]
    similarity_score = np.dot(emb, query_emb)  # Cosine similarity as a feature
    length_ratio = len(texts[idx]) / len(query)  # Length ratio as a feature
    features.append([similarity_score, length_ratio])

# Normalize the features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Reranking using a RandomForestRegressor (you can use any other model)
regressor = RandomForestRegressor()
regressor.fit(normalized_features, range(len(indexes)))  # Fit the model to predict the ranking

# Predict the ranking
reranked_indexes = regressor.predict(normalized_features)
reranked_indexes = sorted(range(len(reranked_indexes)), key=lambda k: reranked_indexes[k])

# Get the reranked texts
reranked_texts = [texts[indexes[idx]] for idx in reranked_indexes]
print("Reranked texts:", reranked_texts)
# Generate a summary for the top relevant reviews
summarizer = pipeline("summarization", model="t5-small", framework="pt")
summary = summarizer(reranked_texts[:5], max_length=50, min_length=30, do_sample=False)[0]['summary_text']

print("Generated summary:", summary)

# Load BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')



# Concatenate the reviews into a single string
combined_review = " ".join(reranked_texts)

# Tokenize the combined review
inputs = tokenizer(combined_review, max_length=1024, truncation=True, padding='max_length', return_tensors='pt')

# Generate a summary
output = model.generate(**inputs, max_length=70, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)

# Decode the generated summary
summary = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the combined review and the summary
print("Combined Review:", combined_review)
print("Summary:", summary)
