In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews.csv
/kaggle/input/book-review-samples/Dune_Book_1_reviews(4).csv


In [2]:
# import relevant transformer libraries
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

2024-08-19 12:39:14.307057: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-19 12:39:14.307219: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-19 12:39:14.504000: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def setup_model(model_name="facebook/bart-large-mnli", device=None):
    """Build a Hugging Face zero-shot classification pipeline.

    Args:
        model_name: Identifier of the sequence-classification model (and
            matching tokenizer) to load. Defaults to
            "facebook/bart-large-mnli", the model this notebook was
            originally hard-wired to.
        device: Forwarded unchanged to ``pipeline``. ``None`` (the default)
            leaves device placement to the library, which is what happened
            before this parameter existed.

    Returns:
        The "zero-shot-classification" pipeline, callable as
        ``classifier(text, candidate_labels)``.
    """
    # Load tokenizer and model explicitly so both come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    return pipeline(
        "zero-shot-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

def analyze_review(review_text, classifier, candidate_labels=None):
    """Extract a representative line from a review and classify its sentiment.

    Args:
        review_text: Raw review text. Anything that is not a non-blank string
            is treated as missing.
        classifier: Callable invoked as ``classifier(text, labels)`` that
            returns a mapping whose ``'labels'`` entry is ordered from most
            to least likely (the shape produced by ``setup_model``'s pipeline).
        candidate_labels: Optional list of labels to choose between. Defaults
            to ["very bad", "bad", "average", "good", "great"].

    Returns:
        A ``(key_line, rating)`` tuple. ``key_line`` is the first non-empty
        period-terminated segment of the text (with the period restored), or
        the whole stripped text when no such segment exists. ``rating`` is the
        top label from the classifier. For missing text the tuple is
        ``("No valid review text", "unknown")`` and the classifier is not
        called.
    """
    # Reject non-strings and blank strings before doing any model work.
    if not isinstance(review_text, str) or not review_text.strip():
        return "No valid review text", "unknown"

    if candidate_labels is None:
        candidate_labels = ["very bad", "bad", "average", "good", "great"]

    # Every segment except the last was followed by a '.' in the original
    # text. Skip empty segments so text that opens with "..." or "." does not
    # produce a key line consisting of a lone period.
    terminated_segments = review_text.split('.')[:-1]
    first_sentence = next(
        (segment.strip() for segment in terminated_segments if segment.strip()),
        None,
    )
    if first_sentence is not None:
        key_line = first_sentence + '.'
    else:
        # No usable period-terminated sentence: fall back to the full text.
        key_line = review_text.strip()

    # Classify the complete review, not just the key line.
    result = classifier(review_text, candidate_labels)
    rating = result['labels'][0]  # highest-probability label comes first

    return key_line, rating

def process_reviews(df, classifier=None):
    """Run sentiment analysis over every review in a DataFrame.

    Args:
        df: DataFrame with 'review_text', 'review_website' and 'review_date'
            columns; one row per review.
        classifier: Optional ready-made classifier, callable as
            ``classifier(text, labels)``. When omitted, one is built with
            ``setup_model()``, as before. Passing one in avoids reloading the
            model on repeated calls.

    Returns:
        A new DataFrame with columns 'review_website', 'review_date',
        'key_line' and 'rating', one row per input row. The columns are
        present even when ``df`` is empty.
    """
    if classifier is None:
        classifier = setup_model()

    output_columns = ['review_website', 'review_date', 'key_line', 'rating']
    records = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # analyze_review itself returns ("No valid review text", "unknown")
        # for NaN, non-string or blank text, so no separate check is needed.
        key_line, rating = analyze_review(row['review_text'], classifier)

        records.append({
            'review_website': row['review_website'],
            'review_date': row['review_date'],
            'key_line': key_line,
            'rating': rating
        })

    # Build the frame once from the collected records; fixing the columns
    # keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=output_columns)

csv_path = '/kaggle/input/book-review-samples/Darth_Plagueis_reviews(4).csv'

# Load the scraped reviews
df = pd.read_csv(csv_path)

# Replace missing review text with an empty string, then make sure every
# value is a str. Filling first avoids relying on how astype(str) renders
# missing values (the old code expected the literal string 'nan').
df['review_text'] = df['review_text'].fillna('').astype(str)

# Process the reviews
results_df = process_reviews(df)

# Save the results
results_df.to_csv('analyzed_reviews.csv', index=False)
print("Analysis complete. Results saved to 'analyzed_reviews.csv'")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

100%|██████████| 10/10 [03:34<00:00, 21.47s/it]

Analysis complete. Results saved to 'analyzed_reviews.csv'





In [5]:
# Display the frame returned by process_reviews (website, date, key line, rating).
results_df

Unnamed: 0,review_website,review_date,key_line,rating
0,https://unseenlibrary.com/2022/04/15/throwback...,Unknown,Throwback Thursday: Star Wars: Darth Plagueis ...,great
1,https://www.ign.com/articles/2012/02/03/star-w...,Unknown,Star Wars: Darth PlagueisStar Wars: Darth Plag...,good
2,https://www.fanthatracks.com/reviews/literatur...,Unknown,I’ve seen some fans criticise Luceno’s books a...,good
3,https://jeffreydebris.com/2020/02/15/review-da...,Unknown,The performance by Daniel Davis was superb too...,great
4,https://booksatruestory.com/2013/03/27/book-re...,Unknown,Book Review: Darth Plagueis by James Luceno M...,good
5,https://thegeeksattic.com/darthplagueis/,Unknown,"Luceno’s word structure, or prose, is incredible.",great
6,https://starwars.fandom.com/f/p/44000000000037...,Unknown,I loved reading about the relationship between...,great
7,https://starwarsreport.com/2012/01/10/book-rev...,Unknown,Book Review: Darth Plagueis Posted on January...,great
8,https://greatbooksguy.com/2023/06/17/book-revi...,Unknown,"James Luceno’s 2012 novel Darth Plagueis, one ...",good
9,https://swprotocol.wordpress.com/2015/05/25/bo...,Unknown,"Posted on May 25, 2015June 3, 2015 by DougBook...",good
