In [21]:
# Necessary libraries

import tweepy
import csv
import time
import random
from collections import Counter
import torch
from transformers import pipeline
import kagglehub
import re
import pandas as pd
import csv
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
!pip install kagglehub
!pip install tweepy



In [23]:
# Download latest version of tweet dataset from kaggle
path = kagglehub.dataset_download("mmmarchetti/tweets-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mmmarchetti/tweets-dataset/versions/1


In [9]:
# -----------------------
# Step 1: Load Unique Categories from CSV
# -----------------------

# Load unique categories from the 'categories' column of the CSV file, excluding empty or null values
unique_categories = set()

with open('books.csv', mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        category = row['categories']
        if category and category.strip():  # Exclude null or empty strings
            unique_categories.add(category)

# Convert the set of unique categories to a list
candidate_labels = list(unique_categories)

# Fallback category to use if no suitable classification is found
fallback_category = "Fiction"

# -----------------------
# Step 2: Set Up Zero-Shot Classifier
# -----------------------

# Initialize the zero-shot classification model with GPU support if available
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    "zero-shot-classification",
    model="typeform/mobilebert-uncased-mnli",
    device=device
)

# -----------------------
# Step 3: Classify Books and Impute Missing Categories
# -----------------------

# List to store updated book data with imputed categories
book_predictions = []

# Process each book from the CSV file and classify missing categories
with open('books.csv', mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        title = row['title']
        category = row['categories']
        description = row['description']

        # If category is missing, predict it using the description
        if not category or category.strip() == "":
            if description and description.strip():  # Check if description is available
                # Step 1: Initial zero-shot classification
                result = classifier(description, candidate_labels)
                predicted_category = result['labels'][0]

                # Step 2: Handle case where the predicted category matches the title
                if predicted_category.lower() == title.lower():
                    alternative_labels = [label for label in candidate_labels if label.lower() != title.lower()]
                    if alternative_labels:  # Ensure alternative labels are available
                        retry_result = classifier(description, alternative_labels)
                        predicted_category = retry_result['labels'][0]

                # Step 3: Final check to avoid using the title as the category
                if predicted_category.lower() == title.lower():
                    for label in result['labels']:
                        if label.lower() != title.lower():
                            predicted_category = label
                            break
                    else:
                        # If no suitable category is found, use the fallback category
                        predicted_category = fallback_category
            else:
                # If no description is available, assign the fallback category
                predicted_category = fallback_category

            # Update the row with the predicted category
            row['categories'] = predicted_category

        # Add the row to the list of updated book data
        book_predictions.append(row)

# -----------------------
# Step 4: Save Updated Book Data to CSV
# -----------------------

# Save the updated book data to a new CSV file with imputed categories
output_file = 'books_with_imputed_categories.csv'
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = reader.fieldnames  # Use the original fieldnames from the CSV
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()  # Write the header row
    writer.writerows(book_predictions)  # Write the updated book data

print(f"Updated book data saved to '{output_file}'.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Updated book data saved to 'books_with_imputed_categories.csv'.


In [24]:
# -----------------------
# Step 5: Preprocess Tweets from CSV
# -----------------------

# Device setup to check for GPU availability
device = 0 if torch.cuda.is_available() else -1

# Initialize the zero-shot classification model with GPU support if available
classifier = pipeline(
    "zero-shot-classification",
    model="typeform/mobilebert-uncased-mnli",
    device=device
)

# -----------------------
# Function: Clean Tweet Content
# -----------------------
def clean_content(text):
    """
    Clean the tweet content by removing URLs, @mentions, emojis, and extra whitespace.
    Args:
        text (str): The raw tweet content.
    Returns:
        str: The cleaned tweet content.
    """
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', str(text))
    # Remove mentions (@userid) with or without spaces
    text = re.sub(r'@\s*\w+', '', text)
    # Remove emojis and special characters except for word characters and spaces
    text = re.sub(r'[^\w\s,]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# -----------------------
# Input and Output Paths
# -----------------------
input_file = os.path.join(path, 'tweets.csv')  # Input CSV file containing the tweets
output_file = os.path.join(path, 'tweets_with_imputed_categories.csv')  # Output CSV file for the cleaned tweets

# -----------------------
# Step 6: Process and Save Cleaned Tweets
# -----------------------

# List to store the cleaned and updated tweet rows
updated_rows = []

# Read the original CSV file and clean the 'content' column
with open(input_file, mode='r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    fieldnames = reader.fieldnames  # Extract the headers from the original CSV file

    for row in reader:
        # Clean the content column for each row
        row['content'] = clean_content(row['content'])
        # Add the cleaned row to the list of updated rows
        updated_rows.append(row)

# Save the cleaned and updated tweet data to a new CSV file
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)  # Use the same headers as the original file
    writer.writeheader()  # Write the header row
    writer.writerows(updated_rows)  # Write the cleaned tweet rows

print(f"Updated dataset saved to '{output_file}'.")


Updated dataset saved to '/root/.cache/kagglehub/datasets/mmmarchetti/tweets-dataset/versions/1/tweets_with_imputed_categories.csv'.


In [25]:
# -----------------------
# Step 7: Load CSV from Directory
# -----------------------

def load_csv_from_directory(directory, filename):
    """
    Loads a CSV file from a specified directory.

    Args:
        directory (str): The directory where the CSV file is located.
        filename (str): The name of the CSV file to load.

    Returns:
        DataFrame: A pandas DataFrame containing the CSV data.
    """

    # Construct the full file path
    file_path = os.path.join(directory, filename)

    # Check if the file exists before attempting to load
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"{filename} not found in directory {directory}.")

    print(f"Loading CSV file from: {file_path}")

    # Read the CSV file into a DataFrame and return it
    return pd.read_csv(file_path)

# -----------------------
# Load the Preprocessed Data
# -----------------------

# Specify the preprocessed file name
preprocessed_file = "tweets_with_imputed_categories.csv"  # Name of the preprocessed CSV file

# Load the preprocessed data into a DataFrame
df = load_csv_from_directory(path, preprocessed_file)

# Display the first few rows of the preprocessed DataFrame to verify the data
print("Displaying the first 5 rows of the preprocessed DataFrame:")
display(df.head())

Loading CSV file from: /root/.cache/kagglehub/datasets/mmmarchetti/tweets-dataset/versions/1/tweets_with_imputed_categories.csv
Displaying the first 5 rows of the preprocessed DataFrame:


Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itselfDONTNORMALIZEHATE,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,Thank you for your incredible grace in leaders...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN IT FOR THEMSELVES,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655


In [26]:
# -----------------------
# Step 8: Process and Combine Author Texts
# -----------------------

# Ensure all values in the 'content' column are strings and handle missing values
df['content'] = df['content'].fillna('').astype(str)

# Combine the 'content' for each unique 'author' into a single text string
author_texts = df.groupby('author')['content'].apply(lambda texts: ' '.join(texts)).reset_index()

# Rename the columns for better clarity
author_texts.columns = ['author', 'combined_text']

# Display the result
print("Displaying the combined text for each author:")
print(author_texts)


Displaying the combined text for each author:
           author                                      combined_text
0    ArianaGrande  my favorite face in the world thanks for guidi...
1     BarackObama  Tonight, President Obama reflects on eight yea...
2       Cristiano  Mood The Best FIFA 2016 award is already avail...
3   KimKardashian  Jet lag Awwww thank you The best time I love i...
4    TheEllenShow  Its the perfect day to snuggle with a kitten a...
5         Twitter  We shall dub thee Meme Master Ashley  Meme of ...
6         YouTube  You must be at least 1 ½ inches tall to ride L...
7   britneyspears  Mad love for this one Excited to introduce my ...
8          cnnbrk  British PM Theresa May commits to put final Br...
9        ddlovato  Cant wait to get back and watch my friend in h...
10      instagram  Fog as far as the eye can see TheWeekOnInstagr...
11    jimmyfallon  Tonight , , music from , and more FallonTonigh...
12    jtimberlake  I have two pockets Branchs face says i

In [27]:
# -----------------------
# Step 9: Load Categories and Descriptions from CSV
# -----------------------

def load_categories_from_csv(file_path='books_with_imputed_categories.csv'):
    """
    Load unique categories from the 'categories' column of the CSV file.

    Args:
        file_path (str): Path to the CSV file containing book data.

    Returns:
        list: A list of unique categories from the 'categories' column.
    """
    unique_categories = set()
    try:
        with open(file_path, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                unique_categories.add(row['categories'])
    except FileNotFoundError:
        print(f"CSV file '{file_path}' not found.")

    return list(unique_categories)


def load_description_from_csv(file_path='books_with_imputed_categories.csv', category=None, tweets=None):
    """
    Load book descriptions from the CSV file and compute the top 5 most similar books to the given tweets.

    Args:
        file_path (str): Path to the CSV file containing book data.
        category (str): The category to filter the books.
        tweets (str): The combined tweet content to compare with book descriptions.

    Returns:
        list: Titles of the top 5 most similar books.
    """
    if not category:
        print("The given category is empty.")
        return []

    # Load the CSV into a DataFrame
    books_df = pd.read_csv(file_path)

    # Filter the books by the given category
    filtered_books = books_df[books_df['categories'] == category]

    # Fill missing descriptions with the book title
    filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])

    # Create a corpus of the tweet plus all book descriptions
    corpus = [tweets] + filtered_books['description'].tolist()

    # Calculate TF-IDF vectors for the corpus
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # Extract the tweet vector and book vectors
    tweet_vector = tfidf_matrix[0]  # The tweet is the first entry
    book_vectors = tfidf_matrix[1:]  # The remaining entries are book descriptions

    # Compute cosine similarities between the tweet and all book descriptions
    similarities = cosine_similarity(tweet_vector, book_vectors).flatten()

    # Get the indices of the top 5 most similar books
    top_5_indices = similarities.argsort()[-5:][::-1]  # Indices of top 5 similar books
    top_5_books = filtered_books.iloc[top_5_indices]['title'].tolist()  # Get book titles
    top_5_scores = similarities[top_5_indices].tolist()  # Get similarity scores

    # Display the top 5 book titles and their similarity scores
    # print(f"Top 5 Scores are: {top_5_scores}\n Top 5 Books: {top_5_books}")

    return top_5_books


In [28]:
# -----------------------
# Step 10: Setup Zero-Shot Classification Model
# -----------------------

def setup_zero_shot_classifier():
    """
    Set up the zero-shot classification model using the Huggingface Transformers pipeline.

    Returns:
        pipeline: A zero-shot classification pipeline that can be used to classify text into categories.
    """
    # Check if GPU is available, otherwise use CPU
    device = 0 if torch.cuda.is_available() else -1

    # Initialize the zero-shot classification model using the Huggingface pipeline
    classifier = pipeline(
        "zero-shot-classification",  # Task type
        model="typeform/mobilebert-uncased-mnli",  # Pre-trained model
        device=device  # Use GPU if available, otherwise use CPU
    )

    return classifier


In [29]:
# -----------------------
# Step 11: Classify Tweets
# -----------------------

def classify_tweets(tweets, candidate_labels, classifier):
    """
    Classify a list of tweets to determine the most common category using zero-shot classification.

    Args:
        tweets (list of str): List of tweet texts to classify.
        candidate_labels (list of str): List of possible categories to classify tweets into.
        classifier: Huggingface zero-shot classification pipeline.

    Returns:
        str: The most common predicted category among the classified tweets.
    """

    # Initialize a counter to keep track of the number of times each category is predicted
    category_counter = Counter()

    # Loop through the first 2 tweets to classify them
    for i, text in enumerate(tweets[:2]):  # Limit to the first 2 tweets
        try:
            print(f"Classifying tweet {i+1}/{len(tweets)}: {text[:50]}...")

            # Perform zero-shot classification
            result = classifier(text, candidate_labels)
            predicted_category = result['labels'][0]  # Get the top predicted category

            # Update the category counter
            category_counter[predicted_category] += 1
        except Exception as e:
            print(f"An error occurred while classifying a tweet: {e}")

    # Determine the most common category from the classified tweets
    most_common_category = category_counter.most_common(1)[0][0] if category_counter else "No Category"

    return most_common_category



In [30]:
# -----------------------
# Step 12: Split Text into Chunks
# -----------------------

def split_text_into_chunks(text, max_length=512):
    """
    Split a large text string into smaller chunks, each with a specified maximum length.

    This function ensures that large texts are broken down into smaller segments,
    which is useful for models with input length limitations (like transformer-based models).

    Args:
        text (str): The large text string to be split into chunks.
        max_length (int): The maximum length of each chunk. Defaults to 512.

    Returns:
        list: A list of text chunks, where each chunk has a maximum length of 'max_length'.
    """

    # Create a list of chunks, where each chunk is a substring of the original text
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

In [31]:
# -----------------------
# Main Function: Classify Authors and Recommend Books
# -----------------------

def main():
    """
    Main function to classify author tweets into categories and recommend books based on classification.

    This function performs the following steps:
      1. Load unique categories from the CSV file.
      2. Set up the zero-shot classification model.
      3. Loop through each author, classify their tweets, and recommend top 5 books.
      4. Save the classification results to a CSV file.
    """

    # -----------------------
    # Step 1: Load Categories from CSV
    # -----------------------
    candidate_labels = load_categories_from_csv('books_with_imputed_categories.csv')
    if not candidate_labels:
        print("No categories found in the CSV file. Exiting...")
        exit()

    # -----------------------
    # Step 2: Set up Zero-Shot Classifier
    # -----------------------
    classifier = setup_zero_shot_classifier()

    # -----------------------
    # Step 3: Initialize Output CSV File
    # -----------------------
    output_results_file = 'author_classification_results.csv'

    # Create the output CSV file if it doesn't exist
    if not os.path.isfile(output_results_file):
        with open(output_results_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Author', 'Top_Category', 'Top_5_Books'])  # CSV Header

    # -----------------------
    # Step 4: Loop Through Each Author to Classify Tweets
    # -----------------------
    with open(output_results_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        for index, row in author_texts.iterrows():
            user = row['author']
            tweets = row['combined_text']

            # Step 5: Split the full text into 512-token chunks
            full_text_chunks = split_text_into_chunks(tweets, max_length=512)

            # Step 6: Classify tweets to get the most common category
            full_text_cat = classify_tweets(full_text_chunks, candidate_labels, classifier)
            print(f"Predicted category for {user}: {full_text_cat}")

            # Step 7: Load top 5 recommended books based on category
            top_5_books = load_description_from_csv('books_with_imputed_categories.csv', full_text_cat, tweets)
            print(f"Top 5 Books for {user}: {top_5_books}")

            # Step 8: Save classification results to the CSV file
            writer.writerow([user, full_text_cat, ', '.join(top_5_books)])

    print(f"Classification results saved to '{output_results_file}'.")

# Run the main function
if __name__ == "__main__":
    main()


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Classifying tweet 1/240: my favorite face in the world thanks for guiding m...
Classifying tweet 2/240: ou didnt love you amp see you soon lmaoooooo truet...
Predicted category for ArianaGrande: Reference
Top 5 Books for ArianaGrande: ["Schott's Original Miscellany", 'The Wicked Wit of Charles Dickens', 'Nietzsche For Beginners', 'Divine Comedy - Purgatorio', 'The John Deere Two-Cylinder Tractor Encyclopedia']
Classifying tweet 1/493: Tonight, President Obama reflects on eight years o...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/493:  your options and lock in the one thats best for y...
Predicted category for BarackObama: Reference
Top 5 Books for BarackObama: ['Nineteen Sixty Eight', 'The John Deere Two-Cylinder Tractor Encyclopedia', "Schott's Original Miscellany", 'Metamorphoses', 'The Wicked Wit of Charles Dickens']
Classifying tweet 1/328: Mood The Best FIFA 2016 award is already available...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/328: e possible without my teammates, coaches and you w...
Predicted category for Cristiano: Babytime resource
Top 5 Books for Cristiano: ['The Very Hungry Caterpillar']
Classifying tweet 1/257: Jet lag Awwww thank you The best time I love it th...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/257: onsoring the class Dubai Love this look airportvib...
Predicted category for KimKardashian: Babytime resource
Top 5 Books for KimKardashian: ['The Very Hungry Caterpillar']
Classifying tweet 1/420: Its the perfect day to snuggle with a kitten and a...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/420: nly need 15K more Did you hear that people who cur...
Predicted category for TheEllenShow: Reference
Top 5 Books for TheEllenShow: ["Schott's Original Miscellany", 'The Wicked Wit of Charles Dickens', 'The John Deere Two-Cylinder Tractor Encyclopedia', 'Divine Comedy - Purgatorio', 'Nietzsche For Beginners']
Classifying tweet 1/311: We shall dub thee Meme Master Ashley  Meme of the ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/311: 10 years of KingsDecade In 2016 we focused on stre...
Predicted category for Twitter: Behavior
Top 5 Books for Twitter: ['The Twits', "Mrs. Piggle-Wiggle's farm"]
Classifying tweet 1/291: You must be at least 1 ½ inches tall to ride Lego ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/291:  YouTubeRed 28 Home isnt where you are, its who yo...
Predicted category for YouTube: Characters and characteristics in motion pictures
Top 5 Books for YouTube: ['Well of Darkness', 'The Lord of the Rings', '20000 LEAGUES UNDER THE SEA', 'Redburn, White-Jacket, Moby-Dick']
Classifying tweet 1/302: Mad love for this one Excited to introduce my new ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/302: ch fun Looking forward to more of this in 2017 Hap...
Predicted category for britneyspears: Babytime resource
Top 5 Books for britneyspears: ['The Very Hungry Caterpillar']
Classifying tweet 1/344: British PM Theresa May commits to put final Brexit...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/344: n in Pakistan has been sentenced to death for burn...
Predicted category for cnnbrk: British
Top 5 Books for cnnbrk: ['The Mimic Men', 'Merde Actually', 'Wide Sargasso Sea', 'Shōgun', 'Rumer Godden']
Classifying tweet 1/252: Cant wait to get back and watch my friend in his n...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/252:  Happy NYE Wanted to give you all a little somethi...
Predicted category for ddlovato: Babytime resource
Top 5 Books for ddlovato: ['The Very Hungry Caterpillar']
Classifying tweet 1/281: Fog as far as the eye can see TheWeekOnInstagram s...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/281:  for lucky fans in NYC Im inspired by my life expe...
Predicted category for instagram: Friendship
Top 5 Books for instagram: ['Where Rainbows End', 'The Little Baby Snoogle-Fleejer', 'The Brotherhood of the Rose', 'P.S. Longer Letter Later']
Classifying tweet 1/468: Tonight , , music from , and more FallonTonight an...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/468: ht Tonight New First Drafts of Rock w, Greta Gerwi...
Predicted category for jimmyfallon: Music
Top 5 Books for jimmyfallon: ["The Beatles' Story on Capitol Records", "A Hard Day's Write, 3e", "Mozart's Don Giovanni", 'Deluxe Encyclopedia of Mandolin Chords', 'The John Adams Reader']
Classifying tweet 1/331: I have two pockets Branchs face says it all Happy ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/331:  a day until Feb 24th VOTE I AM unemployed GrindCi...
Predicted category for jtimberlake: Social action
Top 5 Books for jtimberlake: ['In Watermelon Sugar']
Classifying tweet 1/131: My I love you face 10yearsofkidrauhl where it all ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/131:  on sale now Dublin on sale too PurposeTour UK Bri...
Predicted category for justinbieber: Humorous stories
Top 5 Books for justinbieber: ['Stiff Upper Lip, Jeeves', 'A Sudden Wild Magic']
Classifying tweet 1/339: Is history repeating itselfDONTNORMALIZEHATE Thank...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/339:  shopping is woke af blklivesmatter Have you been ...
Predicted category for katyperry: History
Top 5 Books for katyperry: ["Terry Jones' Medieval Lives", 'Laguna, I Love You', 'We Wish to Inform You That Tomorrow We Will Be Killed with Our Families', 'The Berlin Phenomenology', 'New Worlds, Ancient Texts']
Classifying tweet 1/349: WomensMarchOnWashington Im so proud to be a woman ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/349: e Monsters, congrats on your iHeartAwards nominati...
Predicted category for ladygaga: Spiritual life
Top 5 Books for ladygaga: ['Insights']
Classifying tweet 1/265: My new fragrance RiRiKiss is available now, look f...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/265:  to get tested amp know yo Election2016 ElectionDa...
Predicted category for rihanna: Selling
Top 5 Books for rihanna: ['The Greatest Salesman in the World']
Classifying tweet 1/319: I love you, Christina Such a memorable show in Syd...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/319:  the love I love you guys, God bless This industry...
Predicted category for selenagomez: Love
Top 5 Books for selenagomez: ['Getting the Love You Want']
Classifying tweet 1/411: With fellow SDG advocate at the world economic for...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/411:  ShakHQ El video de Chantaje con es el tercer vide...
Predicted category for shakira: Babytime resource
Top 5 Books for shakira: ['The Very Hungry Caterpillar']
Classifying tweet 1/271: LiveByNight is out now Check out the guy in the ba...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


Classifying tweet 2/271: oment when Kanye West secretly records your phone ...
Predicted category for taylorswift13: Babytime resource
Top 5 Books for taylorswift13: ['The Very Hungry Caterpillar']
Classification results saved to 'author_classification_results.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_books['description'] = filtered_books['description'].fillna(filtered_books['title'])


In [None]:
##----------------------------- below is for twitter api -------------------------------------------------------------------

In [None]:
# this is code for when we use twitter api

"""
# Step 1: Load categories from the uploaded CSV
candidate_labels = load_categories_from_csv('books_with_imputed_categories.csv')
if not candidate_labels:
    print("No categories found in the CSV file. Exiting...")
    exit()

# Step 2: Set up zero-shot classifier
classifier = setup_zero_shot_classifier()

# Step 3: Fetch recent tweets for Taylor Swift
user = 'BillGates'
print(f"Fetching 30 tweets for {user}...")
tweets = get_recent_tweets(user, tweet_count=30)
if not tweets:
    print("No tweets retrieved. Exiting...")
    exit()
# Try with all tweets concatenated to one big text
full_text = ''.join(tweets)

# Split the full text into chunks of 512 characters
full_text_chunks = split_text_into_chunks(full_text, max_length=512)

# Process each chunk individually
# full_text_cat = []
full_text_cat =  classify_tweets(full_text_chunks, candidate_labels, classifier)

# full_text_cat = classify_tweets(full_text, candidate_labels, classifier)
# Step 4: Classify tweets and determine the most common category
print("Classifying tweets into book categories...")
most_common_category = classify_tweets(tweets, candidate_labels, classifier)

# Step 5: Save the predicted category to a CSV file and download it
save_results_to_csv(user, most_common_category)
"""

In [None]:
# Twitter API Function

"""
def get_recent_tweets(username, tweet_count=30, max_retries=5):
    client = tweepy.Client(bearer_token=BEARER_TOKEN)
    for attempt in range(max_retries):
        try:
            response = client.get_users_tweets(
                id=client.get_user(username=username).data.id,
                max_results=tweet_count,
                exclude=['retweets', 'replies']
            )
            if not response.data:
                print(f"No tweets found for {username}.")
                return []

            tweets = [tweet.text for tweet in response.data]
            return tweets

        except tweepy.TooManyRequests as e:
            wait_time = (2 ** attempt) + random.uniform(0, 1)  # Exponential backoff
            print(f"429 Too Many Requests. Waiting {wait_time:.2f} seconds before retrying...")
            time.sleep(wait_time)
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    print(f"Failed to fetch tweets for {username} after {max_retries} retries.")
    return []

"""

In [None]:
# Save results to CSV file

"""
def save_results_to_csv(user, predicted_category, output_file='user_predicted_categories.csv'):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['User', 'Predicted_Category'])  # Column headers
        writer.writerow([user, predicted_category])
    print(f"Final predicted categories saved to '{output_file}'.")
    files.download(output_file)  # Download the CSV file to your system
"""


In [None]:
# Save Tweets to JSON File

"""
def save_tweets_to_json(tweets, output_file='tweets.json'):

    # Save the list of tweets to a JSON file.

    # Args:
    # - tweets (list of dict): List of tweet data (text, ID, timestamp, etc.).
    # - output_file (str): Name of the JSON file to save.

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(tweets, file, ensure_ascii=False, indent=4)
    print(f"Tweets have been saved to '{output_file}'.")
    files.download(output_file)  # Download the JSON file to your system

# Step 4: Save tweets to a JSON file
save_tweets_to_json(tweets, 'tweets.json')
"""