# In [None]:
import pandas as pd
import os
import re
from transformers import pipeline
import torch

# Root directory; each immediate subdirectory is processed as one group of tweet CSVs
parent_folder_path = '/kaggle/input/dataset/public_dataset/train/tweets/'

# Keep only the directory entries of the root (plain files are ignored)
subfolders = [
    entry
    for entry in os.listdir(parent_folder_path)
    if os.path.isdir(os.path.join(parent_folder_path, entry))
]

# Hugging Face model identifier used for sentiment scoring
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Build the sentiment pipeline once, up front. return_all_scores=True makes it
# return a score for every label rather than only the top one. The device is
# 0 when CUDA is available and -1 otherwise.
sentiment_task = pipeline(
    "sentiment-analysis",
    model=model,
    return_all_scores=True,
    device=-1 if not torch.cuda.is_available() else 0,
)

# Columns that are not needed in the processed output. Only the ones actually
# present in a merged dataframe are dropped.
columns_to_drop = [
    'time', 'timezone', 'retweet', 'quote_url', 'link', 'urls',
    'user_id', 'language', 'mentions', 'photos', 'video',
    'thumbnail', 'place', 'reply_to', 'username', 'name',
    'near', 'cashtags', 'hashtags', 'id', 'conversation_id',
    'created_at', 'geo', 'user_rt_id', 'user_rt',
    'retweet_id', 'retweet_date', 'translate',
    'trans_src', 'trans_dest', 'source'
]

# Engagement counters that get collapsed into a single 'total_sum' column
columns_to_sum = ['replies_count', 'retweets_count', 'likes_count']

# Output sentiment columns, listed in the order the per-label scores are read
# from the pipeline result.
# NOTE(review): assumes the pipeline returns scores in negative / neutral /
# positive order, exactly as the positional indexing always did - confirm
# against the model's id2label mapping.
score_columns = ['negative', 'neutral', 'positive']

# Number of tweets handed to the model per forward pass
sentiment_batch_size = 32

# A tweet is cut at the first '@', '!', '#', '$' or http(s) URL
tweet_cut_pattern = re.compile(r'[@!#\$]|https?://\S+')


def process_tweet(tweet):
    """Return a short, cleaned prefix of *tweet*.

    The text is cut at the first '@', '!', '#', '$' or http(s) URL, and at
    most the first five whitespace-separated words of what remains are kept.

    Args:
        tweet: Raw cell value from the 'tweet' column.

    Returns:
        The shortened text, or '' when *tweet* is not a string (e.g. NaN).
    """
    if not isinstance(tweet, str):
        return ''
    head = tweet_cut_pattern.split(tweet, maxsplit=1)[0]
    return ' '.join(head.split()[:5])


# Process each subfolder independently and write one CSV per subfolder
for subfolder in subfolders:
    folder_path = os.path.join(parent_folder_path, subfolder)

    # Get a list of all CSV files in the folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    # Read every CSV; unreadable files are reported and skipped (best effort)
    df_list = []
    for file in csv_files:
        # Built outside the try so the error message can always reference it
        file_path = os.path.join(folder_path, file)
        try:
            df_list.append(pd.read_csv(file_path))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    # Skip this subfolder if no valid CSV files were found
    if not df_list:
        print(f"No valid CSV files found in {folder_path}.")
        continue

    # ignore_index=True gives the merged frame a fresh 0..n-1 index
    merged_df = pd.concat(df_list, ignore_index=True)
    print(f"Number of rows in the merged dataframe for {subfolder}: {len(merged_df)}")

    # Safely drop columns that exist in the merged dataframe
    merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns], inplace=True)

    # Replace the three engagement counters by their row-wise sum, but only
    # when all of them are present
    if all(col in merged_df.columns for col in columns_to_sum):
        merged_df['total_sum'] = merged_df[columns_to_sum].sum(axis=1)
        merged_df.drop(columns=columns_to_sum, inplace=True)

    # Nothing to score or save without the tweet text
    if 'tweet' not in merged_df.columns:
        print(f"'tweet' column not found in {subfolder}.")
        continue

    # Shorten/clean the tweets and discard the raw text
    merged_df['processed_tweet'] = merged_df['tweet'].apply(process_tweet)
    merged_df.drop(columns=['tweet'], inplace=True)

    # Initialise the score columns as floats: the model scores are floats, and
    # writing them into integer columns relies on silent dtype upcasting,
    # which recent pandas versions deprecate.
    for column in score_columns:
        merged_df[column] = 0.0

    # Rows whose processed text is empty keep a score of 0.0 in every column
    has_text = merged_df['processed_tweet'] != ''
    texts = merged_df.loc[has_text, 'processed_tweet'].tolist()
    if texts:
        # One batched call instead of one pipeline call per row; the result
        # holds one list of per-label score dicts for each input text.
        results = sentiment_task(texts, batch_size=sentiment_batch_size)
        for position, column in enumerate(score_columns):
            merged_df.loc[has_text, column] = [
                per_tweet[position]['score'] for per_tweet in results
            ]

    # Save the result DataFrame to a CSV file for the current subfolder
    output_csv_path = f'/kaggle/working/{subfolder}_processed.csv'
    merged_df.to_csv(output_csv_path, index=False)
    print(f"DataFrame for {subfolder} saved as {subfolder}_processed.csv in the Kaggle working directory.")
