In [None]:
# Cleaning script for YouTube comments
import pandas as pd
import re

def clean_comment(comment):
    """Strip URLs, emoticons, punctuation/digits and very short words from a comment.

    Args:
        comment: Raw comment value. Missing values (None/NaN) yield ''. Any other
            non-string value (e.g. a number parsed by pandas from a mixed-type
            column) is converted with ``str`` before cleaning instead of raising
            a ``TypeError`` inside ``re.sub``.

    Returns:
        str: The cleaned text. It contains only ASCII letters, the accented
        Spanish letters listed in the pattern below, and single spaces; it is
        '' when nothing survives the cleaning.
    """
    if not isinstance(comment, str):
        if pd.isnull(comment):  # NaN / None -> empty string
            return ''
        comment = str(comment)

    # Remove URLs (anything starting with http://, https:// or www.)
    comment = re.sub(r'(http[s]?://\S+|www\.\S+)', '', comment)
    # Remove common emoticons such as :) ;-) =D :-D.
    # A trailing "D" only counts as an emoticon mouth when it is not the first
    # letter of a word; otherwise "Nota:Dios" would lose its "D".
    comment = re.sub(r'[:;=][-~]?(?:\)|D(?!\w))', '', comment)
    # Drop everything that is not a letter (ASCII or Spanish accented) or whitespace;
    # this removes digits and punctuation.
    comment = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑüÜ\s]', '', comment)
    # Remove words of one or two letters (e.g. "a", "la", "de")
    comment = re.sub(r'\b\w{1,2}\b', '', comment)
    # Collapse runs of whitespace into single spaces and trim the ends
    comment = re.sub(r'\s+', ' ', comment).strip()
    return comment

def clean_comments(csv_path, output_path='../data/cleaned_comments.csv'):
    """Load comments from a CSV, clean their text and drop the empty ones.

    A ``cleaned_comment`` column is added by applying ``clean_comment`` to the
    ``text`` column. Rows whose cleaned text is empty are removed, together with
    replies whose ``reply_to_comment_id`` refers to one of those empty comments.
    Row counts for each step are printed.

    Args:
        csv_path: Path of the CSV to read. It must provide the columns ``text``,
            ``comment_id``, ``is_reply`` and ``reply_to_comment_id``.
        output_path: Destination CSV for the filtered rows (written without the
            index). Pass ``None`` to skip writing to disk.

    Returns:
        pandas.DataFrame: The filtered rows, keeping their original index labels.
    """
    # Load the comment data
    df = pd.read_csv(csv_path)

    # Clean the comments
    df['cleaned_comment'] = df['text'].apply(clean_comment)

    # Remove rows where the comment is empty after cleaning.
    # Replies to empty comments are also removed.
    print("Cantidad de mensajes iniciales: ", df.shape[0])
    mask_empty = (df['cleaned_comment'].str.strip() == '')
    print("Comentarios vacíos: ", mask_empty.sum())
    empty_comment_ids = df.loc[mask_empty, 'comment_id']
    mask_replies = df['is_reply'].eq(True) & df['reply_to_comment_id'].isin(empty_comment_ids)
    print("Replies a comentarios vacíos: ", mask_replies.sum())
    # A row can be both empty and a reply to an empty comment, so the union
    # may be smaller than the sum of the two counts printed above.
    mask_to_remove = mask_empty | mask_replies
    df = df[~mask_to_remove]
    print("Cantidad de mensajes finales: ", df.shape[0])

    # Save the cleaned results and only claim they were saved when they really were.
    if output_path is None:
        print("Los comentarios han sido limpiados (no se guardaron en disco).")
    else:
        df.to_csv(output_path, index=False)
        print(f"Los comentarios han sido limpiados y guardados en '{output_path}'.")
    return df

# Run the cleaning step on ../data/comments.csv and keep the filtered DataFrame.
cleaned_comments = clean_comments('../data/comments.csv')

Cantidad de mensajes iniciales:  73147
Comentarios vacíos:  1252
Replies a comentarios vacíos:  26
Cantidad de mensajes finales:  71870
Los comentarios han sido limpiados y guardados en '../data/cleaned_comments.csv'.


In [None]:
# Statistics on the number of comments per user and the number of videos they have commented on
def group_comments_by_user(csv_path):
    """Summarize, per author, how many comments they wrote and on how many videos.

    Args:
        csv_path: Path of a CSV that has ``author_name`` and ``video_id`` columns.

    Returns:
        pandas.DataFrame: One row per ``author_name`` with the columns
        ``author_name``, ``comment_count`` and ``video_count``, ordered by
        ``comment_count`` from highest to lowest.
    """
    comments = pd.read_csv(csv_path)

    # Number of comments written under each author name
    comments_per_author = (
        comments['author_name']
        .value_counts()
        .rename_axis('author_name')
        .reset_index(name='comment_count')
    )

    # Number of distinct videos each author name appears on
    videos_per_author = (
        comments.groupby('author_name')['video_id']
        .nunique()
        .rename('video_count')
        .reset_index()
    )

    # Join both summaries on the author name and rank by activity
    summary = comments_per_author.merge(videos_per_author, on='author_name')
    return summary.sort_values(by='comment_count', ascending=False)

def save_grouped_comments(user_data, output_path='../data/user_comment_video_counts.csv'):
    """Write the per-user summary to ``output_path`` as CSV and print a confirmation.

    Args:
        user_data: Table to persist; its ``to_csv`` method is called with
            ``index=False``, so the index is not written.
        output_path: Destination file path.
    """
    user_data.to_csv(output_path, index=False)
    confirmation = f"Datos de comentarios y videos por usuario guardados en: {output_path}"
    print(confirmation)

# Build the per-user summary from the cleaned comments CSV (the file must already
# exist at this path), then write it to save_grouped_comments' default output path.
user_data = group_comments_by_user("../data/cleaned_comments.csv")
save_grouped_comments(user_data)

Datos de comentarios y videos por usuario guardados en: ../data/user_comment_video_counts.csv
