In [None]:
import json
import pickle
import pandas as pd
import numpy as np
import langid
import re
import emoji
import seaborn as sns
import matplotlib.pyplot as plt
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()
import nltk
nltk.download('stopwords')
nltk.download('punkt')

**Importing the YouTube dataset, removing duplicates and missing values**

In [None]:
# Load the US YouTube trending dataset from the CSV next to this notebook,
# then show its (rows, columns) shape and the first five rows as a sanity check.
df_youtube_dataset = pd.read_csv("./US_youtube_trending_data.csv")
print(df_youtube_dataset.shape)
df_youtube_dataset.head()

In [None]:
# Count the missing entries in every column of the raw YouTube dataframe
missing_values = df_youtube_dataset.isna().sum()
print(missing_values)


In [None]:
# Collect rows that are exact repeats of an earlier row (all columns equal)
is_repeated_row = df_youtube_dataset.duplicated()
duplicate_rows = df_youtube_dataset.loc[is_repeated_row]
print("Duplicate Rows:")
print(duplicate_rows)

In [None]:
# Drop exact duplicate rows first, then drop every row that still has a missing
# value. (Previously only dropna() was called, so the duplicates reported by
# the cell above were never actually removed.)
df_youtube_cleaned = df_youtube_dataset.drop_duplicates().dropna()
df_youtube_cleaned.shape

**Create new dataframe consisting of only columns of interest**

In [None]:
# Preview the first five rows of the cleaned dataframe to see which columns are available
df_youtube_cleaned.head()

In [None]:
# Keep only the columns used in the analysis: identifiers, title, engagement
# counters and the numeric category id.
relevant_columns_youtube = ['video_id', 'title', 'view_count', 'likes', 'categoryId', 'dislikes', 'comment_count']
# Take an explicit copy: later cells add and drop columns on this frame, and
# doing that on a selection of another dataframe can raise SettingWithCopyWarning.
df_youtube_filtered = df_youtube_cleaned.loc[:, relevant_columns_youtube].copy()

In [None]:
# Preview first five rows of filtered youtube dataset
df_youtube_filtered.head()

In [None]:
# Retrieve categories from 'US_category_id.json'
from IPython.display import display

# The context manager closes the file even if json.load() raises on malformed input
with open('./US_category_id.json', encoding="utf8") as file:
    category_data_youtube = json.load(file)

display(category_data_youtube)

In [None]:
# Build {category id (int): category title} from the 'items' list of the JSON.
# The ids are stored as strings in the file, so they are converted to integers here.
category_mapping = {
    int(item['id']): item['snippet']['title']
    for item in category_data_youtube['items']
}

print(category_mapping)

In [None]:
# Create new column 'category_name' by mapping category IDs to titles.
# Series.map leaves NaN for any id that has no entry in category_mapping;
# the missing-value check a few cells below reports such columns.
df_youtube_filtered['category_name'] = df_youtube_filtered['categoryId'].map(category_mapping)

In [None]:
# Remove the numeric 'categoryId' column now that 'category_name' replaces it.
# Rebinding the name (instead of inplace=True) with errors='ignore' keeps the
# cell safe to re-run: a second run no longer raises KeyError.
df_youtube_filtered = df_youtube_filtered.drop(columns=['categoryId'], errors='ignore')

In [None]:
# Flag, per column, whether at least one value is missing
missing_values_per_column = df_youtube_filtered.isna().any()

# Names of the flagged columns, in dataframe column order
columns_with_missing_values = [
    column for column, has_missing in missing_values_per_column.items() if has_missing
]

# Report the affected columns, or confirm that nothing is missing
if not columns_with_missing_values:
    print("No missing values")
else:
    print("Columns with missing values:", columns_with_missing_values)


**Visualising the data**

In [None]:
# Horizontal count plot of videos per category, most frequent category first
plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=df_youtube_filtered,
    y='category_name',
    order=df_youtube_filtered['category_name'].value_counts().index,
)
ax.set_title('Distribution of Videos by Category')
plt.show()

In [None]:
# Average engagement per video for every category, shown as grouped bars
engagement_metrics = ['view_count', 'likes', 'dislikes', 'comment_count']

# Per-category totals of each metric and per-category number of videos
category_engagement = df_youtube_filtered.groupby('category_name')[engagement_metrics].sum()
category_video_count = df_youtube_filtered['category_name'].value_counts()

# Totals divided by video counts (aligned on category name) give the averages
category_avg_engagement = category_engagement.div(category_video_count, axis=0)

# Order categories by view_count first, later metrics only break ties
sorted_category_avg_engagement = category_avg_engagement.sort_values(
    by=engagement_metrics, ascending=False
)

ax = sorted_category_avg_engagement.plot(kind='bar', figsize=(12, 6))
ax.set_title('Average User Engagement per Video by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average User Engagement')
plt.xticks(rotation=45, ha='right')
ax.legend(engagement_metrics)
plt.show()

**Removing rows with non-English titles from YouTube dataset**

In [None]:
# Function to detect language of text using langid
def detect_language(text):
    """Return the language code that langid assigns to `text`.

    langid.classify returns a (language, score) pair; only the language
    element is kept and the score is discarded.
    """
    return langid.classify(text)[0]

In [None]:
# Apply language detection to the 'title' column.
# The detected code is stored in a new 'language' column, one classification per row.
df_youtube_filtered['language'] = df_youtube_filtered['title'].apply(detect_language)

In [None]:
# Preview the first rows to check the newly added 'language' column
df_youtube_filtered.head()

In [None]:
# Keep only the rows whose title was classified as English ('en')
is_english_title = df_youtube_filtered['language'].eq('en')
df_youtube_filtered = df_youtube_filtered.loc[is_english_title]

In [None]:
# Re-draw the per-category video counts to see the effect of the English-only filter
plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=df_youtube_filtered,
    y='category_name',
    order=df_youtube_filtered['category_name'].value_counts().index,
)
ax.set_title('Distribution of Videos by Category')
plt.show()

In [None]:
# Re-compute average engagement per video by category after the English-only filter
engagement_metrics = ['view_count', 'likes', 'dislikes', 'comment_count']

# Per-category totals of each metric and per-category number of videos
category_engagement = df_youtube_filtered.groupby('category_name')[engagement_metrics].sum()
category_video_count = df_youtube_filtered['category_name'].value_counts()

# Totals divided by video counts (aligned on category name) give the averages
category_avg_engagement = category_engagement.div(category_video_count, axis=0)

# Order categories by view_count first, later metrics only break ties
sorted_category_avg_engagement = category_avg_engagement.sort_values(
    by=engagement_metrics, ascending=False
)

ax = sorted_category_avg_engagement.plot(kind='bar', figsize=(12, 6))
ax.set_title('Average User Engagement per Video by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average User Engagement')
plt.xticks(rotation=45, ha='right')
ax.legend(engagement_metrics)
plt.show()

**Sentiment analysis with VADER**

In [None]:
#Preprocessing function

# Holds the English stopword set once it has been loaded, so it is not
# re-read from the NLTK corpus for every single row passed to clean_text.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """Normalise a piece of text for word-level analysis.

    Steps, in order: lowercase, strip hashtags, strip punctuation, strip
    digits, tokenize with NLTK, drop English stopwords, re-join with spaces.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a string (e.g. NaN or None from a
        missing cell) is treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values reach .apply() as float NaN / None; return an empty
    # string instead of failing on .lower()
    if not isinstance(text, str):
        return ''

    # Make text lowercase
    text = text.lower()

    # Remove hashtags (the '#' together with the word attached to it)
    text = re.sub(r'#\w+', '', text)

    # Remove punctuation: everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize and drop stopwords
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Join the tokens back into a preprocessed text
    return ' '.join(tokens)

In [None]:
# Function to calculate sentiment
def calculate_sentiment(text):
    """Classify `text` as 'positive', 'negative' or 'neutral' with VADER.

    Uses the notebook-level `vader` analyzer created in the imports cell
    instead of constructing a new SentimentIntensityAnalyzer for every row.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    str
        'positive' if the compound score is >= 0.05, 'negative' if it is
        <= -0.05, otherwise 'neutral'.
    """
    compound_score = vader.polarity_scores(text)['compound']

    # Assign sentiment labels based on the compound score
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'

In [None]:
# Create copy of dataframe before using vader, so the sentiment columns added
# below do not modify df_youtube_filtered
df_youtube_vader = df_youtube_filtered.copy()

In [None]:
# Label every video as positive / negative / neutral by running
# calculate_sentiment on its raw title (no clean_text preprocessing is applied here)
df_youtube_vader['sentiment'] = df_youtube_vader['title'].apply(calculate_sentiment)

In [None]:
# Preview the first rows to check the newly added 'sentiment' column
df_youtube_vader.head()

In [None]:
# Group the dataframe by sentiment and category_name
grouped = df_youtube_vader.groupby(['sentiment', 'category_name'])

# Function to select the most-viewed video(s) of a group and extract title, category and sentiment
def top_five_info(group, n=1):
    """Return title, category and sentiment of the `n` most-viewed videos in `group`.

    NOTE(review): the function name and the original comments say "top five",
    but the code has always selected a single video per group. The default
    n=1 keeps that behaviour; pass n=5 if five per group was intended.
    """
    top_videos = group.nlargest(n, 'view_count')
    return top_videos[['title', 'category_name', 'sentiment']]

# Apply the function to each group and reset the index
top_videos_info = grouped.apply(top_five_info).reset_index(drop=True)

# Display the resulting dataframe with titles, categories, and sentiments
# (the closing parenthesis was missing, which made this cell a SyntaxError)
print(top_videos_info)

In [None]:
# Print sentiment distribution: number of YouTube videos per sentiment label,
# most frequent label first (value_counts sorts by count)
sentiment_counts = df_youtube_vader['sentiment'].value_counts()

print(sentiment_counts)

In [None]:
# Table of video counts: one row per category, one column per sentiment label
# (combinations that never occur are filled with 0)
sentiment_distribution = (
    df_youtube_vader
    .groupby(['category_name', 'sentiment'])
    .size()
    .unstack(fill_value=0)
)

# Stacked bars: total bar height is the number of videos in the category
ax = sentiment_distribution.plot(kind='bar', stacked=True, figsize=(10, 6))
ax.set_title('Distribution of Sentiments by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Videos')
plt.xticks(rotation=45, ha='right')
ax.legend(title='Sentiment')
plt.show()

In [None]:
# Table of video counts: one row per category, one column per sentiment label
sentiment_distribution = (
    df_youtube_vader
    .groupby(['category_name', 'sentiment'])
    .size()
    .unstack(fill_value=0)
)

# Row totals = number of videos in each category
total_videos_per_category = sentiment_distribution.sum(axis=1)

# Share of each sentiment within its category (every row sums to 1)
proportions = sentiment_distribution.div(total_videos_per_category, axis=0)

# Categories ranked by their share of positive titles
positive_proportions = proportions['positive'].sort_values(ascending=False)
ax = positive_proportions.plot(kind='bar', figsize=(10, 6))
ax.set_title('Proportion of Positive Sentiments by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Proportion of Videos')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Categories ranked by their share of negative titles
# (`proportions` is the per-category share table computed in the previous cell)
negative_proportions = proportions['negative'].sort_values(ascending=False)
ax = negative_proportions.plot(kind='bar', figsize=(10, 6))
ax.set_title('Proportion of Negative Sentiments by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Proportion of Videos')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Categories ranked by their share of neutral titles
# (`proportions` is the per-category share table computed two cells above)
neutral_proportions = proportions['neutral'].sort_values(ascending=False)
ax = neutral_proportions.plot(kind='bar', figsize=(10, 6))
ax.set_title('Proportion of Neutral Sentiments by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Proportion of Videos')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Calculate sentiment distribution for each category
# (rows: category_name, columns: sentiment label, values: number of videos)
sentiment_distribution = df_youtube_vader.groupby(['category_name', 'sentiment']).size().unstack(fill_value=0)

# Calculate the total number of videos in each category
total_videos_per_category = sentiment_distribution.sum(axis=1)

# Calculate the proportion of negative sentiments relative to total videos
negative_proportions = sentiment_distribution['negative'] / total_videos_per_category

# Sort the negative proportions in descending order
sorted_negative_proportions = negative_proportions.sort_values(ascending=False)

# Create a pie chart
# NOTE(review): the wedges are per-category negative shares, which do not sum to
# a whole on their own; the percentages drawn by autopct are each share relative
# to the sum of all shares, not the share of negative videos in that category.
# Confirm this is the intended reading, or prefer the bar chart above.
plt.figure(figsize=(8, 8))
plt.pie(sorted_negative_proportions, labels=sorted_negative_proportions.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired.colors)
plt.title('Proportion of Negative Sentiments by Category')
plt.show()


In [None]:
# Calculate average user engagement per category by sentiment.
# The YouTube metric list is declared here (as in the other engagement cells)
# because the TikTok section later rebinds `engagement_metrics` to different
# column names; without this, re-running the cell after that point fails.
engagement_metrics = ['view_count', 'likes', 'dislikes', 'comment_count']

# Total engagement of a video = sum of all four metrics for that row
df_youtube_vader['engagement'] = df_youtube_vader[engagement_metrics].sum(axis=1)

# Group by category and sentiment to get average engagement
# (category/sentiment combinations with no videos are shown as 0)
average_engagement_by_sentiment = df_youtube_vader.groupby(['category_name', 'sentiment'])['engagement'].mean().unstack(fill_value=0)

# Plot the bar graph
average_engagement_by_sentiment.plot(kind='bar', figsize=(12, 6))
plt.title('Average User Engagement per Category by Sentiment')
plt.xlabel('Category')
plt.ylabel('Average User Engagement')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Sentiment')
plt.show()

In [None]:
# Mean of the per-category averages, one value per sentiment label
average_engagement_across_sentiments = average_engagement_by_sentiment.mean()

# Sentiment labels ordered from highest to lowest average engagement
sorted_sentiments = average_engagement_across_sentiments.sort_values(ascending=False).index

# Bar chart of the averages in that order
plt.figure(figsize=(10, 6))
bar_heights = average_engagement_across_sentiments[sorted_sentiments]
plt.bar(sorted_sentiments, bar_heights)
plt.title('Average Engagement Rate Across Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Average Engagement Rate')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Sum of the three per-sentiment averages, used as the 100% reference
total_average_engagement = average_engagement_across_sentiments.sum()

# Each sentiment's average engagement expressed as a percentage of that sum
percentage_engagement = (average_engagement_across_sentiments / total_average_engagement) * 100

# Pie chart with one wedge per sentiment label
plt.figure(figsize=(8, 8))
plt.pie(
    percentage_engagement,
    labels=percentage_engagement.index,
    autopct='%.1f%%',
    startangle=140,
)
plt.title('Distribution of Average Engagement Rate Across Sentiments')
plt.show()

In [None]:
# Total likes, dislikes and comments received by the videos of each sentiment label
sentiment_metrics = ['likes', 'dislikes', 'comment_count']
sentiment_engagement = df_youtube_vader.groupby('sentiment')[sentiment_metrics].sum()

# Stacked bars: one bar per sentiment, one segment per metric
ax = sentiment_engagement.plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Sentiment vs User Engagement')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Total User Engagement')
plt.xticks(rotation=0)
ax.legend(title='Engagement Metrics')
plt.show()


**Importing the TikTok dataset**

In [None]:
# Load tiktok dataset.
# The context manager closes the file even if json.load() raises on malformed input.
with open('./trending.json', encoding="utf8") as file:
    raw_data = json.load(file)

# Select list with video data (stored under the 'collector' key)
trending_videos_list = raw_data['collector']


In [None]:
# Create DataFrame from the list of video records taken from the 'collector' key
df_tiktok_dataset = pd.DataFrame(trending_videos_list)

# Preview the first five rows to see which fields are available
df_tiktok_dataset.head()


In [None]:
# Keep relevant columns: video id, caption text and the four engagement counters
relevant_columns_tiktok = ['id', 'text', 'diggCount', 'playCount', 'shareCount', 'commentCount']
# Take an explicit copy: later cells drop rows and add columns on this frame, and
# doing that on a selection of another dataframe can raise SettingWithCopyWarning.
df_tiktok_filtered = df_tiktok_dataset.loc[:, relevant_columns_tiktok].copy()
df_tiktok_filtered.head()

**Inspecting dataset**

In [None]:
# Check for duplicates in filtered tiktok dataset: describe() on the 'text'
# column reports count, unique, top and freq, so count > unique means that
# some caption appears more than once
df_tiktok_filtered['text'].describe()


In [None]:
# Drop rows that are exact duplicates across ALL kept columns.
# Rebinding the name instead of using inplace=True avoids mutating a derived
# frame in place.
# NOTE(review): the original comment said duplicates in the 'text' column are
# dropped, but no subset was ever passed, so videos that share a caption while
# differing in id or counters are kept. Use drop_duplicates(subset=['text'])
# if caption-level de-duplication is what was intended.
df_tiktok_filtered = df_tiktok_filtered.drop_duplicates()


In [None]:
# Preview the first twenty rows of the de-duplicated TikTok dataframe
df_tiktok_filtered.head(20)

**Remove rows with non-English text in the TikTok dataset**

In [None]:
# Classify the language of every caption and store the code in a 'language' column
df_tiktok_filtered['language'] = df_tiktok_filtered['text'].apply(detect_language)

# Show the captions that were NOT classified as English, for manual inspection
is_not_english = df_tiktok_filtered['language'].ne('en')
non_english_rows_tiktok = df_tiktok_filtered.loc[is_not_english]
print(non_english_rows_tiktok[['id', 'text', 'language']])



In [None]:
# Filter out non-English rows.
# .copy() makes the result an independent frame: a later cell adds a
# 'cleaned_text' column to it, which on a bare boolean-filtered selection can
# raise SettingWithCopyWarning.
df_tiktok_filtered = df_tiktok_filtered[df_tiktok_filtered['language'] == 'en'].copy()

In [None]:
# Save TikTok dataset as csv (row index is not written)
# NOTE(review): this writes df_tiktok_dataset, i.e. the raw frame with all
# original columns and rows, not the column-selected, de-duplicated,
# English-only df_tiktok_filtered built above. Confirm which one should be saved.
df_tiktok_dataset.to_csv('tiktok_dataset.csv', index=False)

**Sentiment analysis**

In [None]:
# Add a 'cleaned_text' column holding the output of clean_text for every caption
# (used below for the word cloud)
df_tiktok_filtered['cleaned_text'] = df_tiktok_filtered['text'].apply(clean_text)

**Apply sentiment analysis to datasets**

In [None]:
# Work on a copy so the sentiment and engagement columns added below do not
# modify df_tiktok_filtered
df_tiktok_vader = df_tiktok_filtered.copy()

In [None]:
# Label every caption as positive / negative / neutral with calculate_sentiment.
# The raw 'text' column is scored here, not 'cleaned_text'.
df_tiktok_vader['sentiment'] = df_tiktok_vader['text'].apply(calculate_sentiment)

**Engagement analysis**

In [None]:
# Concatenate all cleaned captions into one string and build a word cloud
# limited to the 30 most frequent words
tiktok_corpus = ' '.join(df_tiktok_filtered['cleaned_text'])
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    max_words=30,
).generate(tiktok_corpus)

# Render the word cloud as an image without axes
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Cleaned Titles (TikTok)')
plt.show()

In [None]:
# Count plot of the sentiment labels assigned to the TikTok captions
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
ax = sns.countplot(x="sentiment", data=df_tiktok_vader)
ax.set_title("Distribution of Sentiment Types in TikTok Titles")
ax.set_xlabel("Sentiment Category")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Visualize the sentiment distribution of the most liked YouTube videos.
# `most_liked_videos` and a 'sentiment_category' column were referenced here but
# never defined anywhere in the notebook, so this cell raised NameError on
# Restart & Run All. The selection is rebuilt from df_youtube_vader, whose
# sentiment label lives in the 'sentiment' column.
# NOTE(review): the original cut-off is unknown; 100 is an assumption, adjust as needed.
MOST_LIKED_TOP_N = 100
most_liked_videos = df_youtube_vader.nlargest(MOST_LIKED_TOP_N, 'likes')

plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
sns.countplot(data=most_liked_videos, x='sentiment')
plt.title("Sentiment Distribution of Most Liked YouTube Videos")
plt.xlabel("Sentiment Category")
plt.ylabel("Count")
plt.show()

In [None]:
# Number of TikTok videos per sentiment label, most frequent label first.
# Note: this rebinds `sentiment_counts`, which held the YouTube counts earlier.
sentiment_counts = df_tiktok_vader['sentiment'].value_counts()

print(sentiment_counts)


In [None]:

# Number of TikTok videos per sentiment label, drawn as a bar chart
sentiment_distribution = df_tiktok_vader.groupby('sentiment').size()
ax = sentiment_distribution.plot(kind='bar')
ax.set_title('Distribution of Sentiments across Videos')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Videos')
plt.xticks(rotation=0)
plt.show()


In [None]:
# Total engagement of a TikTok video = likes (diggs) + plays + shares + comments
engagement_metrics = ['diggCount', 'playCount', 'shareCount', 'commentCount']
df_tiktok_vader['total_engagement'] = df_tiktok_vader[engagement_metrics].sum(axis=1)

# Average total engagement of the videos carrying each sentiment label
mean_engagement_by_sentiment = (
    df_tiktok_vader
    .groupby('sentiment')['total_engagement']
    .mean()
)

# One bar per sentiment label
plt.figure(figsize=(8, 5))
ax = mean_engagement_by_sentiment.plot(kind='bar')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Average User Engagement')
ax.set_title('Distribution of Sentiments Across Average User Engagement')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Compare how many TikTok videos carry each sentiment label (bars, left axis)
# with the average engagement of a video with that label (line, right axis).

# Calculate total user engagement for each row
engagement_metrics = ['diggCount', 'playCount', 'shareCount', 'commentCount']
df_tiktok_vader['total_engagement'] = df_tiktok_vader[engagement_metrics].sum(axis=1)

# Group by sentiment and calculate average user engagement
average_engagement_by_sentiment = df_tiktok_vader.groupby('sentiment')['total_engagement'].mean()

# Count the number of videos in each sentiment category
sentiment_counts = df_tiktok_vader['sentiment'].value_counts()

# The group mean already IS the average engagement per video. It used to be
# divided by the video count a second time, which shrank every value by the
# size of its group and made the line plot meaningless.
average_engagement_per_video = average_engagement_by_sentiment

# value_counts() is ordered by frequency while the group means are ordered by
# label; reindex the counts so every bar sits under its own label.
aligned_sentiment_counts = sentiment_counts.reindex(average_engagement_per_video.index)

# Plot the graph
fig, ax1 = plt.subplots(figsize=(10, 6))

# Bar plot for sentiment distribution
ax1.bar(aligned_sentiment_counts.index, aligned_sentiment_counts.values, color='b', label='Sentiment Distribution')
ax1.set_xlabel('Sentiment')
ax1.set_ylabel('Number of Videos', color='b')
ax1.tick_params('y', colors='b')

# Line plot for average user engagement on a second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(average_engagement_per_video.index, average_engagement_per_video.values, color='r', marker='o', label='Avg User Engagement')
ax2.set_ylabel('Average User Engagement', color='r')
ax2.tick_params('y', colors='r')

# Add title and legend, then tighten the layout once everything is in place
ax1.set_title('Sentiment Distribution vs Average User Engagement')
fig.legend(loc='upper left', bbox_to_anchor=(0.15, 0.9))
fig.tight_layout()

plt.show()


In [None]:
# Nine most-engaging captions for each sentiment label.
# Rows are first ordered by total_engagement (highest first); taking the first
# nine rows of every sentiment group then yields that group's top nine.
df_tiktok_sorted = df_tiktok_vader.sort_values(by='total_engagement', ascending=False)

top_nine_per_sentiment = df_tiktok_sorted.groupby('sentiment').head(9)
top_titles_by_sentiment = top_nine_per_sentiment[['text', 'sentiment', 'total_engagement']]

print(top_titles_by_sentiment)


In [None]:
# Sum of every engagement metric over the TikTok videos of each sentiment label
engagement_metrics = ['diggCount', 'playCount', 'shareCount', 'commentCount']
total_engagement_per_sentiment = (
    df_tiktok_vader
    .groupby('sentiment')[engagement_metrics]
    .sum()
)

# Grouped bars: one group per sentiment, one bar per metric
ax = total_engagement_per_sentiment.plot(kind='bar', figsize=(10, 6))
ax.set_title('Total Engagement per Sentiment Category')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Total Engagement')
plt.xticks(rotation=0)
ax.legend(title='Engagement Metric')
plt.show()