In [1]:
import pandas as pd
import numpy as np

# Load the dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv("tiktok_dataset.csv")

# Show the first ten rows. A bare `data.head(10)` is only rendered when it is
# the last expression of a notebook cell (and never in a plain script), so the
# result is printed explicitly instead of being silently discarded.
print(data.head(10))

# Column names, non-null counts and dtypes; info() writes to stdout itself.
data.info()

# Summary statistics; printed explicitly for the same reason as head() above.
print(data.describe())

# Tally how many rows carry each claim status. value_counts() leaves out
# missing values by default, so rows without a claim status are not counted.
status_column = data['claim_status']
claim_status_counts = status_column.value_counts()
print("Claim Status Counts:\n", claim_status_counts)

# Mean and median view count for each claim status.
# Each status' view counts are selected once and reused for both statistics.
view_counts = data['video_view_count']
claim_views = view_counts[data['claim_status'] == 'claim']
opinion_views = view_counts[data['claim_status'] == 'opinion']

claim_mean_views = claim_views.mean()
claim_median_views = claim_views.median()

opinion_mean_views = opinion_views.mean()
opinion_median_views = opinion_views.median()

print("Mean and Median View Counts:")
print(f"Claim - Mean: {claim_mean_views}, Median: {claim_median_views}")
print(f"Opinion - Mean: {opinion_mean_views}, Median: {opinion_median_views}")

# Number of rows for every (claim status, author ban status) pair, flattened
# back into a regular DataFrame with the tally stored in a 'count' column.
rows_by_claim_and_ban = data.groupby(['claim_status', 'author_ban_status'])
claim_ban_counts = rows_by_claim_and_ban.size().reset_index(name='count')
print("Claim and Ban Status Counts:\n", claim_ban_counts)

# Median number of shares within each author ban status group.
shares_by_ban_status = data.groupby('author_ban_status')['video_share_count']
median_share_count = shares_by_ban_status.median()
print("Median Share Count per Author Ban Status:\n", median_share_count)

# Count, mean and median of views, likes and shares per author ban status.
# Every column gets the same three statistics, so the aggregation spec is
# generated from the column list rather than spelled out entry by entry.
raw_engagement_columns = ('video_view_count', 'video_like_count', 'video_share_count')
author_ban_stats = data.groupby('author_ban_status').agg(
    {column: ['count', 'mean', 'median'] for column in raw_engagement_columns}
)
print("Engagement Stats by Author Ban Status:\n", author_ban_stats)

# Per-view engagement ratios: likes, comments and shares divided by views.
# Dividing by a view count of zero would yield inf (or NaN for 0/0), and a
# single inf turns any mean computed over these columns into inf. Zero view
# counts are therefore masked to NaN first, so the affected ratios become NaN
# and are skipped by count/mean/median aggregations.
nonzero_views = data['video_view_count'].where(data['video_view_count'] != 0)
data['likes_per_view'] = data['video_like_count'] / nonzero_views
data['comments_per_view'] = data['video_comment_count'] / nonzero_views
data['shares_per_view'] = data['video_share_count'] / nonzero_views

# Count, mean and median of each per-view ratio for every combination of
# claim status and author ban status. All ratio columns share the same three
# statistics, so the aggregation spec is generated from the column list.
per_view_columns = ('likes_per_view', 'comments_per_view', 'shares_per_view')
engagement_metrics = data.groupby(['claim_status', 'author_ban_status']).agg(
    {column: ['count', 'mean', 'median'] for column in per_view_columns}
)
print("Engagement Metrics by Claim Status and Author Ban Status:\n", engagement_metrics)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   #                         19382 non-null  int64  
 1   claim_status              19084 non-null  object 
 2   video_id                  19382 non-null  int64  
 3   video_duration_sec        19382 non-null  int64  
 4   video_transcription_text  19084 non-null  object 
 5   verified_status           19382 non-null  object 
 6   author_ban_status         19382 non-null  object 
 7   video_view_count          19084 non-null  float64
 8   video_like_count          19084 non-null  float64
 9   video_share_count         19084 non-null  float64
 10  video_download_count      19084 non-null  float64
 11  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB
Claim Status Counts:
 claim      9608
opinion    9476
Name: