# Nouvelle section
In this notebook we will focus on analyzing engagement metrics for videos to determine key factors influencing virality:
Data Loading: It loads a dataset using Pandas.
Correlation Analysis: It computes a correlation matrix between engagement metrics (play count, share count, comment count, digg count).
Viral Score Calculation: It defines a "viral score" as a weighted sum of engagement metrics and ranks the top 10 viral videos.
Optimization Insights: Markdown sections indicate an analysis of the best video duration and the best time to post.

In [2]:

import pandas
df=pandas.read_csv("my_dataframe (8).csv")
df.columns

Index(['id', 'play_count', 'share_count', 'comment_count', 'digg_count',
       'collect_count', 'followers', 'likes', 'video_count', 'hashtags',
       'create_time', 'duration', 'video_url', 'music_title', 'music_author',
       'music_id', 'username', 'music_play_url', 'description', 'author_id',
       'follower_count', 'heart_count', 'nickname', 'verified', 'friend_count',
       'sentiment', 'tiktok_web_url', 'audio_path'],
      dtype='object')

In [3]:
correlation_matrix = df[['play_count', 'share_count', 'comment_count', 'digg_count']].corr()
print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
               play_count  share_count  comment_count  digg_count
play_count       1.000000     0.467992       0.358971    0.825355
share_count      0.467992     1.000000       0.275429    0.691314
comment_count    0.358971     0.275429       1.000000    0.529634
digg_count       0.825355     0.691314       0.529634    1.000000


In [4]:
# Define a viral score as a weighted sum of engagement metrics
df['viral_score'] = (df['play_count'] * 0.4 + df['share_count'] * 0.3 + df['comment_count'] * 0.2 + df['digg_count'] * 0.1)

# Now, 'viral_score' is a column in your DataFrame

# Sort by viral score (optional)
top_viral_videos = df.sort_values(by='viral_score', ascending=False).head(10)
print("Top 10 Viral Videos:")
print(top_viral_videos)

Top 10 Viral Videos:
                      id  play_count  share_count  comment_count  digg_count  \
236  7438631768900570401   192600000      1000000         121300    19800000   
229  7438631768900570401   192600000      1000000         121300    19800000   
222  7438631768900570401   192600000      1000000         121300    19800000   
223  7438631768900570401   192600000      1000000         121300    19800000   
224  7438631768900570401   192600000      1000000         121300    19800000   
225  7438631768900570401   192600000      1000000         121300    19800000   
226  7438631768900570401   192600000      1000000         121300    19800000   
227  7438631768900570401   192600000      1000000         121300    19800000   
228  7438631768900570401   192600000      1000000         121300    19800000   
230  7438631768900570401   192600000      1000000         121300    19800000   

     collect_count  followers      likes  video_count hashtags  ...  \
236        1000000   267000

In [5]:
df.head()

Unnamed: 0,id,play_count,share_count,comment_count,digg_count,collect_count,followers,likes,video_count,hashtags,...,author_id,follower_count,heart_count,nickname,verified,friend_count,sentiment,tiktok_web_url,audio_path,viral_score
0,7446111299458698498,4700000,54600,3247,885900,88200,709600,30100000,945,[],...,notsaarita,709600,30100000,𝓢 𝓐 𝓡 𝓐,False,0,0.0,https://www.tiktok.com/@notsaarita/video/74461...,audios/7446111299458698498.mp3,1985619.4
1,7446111299458698498,4700000,54600,3247,885900,88200,709600,30100000,945,[],...,notsaarita,709600,30100000,𝓢 𝓐 𝓡 𝓐,False,0,0.296,https://www.tiktok.com/@notsaarita/video/74461...,audios/7446111299458698498.mp3,1985619.4
2,7446111299458698498,4700000,54600,3247,885900,88200,709600,30100000,945,[],...,notsaarita,709600,30100000,𝓢 𝓐 𝓡 𝓐,False,0,0.2732,https://www.tiktok.com/@notsaarita/video/74461...,audios/7446111299458698498.mp3,1985619.4
3,7446111299458698498,4700000,54600,3247,885900,88200,709600,30100000,945,[],...,notsaarita,709600,30100000,𝓢 𝓐 𝓡 𝓐,False,0,0.0,https://www.tiktok.com/@notsaarita/video/74461...,audios/7446111299458698498.mp3,1985619.4
4,7446111299458698498,4700000,54600,3247,885900,88200,709600,30100000,945,[],...,notsaarita,709600,30100000,𝓢 𝓐 𝓡 𝓐,False,0,0.0,https://www.tiktok.com/@notsaarita/video/74461...,audios/7446111299458698498.mp3,1985619.4


# find the best duration of video

In [7]:
df.columns

Index(['id', 'play_count', 'share_count', 'comment_count', 'digg_count',
       'collect_count', 'followers', 'likes', 'video_count', 'hashtags',
       'create_time', 'duration', 'video_url', 'music_title', 'music_author',
       'music_id', 'username', 'music_play_url', 'description', 'author_id',
       'follower_count', 'heart_count', 'nickname', 'verified', 'friend_count',
       'sentiment', 'tiktok_web_url', 'audio_path', 'viral_score'],
      dtype='object')

In [8]:
import pandas as pd
# 🔹 1. Catégoriser les vidéos par durée
df["content_type"] = df["duration"].apply(lambda x: "short" if x <= 30 else "medium" if x <= 60 else "long")

# 🔹 2. Calculer l'engagement moyen pour chaque catégorie
engagement_metrics = ["play_count", "digg_count", "comment_count", "share_count"]

def compute_engagement(df_subset):
    """Calcule l'engagement moyen pour un sous-ensemble du dataset."""
    return {
        "average_play_count": df_subset["play_count"].mean(),
        "average_likes": df_subset["digg_count"].mean(),
        "average_comments": df_subset["comment_count"].mean(),
        "average_shares": df_subset["share_count"].mean()
    }

short_video_engagement = compute_engagement(df[df["content_type"] == "short"])
medium_video_engagement = compute_engagement(df[df["content_type"] == "medium"])
long_video_engagement = compute_engagement(df[df["content_type"] == "long"])

# 🔹 3. Définir une métrique d'engagement global
def engagement_score(engagement):
    """Calcule un score d'engagement pondéré."""
    return (engagement["average_play_count"] * 0.4 +
            engagement["average_likes"] * 0.3 +
            engagement["average_comments"] * 0.2 +
            engagement["average_shares"] * 0.1)

# 🔹 4. Calcul du score d'engagement pour chaque type de vidéo
short_score = engagement_score(short_video_engagement)
medium_score = engagement_score(medium_video_engagement)
long_score = engagement_score(long_video_engagement)

# 🔹 5. Trouver la meilleure durée en comparant les scores d'engagement
best_duration = "short" if short_score > max(medium_score, long_score) else (
    "medium" if medium_score > long_score else "long"
)

# 🔹 6. Trouver la durée exacte qui maximise l'engagement
df_filtered = df[df["content_type"] == best_duration]
best_video_length = df_filtered["duration"].mean()

# 🔹 7. Affichage des résultats
print("📊 Engagement Score par Type de Vidéo:")
print(f"Short Videos: {short_score:.2f}")
print(f"Medium Videos: {medium_score:.2f}")
print(f"Long Videos: {long_score:.2f}")
print()
print(f"🏆 Meilleure Durée de Vidéo : {best_video_length:.2f} secondes ({best_duration.upper()})")

# 🔹 8. Ajouter cette information au DataFrame final
df_features = pd.DataFrame(columns=[ "best_video_length", "best_post_time"])

# Ajouter la meilleure durée de vidéo
df_features["best_video_length"] = best_video_length

# Sauvegarder le fichier
df_features.to_csv("viral_engagement.csv", index=False)

print("viral_engagement.csv' créé/mis à jour avec la meilleure durée de vidéo !")

📊 Engagement Score par Type de Vidéo:
Short Videos: 6824847.44
Medium Videos: 5617668.50
Long Videos: 6579463.18

🏆 Meilleure Durée de Vidéo : 17.43 secondes (SHORT)
viral_engagement.csv' créé/mis à jour avec la meilleure durée de vidéo !


In [None]:
df.head()

# find the best moment to post the video

In [9]:
import pandas as pd

# Convert 'create_time' to datetime
df['create_time'] = pd.to_datetime(df['create_time'])

# Extract the hour from the datetime
df['post_hour'] = df['create_time'].dt.hour

In [10]:
# Group by 'post_hour' and calculate the mean of engagement metrics
engagement_by_hour = df.groupby('post_hour')[['play_count', 'share_count', 'comment_count', 'digg_count']].mean()

# Calculate a weighted engagement score (similar to the viral score)
engagement_by_hour['engagement_score'] = (
    engagement_by_hour['play_count'] * 0.4 +
    engagement_by_hour['share_count'] * 0.3 +
    engagement_by_hour['comment_count'] * 0.2 +
    engagement_by_hour['digg_count'] * 0.1
)

In [11]:
# Find the hour with the highest engagement score
best_hour = engagement_by_hour['engagement_score'].idxmax()

print(f"📊 Engagement Score by Hour:")
print(engagement_by_hour)

print(f"\n🏆 Best Time to Post: {best_hour}:00 - {best_hour + 1}:00")

📊 Engagement Score by Hour:
             play_count   share_count  comment_count    digg_count  \
post_hour                                                            
0          1.500979e+07  98666.402627   11923.908072  1.077416e+06   

           engagement_score  
post_hour                    
0              6.143641e+06  

🏆 Best Time to Post: 0:00 - 1:00


In [15]:
l=[]
l.append(best_hour+1)
l.append(best_duration)


[1, 'short']