<a href="https://colab.research.google.com/github/ranabag/Youtube-analsis-dashboard/blob/main/Youtube_Dashboard_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('C:/Users/HP/Downloads/archive')

ValueError: Mountpoint must be in a directory that exists

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
from collections import Counter
import datetime
import wordcloud
import json

# --- Configuration ---
# Read the data and set configuration options
try:
    df = pd.read_csv("USvideos.csv", engine='python') # Added engine='python'
except FileNotFoundError:
    print("Error: 'USvideos.csv' not found. Make sure the file is in the correct directory.")
    # Exit or create a dummy dataframe to allow the rest of the script to run without error
    df = pd.DataFrame()

PLOT_COLORS = ["#268bd2", "#0052CC", "#FF5722", "#b58900", "#003f5c"]
pd.options.display.float_format = '{:.2f}'.format
sns.set(style="ticks")
plt.rc('figure', figsize=(8, 5), dpi=100)
plt.rc('axes', labelpad=20, facecolor="#ffffff", linewidth=0.4, grid=True, labelsize=14)
plt.rc('patch', linewidth=0)
plt.rc('xtick.major', width=0.2)
plt.rc('ytick.major', width=0.2)
plt.rc('grid', color='#9E9E9E', linewidth=0.4)
plt.rc('font', family='Arial', weight='400', size=10)
plt.rc('text', color='#282828')
plt.rc('savefig', pad_inches=0.3, dpi=300)

# --- Data Exploration ---
# Fill missing description values
if 'description' in df.columns:
    df["description"] = df["description"].fillna(value="")
# The df.describe() line is meant for interactive exploration, so it's often omitted in a final script

# --- Data Visualization ---

# Check if dataframe is empty before attempting to plot
if not df.empty:
    # FIX 1: Corrected the function's logic
    def contains_capitalized_word(s):
        # The function now checks all words before returning False
        for w in s.split():
            if w.isupper():
                return True
        return False

    # Pie Chart: Title Contains Capitalized Word?
    if 'title' in df.columns:
        df["contains_capitalized"] = df["title"].apply(contains_capitalized_word)
        value_counts = df["contains_capitalized"].value_counts()

        # FIX 2: Made pie chart data access safer
        false_count = value_counts.get(False, 0)
        true_count = value_counts.get(True, 0)

        fig, ax = plt.subplots()
        ax.pie([false_count, true_count], labels=['No', 'Yes'],
               colors=['#003f5c', '#ffa600'], textprops={'color': '#040204'}, startangle=45)
        ax.axis('equal')
        ax.set_title('Title Contains Capitalized Word?')

    # Histogram: Title Length
    if 'title' in df.columns:
        df["title_length"] = df["title"].apply(lambda x: len(x))
        fig, ax = plt.subplots()

        # FIX 3: Replaced deprecated 'distplot' with 'histplot'
        sns.histplot(data=df, x="title_length", kde=False,
                     color=PLOT_COLORS[4], ax=ax)

        ax.set(xlabel="Title Length", ylabel="No. of videos", xticks=range(0, 110, 10))

    # Scatter Plot: Views vs. Title Length
    if 'views' in df.columns and 'title_length' in df.columns:
        fig, ax = plt.subplots()
        ax.scatter(x=df['views'], y=df['title_length'], color=PLOT_COLORS[2],
                   edgecolors="#000000", linewidths=0.5)
        ax.set(xlabel="Views", ylabel="Title Length")


    # Heatmap: Correlation of Trending Video Metrics
    # FIX 4: Made the correlation calculation and labeling more robust
    corr_matrix = df.corr(numeric_only=True)
    h_labels = [label.replace('_', ' ').title() for label in corr_matrix.columns]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, xticklabels=h_labels, yticklabels=h_labels,
                cmap=sns.cubehelix_palette(as_cmap=True), ax=ax)


    # Word Cloud: Trending Words in Titles
    if 'title' in df.columns:
        title_words = list(df["title"].apply(lambda x: x.split()))
        title_words = [x for y in title_words for x in y]

        wc = wordcloud.WordCloud(width=1200, height=500,
                                 collocations=False, background_color="white",
                                 colormap="tab20b").generate(" ".join(title_words))

        plt.figure(figsize=(15, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")

    # Display all the plots
    plt.show()
else:
    print("DataFrame is empty. Skipping plot generation.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
from collections import Counter
import datetime
import wordcloud
import json

# --- Configuration ---
# Read the data and set configuration options
try:
    df = pd.read_csv("USvideos.csv")
except FileNotFoundError:
    print("Error: 'USvideos.csv' not found. Make sure the file is in the correct directory.")
    # Exit or create a dummy dataframe to allow the rest of the script to run without error
    df = pd.DataFrame()

PLOT_COLORS = ["#268bd2", "#0052CC", "#FF5722", "#b58900", "#003f5c"]
pd.options.display.float_format = '{:.2f}'.format
sns.set(style="ticks")
plt.rc('figure', figsize=(8, 5), dpi=100)
plt.rc('axes', labelpad=20, facecolor="#ffffff", linewidth=0.4, grid=True, labelsize=14)
plt.rc('patch', linewidth=0)
plt.rc('xtick.major', width=0.2)
plt.rc('ytick.major', width=0.2)
plt.rc('grid', color='#9E9E9E', linewidth=0.4)
# plt.rc('font', family='Arial', weight='400', size=10) # Removed specific font
plt.rc('text', color='#282828')
plt.rc('savefig', pad_inches=0.3, dpi=300)

# --- Data Exploration ---
# Fill missing description values
if 'description' in df.columns:
    df["description"] = df["description"].fillna(value="")
# The df.describe() line is meant for interactive exploration, so it's often omitted in a final script

# --- Data Visualization ---

# Check if dataframe is empty before attempting to plot
if not df.empty:
    # FIX 1: Corrected the function's logic
    def contains_capitalized_word(s):
        # The function now checks all words before returning False
        for w in s.split():
            if w.isupper():
                return True
        return False

    # Pie Chart: Title Contains Capitalized Word?
    if 'title' in df.columns:
        df["contains_capitalized"] = df["title"].apply(contains_capitalized_word)
        value_counts = df["contains_capitalized"].value_counts()

        # FIX 2: Made pie chart data access safer
        false_count = value_counts.get(False, 0)
        true_count = value_counts.get(True, 0)

        fig, ax = plt.subplots()
        ax.pie([false_count, true_count], labels=['No', 'Yes'],
               colors=['#003f5c', '#ffa600'], textprops={'color': '#040204'}, startangle=45)
        ax.axis('equal')
        ax.set_title('Title Contains Capitalized Word?')

    # Histogram: Title Length
    if 'title' in df.columns:
        df["title_length"] = df["title"].apply(lambda x: len(x))
        fig, ax = plt.subplots()

        # FIX 3: Replaced deprecated 'distplot' with 'histplot'
        sns.histplot(data=df, x="title_length", kde=False,
                     color=PLOT_COLORS[4], ax=ax)

        ax.set(xlabel="Title Length", ylabel="No. of videos", xticks=range(0, 110, 10))

    # Scatter Plot: Views vs. Title Length
    if 'views' in df.columns and 'title_length' in df.columns:
        fig, ax = plt.subplots()
        ax.scatter(x=df['views'], y=df['title_length'], color=PLOT_COLORS[2],
                   edgecolors="#000000", linewidths=0.5)
        ax.set(xlabel="Views", ylabel="Title Length")


    # Heatmap: Correlation of Trending Video Metrics
    # FIX 4: Made the correlation calculation and labeling more robust
    corr_matrix = df.corr(numeric_only=True)
    h_labels = [label.replace('_', ' ').title() for label in corr_matrix.columns]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=True, xticklabels=h_labels, yticklabels=h_labels,
                cmap=sns.cubehelix_palette(as_cmap=True), ax=ax)


    # Word Cloud: Trending Words in Titles
    if 'title' in df.columns:
        title_words = list(df["title"].apply(lambda x: x.split()))
        title_words = [x for y in title_words for x in y]

        wc = wordcloud.WordCloud(width=1200, height=500,
                                 collocations=False, background_color="white",
                                 colormap="tab20b").generate(" ".join(title_words))

        plt.figure(figsize=(15, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")

    # Display all the plots
    plt.show()
else:
    print("DataFrame is empty. Skipping plot generation.")

In [None]:
df = pd.read_csv("USvideos.csv", engine='python')

In [None]:
df = pd.read_csv("USvideos.csv", engine='python', on_bad_lines='skip')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
from collections import Counter
import datetime
import wordcloud
import json

# --- Configuration ---
# Read the data and set configuration options
try:
    df = pd.read_csv("USvideos.csv")
except FileNotFoundError:
    print("Error: 'USvideos.csv' not found. Make sure the file is in the correct directory.")
    # Exit or create a dummy dataframe to allow the rest of the script to run without error
    df = pd.DataFrame()

PLOT_COLORS = ["#268bd2", "#0052CC", "#FF5722", "#b58900", "#003f5c"]
pd.options.display.float_format = '{:.2f}'.format
sns.set(style="ticks")
plt.rc('figure', figsize=(8, 5), dpi=100)
plt.rc('axes', labelpad=20, facecolor="#ffffff", linewidth=0.4, grid=True, labelsize=14)
plt.rc('patch', linewidth=0)
plt.rc('xtick.major', width=0.2)
plt.rc('ytick.major', width=0.2)
plt.rc('grid', color='#9E9E9E', linewidth=0.4)
plt.rc('font', family='Arial', weight='400', size=10)
plt.rc('text', color='#282828')
plt.rc('savefig', pad_inches=0.3, dpi=300)

# --- Data Exploration ---
# Fill missing description values
if 'description' in df.columns:
    df["description"] = df["description"].fillna(value="")
# The df.describe() line is meant for interactive exploration, so it's often omitted in a final script

# --- Data Visualization ---

# FIX 1: Corrected the function's logic
def contains_capitalized_word(s):
    # The function now checks all words before returning False
    for w in s.split():
        if w.isupper():
            return True
    return False

# Pie Chart: Title Contains Capitalized Word?
if 'title' in df.columns:
    df["contains_capitalized"] = df["title"].apply(contains_capitalized_word)
    value_counts = df["contains_capitalized"].value_counts()

    # FIX 2: Made pie chart data access safer
    false_count = value_counts.get(False, 0)
    true_count = value_counts.get(True, 0)

    fig, ax = plt.subplots()
    ax.pie([false_count, true_count], labels=['No', 'Yes'],
           colors=['#003f5c', '#ffa600'], textprops={'color': '#040204'}, startangle=45)
    ax.axis('equal')
    ax.set_title('Title Contains Capitalized Word?')

# Histogram: Title Length
if 'title' in df.columns:
    df["title_length"] = df["title"].apply(lambda x: len(x))
    fig, ax = plt.subplots()

    # FIX 3: Replaced deprecated 'distplot' with 'histplot'
    sns.histplot(data=df, x="title_length", kde=False,
                 color=PLOT_COLORS[4], ax=ax)

    ax.set(xlabel="Title Length", ylabel="No. of videos", xticks=range(0, 110, 10))

# Scatter Plot: Views vs. Title Length
if 'views' in df.columns and 'title_length' in df.columns:
    fig, ax = plt.subplots()
    ax.scatter(x=df['views'], y=df['title_length'], color=PLOT_COLORS[2],
               edgecolors="#000000", linewidths=0.5)
    ax.set(xlabel="Views", ylabel="Title Length")


# Heatmap: Correlation of Trending Video Metrics
# FIX 4: Made the correlation calculation and labeling more robust
corr_matrix = df.corr(numeric_only=True)
h_labels = [label.replace('_', ' ').title() for label in corr_matrix.columns]

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, xticklabels=h_labels, yticklabels=h_labels,
            cmap=sns.cubehelix_palette(as_cmap=True), ax=ax)


# Word Cloud: Trending Words in Titles
if 'title' in df.columns:
    title_words = list(df["title"].apply(lambda x: x.split()))
    title_words = [x for y in title_words for x in y]

    wc = wordcloud.WordCloud(width=1200, height=500,
                             collocations=False, background_color="white",
                             colormap="tab20b").generate(" ".join(title_words))

    plt.figure(figsize=(15, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")

# Display all the plots
plt.show()

In [None]:
df = pd.read_csv("USvideos.csv", engine='python', on_bad_lines='skip')

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

import warnings
from collections import Counter
import datetime
import wordcloud
import json

# --- Configuration ---
# Read the data and set configuration options
try:
    df = pd.read_csv("USvideos.csv")
except FileNotFoundError:
    print("Error: 'USvideos.csv' not found. Make sure the file is in the correct directory.")
    # Exit or create a dummy dataframe to allow the rest of the script to run without error
    df = pd.DataFrame()

PLOT_COLORS = ["#268bd2", "#0052CC", "#FF5722", "#b58900", "#003f5c"]
pd.options.display.float_format = '{:.2f}'.format
sns.set(style="ticks")
plt.rc('figure', figsize=(8, 5), dpi=100)
plt.rc('axes', labelpad=20, facecolor="#ffffff", linewidth=0.4, grid=True, labelsize=14)
plt.rc('patch', linewidth=0)
plt.rc('xtick.major', width=0.2)
plt.rc('ytick.major', width=0.2)
plt.rc('grid', color='#9E9E9E', linewidth=0.4)
plt.rc('font', family='Arial', weight='400', size=10)
plt.rc('text', color='#282828')
plt.rc('savefig', pad_inches=0.3, dpi=300)

# --- Data Exploration ---
# Fill missing description values
if 'description' in df.columns:
    df["description"] = df["description"].fillna(value="")
# The df.describe() line is meant for interactive exploration, so it's often omitted in a final script

# --- Data Visualization ---

# FIX 1: Corrected the function's logic
def contains_capitalized_word(s):
    # The function now checks all words before returning False
    for w in s.split():
        if w.isupper():
            return True
    return False

# Pie Chart: Title Contains Capitalized Word?
if 'title' in df.columns:
    df["contains_capitalized"] = df["title"].apply(contains_capitalized_word)
    value_counts = df["contains_capitalized"].value_counts()

    # FIX 2: Made pie chart data access safer
    false_count = value_counts.get(False, 0)
    true_count = value_counts.get(True, 0)

    fig, ax = plt.subplots()
    ax.pie([false_count, true_count], labels=['No', 'Yes'],
           colors=['#003f5c', '#ffa600'], textprops={'color': '#040204'}, startangle=45)
    ax.axis('equal')
    ax.set_title('Title Contains Capitalized Word?')

# Histogram: Title Length
if 'title' in df.columns:
    df["title_length"] = df["title"].apply(lambda x: len(x))
    fig, ax = plt.subplots()

    # FIX 3: Replaced deprecated 'distplot' with 'histplot'
    sns.histplot(data=df, x="title_length", kde=False,
                 color=PLOT_COLORS[4], ax=ax)

    ax.set(xlabel="Title Length", ylabel="No. of videos", xticks=range(0, 110, 10))

# Scatter Plot: Views vs. Title Length
if 'views' in df.columns and 'title_length' in df.columns:
    fig, ax = plt.subplots()
    ax.scatter(x=df['views'], y=df['title_length'], color=PLOT_COLORS[2],
               edgecolors="#000000", linewidths=0.5)
    ax.set(xlabel="Views", ylabel="Title Length")


# Heatmap: Correlation of Trending Video Metrics
# FIX 4: Made the correlation calculation and labeling more robust
corr_matrix = df.corr(numeric_only=True)
h_labels = [label.replace('_', ' ').title() for label in corr_matrix.columns]

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, xticklabels=h_labels, yticklabels=h_labels,
            cmap=sns.cubehelix_palette(as_cmap=True), ax=ax)


# Word Cloud: Trending Words in Titles
if 'title' in df.columns:
    title_words = list(df["title"].apply(lambda x: x.split()))
    title_words = [x for y in title_words for x in y]

    wc = wordcloud.WordCloud(width=1200, height=500,
                             collocations=False, background_color="white",
                             colormap="tab20b").generate(" ".join(title_words))

    plt.figure(figsize=(15, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")

# Display all the plots
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
from collections import Counter
import datetime
from wordcloud import WordCloud
import json

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Configuration ---
# Read the data and set configuration options
try:
    df = pd.read_csv("USvideos.csv")
    print(f"Successfully loaded data with {len(df)} rows and {len(df.columns)} columns")
except FileNotFoundError:
    print("Error: 'USvideos.csv' not found. Creating sample data for demonstration.")
    # Create sample data for testing
    np.random.seed(42)
    df = pd.DataFrame({
        'title': [
            'AMAZING YouTube Video Goes VIRAL!',
            'how to code python tutorial',
            'BREAKING NEWS: Something Happened',
            'funny cat video compilation',
            'TOP 10 Things You Need to Know',
            'music video new release 2024',
            'SHOCKING Results From This Experiment',
            'daily vlog episode 100'
        ] * 1000,
        'views': np.random.randint(1000, 10000000, 8000),
        'likes': np.random.randint(10, 100000, 8000),
        'dislikes': np.random.randint(0, 5000, 8000),
        'comment_count': np.random.randint(0, 10000, 8000),
        'description': ['Sample description'] * 7500 + [None] * 500
    })

PLOT_COLORS = ["#268bd2", "#0052CC", "#FF5722", "#b58900", "#003f5c"]
pd.options.display.float_format = '{:.2f}'.format

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
    'axes.labelpad': 20,
    'axes.facecolor': '#ffffff',
    'axes.linewidth': 0.4,
    'axes.grid': True,
    'axes.labelsize': 12,
    'patch.linewidth': 0,
    'xtick.major.width': 0.2,
    'ytick.major.width': 0.2,
    'grid.color': '#9E9E9E',
    'grid.linewidth': 0.4,
    'font.family': 'sans-serif',
    'font.weight': '400',
    'font.size': 10,
    'text.color': '#282828',
    'savefig.pad_inches': 0.3,
    'savefig.dpi': 300
})

# --- Data Exploration ---
print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

# Fill missing description values
if 'description' in df.columns:
    df["description"] = df["description"].fillna("")
    print(f"\nFilled {df['description'].isna().sum()} missing descriptions")

# --- Data Visualization ---

def contains_capitalized_word(title):
    """Check if title contains any fully capitalized words (length > 1)"""
    if pd.isna(title) or not isinstance(title, str):
        return False

    words = title.split()
    for word in words:
        # Check if word is all caps and longer than 1 character
        if len(word) > 1 and word.isupper() and word.isalpha():
            return True
    return False

# 1. Pie Chart: Title Contains Capitalized Word?
if 'title' in df.columns:
    print("\n1. Creating pie chart for capitalized words...")
    df["contains_capitalized"] = df["title"].apply(contains_capitalized_word)
    value_counts = df["contains_capitalized"].value_counts()

    false_count = value_counts.get(False, 0)
    true_count = value_counts.get(True, 0)

    fig, ax = plt.subplots(figsize=(8, 6))
    colors = ['#ff9999', '#66b3ff']
    wedges, texts, autotexts = ax.pie(
        [false_count, true_count],
        labels=['No Caps', 'Has Caps'],
        colors=colors,
        autopct='%1.1f%%',
        startangle=45,
        textprops={'fontsize': 12}
    )
    ax.set_title('Titles with Fully Capitalized Words', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# 2. Histogram: Title Length Distribution
if 'title' in df.columns:
    print("\n2. Creating title length histogram...")
    df["title_length"] = df["title"].str.len().fillna(0)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data=df,
        x="title_length",
        bins=30,
        kde=True,
        color=PLOT_COLORS[4],
        ax=ax,
        alpha=0.7
    )
    ax.set_xlabel("Title Length (characters)", fontsize=12)
    ax.set_ylabel("Number of Videos", fontsize=12)
    ax.set_title("Distribution of Video Title Lengths", fontsize=14, fontweight='bold')

    # Add statistics
    mean_length = df["title_length"].mean()
    median_length = df["title_length"].median()
    ax.axvline(mean_length, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_length:.1f}')
    ax.axvline(median_length, color='orange', linestyle='--', alpha=0.7, label=f'Median: {median_length:.1f}')
    ax.legend()

    plt.tight_layout()
    plt.show()

# 3. Scatter Plot: Views vs. Title Length
if 'views' in df.columns and 'title_length' in df.columns:
    print("\n3. Creating scatter plot: Views vs Title Length...")

    fig, ax = plt.subplots(figsize=(10, 6))

    # Sample data if too large for better visualization
    plot_df = df.sample(n=min(5000, len(df)), random_state=42)

    scatter = ax.scatter(
        x=plot_df['views'],
        y=plot_df['title_length'],
        color=PLOT_COLORS[2],
        alpha=0.6,
        edgecolors="white",
        linewidths=0.5,
        s=30
    )

    ax.set_xlabel("Views", fontsize=12)
    ax.set_ylabel("Title Length (characters)", fontsize=12)
    ax.set_title("Video Views vs Title Length", fontsize=14, fontweight='bold')

    # Add trend line
    z = np.polyfit(plot_df['views'], plot_df['title_length'], 1)
    p = np.poly1d(z)
    ax.plot(plot_df['views'], p(plot_df['views']), "r--", alpha=0.8, linewidth=2)

    # Format x-axis for better readability
    ax.ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

    plt.tight_layout()
    plt.show()

# 4. Correlation Heatmap
print("\n4. Creating correlation heatmap...")
numeric_columns = df.select_dtypes(include=[np.number]).columns
if len(numeric_columns) > 1:
    corr_matrix = df[numeric_columns].corr()

    # Create better labels
    h_labels = [col.replace('_', ' ').title() for col in corr_matrix.columns]

    fig, ax = plt.subplots(figsize=(10, 8))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Show only lower triangle

    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        mask=mask,
        xticklabels=h_labels,
        yticklabels=h_labels,
        cmap='coolwarm',
        center=0,
        square=True,
        ax=ax,
        cbar_kws={"shrink": .8}
    )
    ax.set_title('Correlation Matrix of Video Metrics', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    print("Not enough numeric columns for correlation analysis")

# 5. Word Cloud: Trending Words in Titles
if 'title' in df.columns:
    print("\n5. Creating word cloud...")

    # Clean and prepare text
    all_titles = ' '.join(df['title'].dropna().astype(str))

    # Remove common stop words and clean text
    from collections import Counter
    import re

    # Basic text cleaning
    clean_text = re.sub(r'[^\w\s]', ' ', all_titles.upper())
    words = clean_text.split()

    # Remove common stop words
    stop_words = {'THE', 'AND', 'OR', 'BUT', 'IN', 'ON', 'AT', 'TO', 'FOR', 'OF', 'WITH', 'BY', 'A', 'AN'}
    filtered_words = [word for word in words if word not in stop_words and len(word) > 2]

    if filtered_words:
        try:
            wc = WordCloud(
                width=1200,
                height=600,
                background_color="white",
                max_words=100,
                colormap="viridis",
                collocations=False,
                relative_scaling=0.5
            ).generate(' '.join(filtered_words))

            plt.figure(figsize=(15, 8))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis("off")
            plt.title("Most Common Words in Video Titles", fontsize=16, fontweight='bold', pad=20)
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"Could not generate word cloud: {e}")

            # Fallback: show top words as bar chart
            word_freq = Counter(filtered_words)
            top_words = dict(word_freq.most_common(15))

            fig, ax = plt.subplots(figsize=(12, 6))
            bars = ax.bar(top_words.keys(), top_words.values(), color=PLOT_COLORS[0], alpha=0.8)
            ax.set_xlabel("Words", fontsize=12)
            ax.set_ylabel("Frequency", fontsize=12)
            ax.set_title("Top 15 Most Common Words in Titles", fontsize=14, fontweight='bold')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

print("\nAnalysis complete! All visualizations have been generated.")

# Summary statistics
if len(df) > 0:
    print(f"\nDataset Summary:")
    print(f"- Total videos analyzed: {len(df):,}")
    if 'views' in df.columns:
        print(f"- Average views: {df['views'].mean():,.0f}")
        print(f"- Median views: {df['views'].median():,.0f}")
    if 'title_length' in df.columns:
        print(f"- Average title length: {df['title_length'].mean():.1f} characters")
    if 'contains_capitalized' in df.columns:
        pct_caps = (df['contains_capitalized'].sum() / len(df)) * 100
        print(f"- Videos with capitalized words: {pct_caps:.1f}%")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
from datetime import datetime, timedelta
import random

# Configuration
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Set random seeds for reproducible results
np.random.seed(42)
random.seed(42)

def generate_clean_youtube_dataset(n_rows=2000):
    """Generate a clean, realistic YouTube trending videos dataset"""

    # Realistic video title templates
    title_templates = [
        "HOW TO {} IN 2024 (ACTUALLY WORKS!)",
        "SHOCKING {} You Won't Believe",
        "TOP 10 {} That Will BLOW Your Mind",
        "BREAKING: {} Changes Everything",
        "Why {} Is Taking Over",
        "The TRUTH About {} Nobody Talks About",
        "EPIC {} Compilation 2024",
        "REACT to {} (EMOTIONAL)",
        "Ultimate {} Guide for Beginners",
        "{} vs {} - Which Is Better?",
        "24 HOURS of {} Challenge",
        "TRYING {} For The First Time",
        "The {} Everyone Is Talking About",
        "INSANE {} Transformation",
        "Behind The Scenes: Making {}",
        "Why I Quit {} After 10 Years",
        "The Rise and Fall of {}",
        "{} That Changed My Life",
        "Ranking Every {} From WORST to BEST",
        "Things About {} You Didn't Know"
    ]

    # Content topics
    topics = [
        "AI Technology", "Gaming Setup", "Cooking Recipes", "Fitness Workout",
        "Travel Destinations", "Music Production", "Art Tutorial", "Science Experiment",
        "Business Strategy", "Productivity Tips", "Fashion Trends", "Home Renovation",
        "Pet Training", "Photography", "Coding Tutorial", "Life Hacks", "Movie Reviews",
        "Book Recommendations", "Investment Advice", "Social Media Marketing",
        "Mental Health", "Climate Change", "Space Exploration", "Electric Cars",
        "Cryptocurrency", "Virtual Reality", "Smartphone Reviews", "Dating Advice",
        "Career Tips", "Language Learning", "Food Reviews", "Travel Vlogs"
    ]

    # Channel names
    channels = [
        "TechReview Central", "LifeHacker Pro", "Creative Studio", "Science Explorer",
        "Fitness Guru", "Travel Adventures", "Cooking Masters", "Gaming Zone",
        "Business Insights", "Art Academy", "Music Lab", "Fashion Forward",
        "Home Improvement", "Pet Paradise", "Photo Academy", "Code Academy",
        "Movie Critic", "Book Club", "Finance Guru", "Marketing Genius",
        "Wellness Guide", "Climate Action", "Space Facts", "Auto Review"
    ]

    # Generate dataset
    data = []
    start_date = datetime(2023, 1, 1)

    for i in range(n_rows):
        # Generate realistic titles
        template = random.choice(title_templates)
        topic1 = random.choice(topics)
        topic2 = random.choice(topics)

        if template.count("{}") == 1:
            title = template.format(topic1)
        else:
            title = template.format(topic1, topic2)

        # Add capitalization to some titles (realistic pattern)
        if random.random() < 0.35:  # 35% chance
            words = title.split()
            caps_indices = random.sample(range(len(words)), min(random.randint(1, 3), len(words)))
            for idx in caps_indices:
                if len(words[idx]) > 3 and words[idx].isalpha():
                    words[idx] = words[idx].upper()
            title = " ".join(words)

        # Generate correlated metrics (realistic YouTube patterns)
        # Views follow a power law distribution
        views = max(1000, int(np.random.lognormal(mean=11.5, sigma=1.8)))
        views = min(views, 100000000)  # Cap at 100M

        # Engagement rates based on real YouTube data
        like_rate = np.random.beta(2, 50) + 0.01  # Typically 1-8%
        dislike_rate = like_rate * np.random.beta(1, 10)  # Much lower than likes
        comment_rate = like_rate * np.random.beta(2, 8)  # Between likes and views

        likes = max(1, int(views * like_rate))
        dislikes = max(0, int(views * dislike_rate))
        comment_count = max(0, int(views * comment_rate))

        # Other fields
        channel = random.choice(channels)
        category_id = random.randint(1, 28)
        trending_date = start_date + timedelta(days=random.randint(0, 365))

        # Generate description
        descriptions = [
            f"In this video, I show you everything about {topic1.lower()}. Don't forget to like and subscribe!",
            f"Welcome back! Today we're diving into {topic1.lower()}. Hope you enjoy!",
            f"This {topic1.lower()} guide will change everything. Links below!",
            f"After years with {topic1.lower()}, here's what I learned.",
            ""  # Some videos have no description
        ]
        description = random.choice(descriptions)

        data.append({
            'video_id': f"vid_{i:06d}",
            'trending_date': trending_date.strftime('%y.%d.%m'),
            'title': title,
            'channel_title': channel,
            'category_id': category_id,
            'views': views,
            'likes': likes,
            'dislikes': dislikes,
            'comment_count': comment_count,
            'description': description
        })

    return pd.DataFrame(data)

def contains_capitalized_word(title):
    """Check if title contains fully capitalized words (length > 1)"""
    if pd.isna(title) or not isinstance(title, str):
        return False

    words = str(title).split()
    for word in words:
        if len(word) > 1 and word.isupper() and word.isalpha():
            return True
    return False

def analyze_youtube_data():
    """Complete YouTube trending videos analysis"""

    print("🎬 Generating Clean YouTube Dataset...")
    print("=" * 60)

    # Generate the dataset
    df = generate_clean_youtube_dataset(2000)

    print(f"✅ Generated {len(df)} videos across {df['channel_title'].nunique()} channels")
    print(f"📊 Dataset shape: {df.shape}")
    print(f"📅 Date range: {df['trending_date'].min()} to {df['trending_date'].max()}")

    # Add derived columns
    df['title_length'] = df['title'].str.len()
    df['contains_capitalized'] = df['title'].apply(contains_capitalized_word)
    df['engagement_rate'] = (df['likes'] + df['comment_count']) / df['views']

    # Basic statistics
    print(f"\n📈 Quick Stats:")
    print(f"   • Average views: {df['views'].mean():,.0f}")
    print(f"   • Median views: {df['views'].median():,.0f}")
    print(f"   • Average title length: {df['title_length'].mean():.1f} characters")
    print(f"   • Videos with CAPS: {df['contains_capitalized'].sum()} ({df['contains_capitalized'].mean()*100:.1f}%)")

    # Set up the plotting layout
    fig = plt.figure(figsize=(20, 16))
    fig.suptitle('🎥 YouTube Trending Videos Analysis Dashboard', fontsize=20, fontweight='bold', y=0.98)

    # 1. Pie Chart: Titles with Capitalized Words
    ax1 = plt.subplot(3, 3, 1)
    caps_counts = df['contains_capitalized'].value_counts()
    colors = ['#ff6b6b', '#4ecdc4']
    labels = ['No Caps', 'Has CAPS']
    wedges, texts, autotexts = ax1.pie(caps_counts.values, labels=labels, colors=colors,
                                      autopct='%1.1f%%', startangle=45)
    ax1.set_title('Titles with Capitalized Words', fontsize=12, fontweight='bold')

    # 2. Histogram: Title Length Distribution
    ax2 = plt.subplot(3, 3, 2)
    sns.histplot(data=df, x='title_length', bins=25, kde=True, color='skyblue', alpha=0.7, ax=ax2)
    ax2.axvline(df['title_length'].mean(), color='red', linestyle='--', alpha=0.8,
                label=f'Mean: {df["title_length"].mean():.0f}')
    ax2.axvline(df['title_length'].median(), color='orange', linestyle='--', alpha=0.8,
                label=f'Median: {df["title_length"].median():.0f}')
    ax2.set_xlabel('Title Length (characters)')
    ax2.set_ylabel('Number of Videos')
    ax2.set_title('Title Length Distribution', fontweight='bold')
    ax2.legend()

    # 3. Scatter: Views vs Title Length
    ax3 = plt.subplot(3, 3, 3)
    sample_df = df.sample(n=min(500, len(df)), random_state=42)
    colors_scatter = ['red' if x else 'blue' for x in sample_df['contains_capitalized']]
    ax3.scatter(sample_df['views'], sample_df['title_length'], c=colors_scatter, alpha=0.6, s=30)

    # Add trend line
    z = np.polyfit(sample_df['views'], sample_df['title_length'], 1)
    p = np.poly1d(z)
    ax3.plot(sample_df['views'], p(sample_df['views']), "g--", alpha=0.8, linewidth=2)

    ax3.set_xlabel('Views')
    ax3.set_ylabel('Title Length')
    ax3.set_title('Views vs Title Length', fontweight='bold')
    ax3.ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

    # Create legend for colors
    from matplotlib.lines import Line2D
    legend_elements = [Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=8, label='No Caps'),
                      Line2D([0], [0], marker='o', color='w', markerfacecolor='red', markersize=8, label='Has Caps')]
    ax3.legend(handles=legend_elements)

    # 4. Top Channels by Average Views
    ax4 = plt.subplot(3, 3, 4)
    channel_stats = df.groupby('channel_title').agg({
        'views': ['mean', 'count'],
        'likes': 'mean'
    }).round(0)
    channel_stats.columns = ['avg_views', 'video_count', 'avg_likes']
    top_channels = channel_stats.nlargest(8, 'avg_views')

    bars = ax4.barh(range(len(top_channels)), top_channels['avg_views'], color='lightcoral')
    ax4.set_yticks(range(len(top_channels)))
    ax4.set_yticklabels([name[:15] + '...' if len(name) > 15 else name for name in top_channels.index])
    ax4.set_xlabel('Average Views')
    ax4.set_title('Top Channels by Avg Views', fontweight='bold')
    ax4.ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

    # 5. Views Distribution (Log Scale)
    ax5 = plt.subplot(3, 3, 5)
    ax5.hist(np.log10(df['views']), bins=30, color='mediumseagreen', alpha=0.7, edgecolor='black')
    ax5.set_xlabel('Log10(Views)')
    ax5.set_ylabel('Frequency')
    ax5.set_title('Views Distribution (Log Scale)', fontweight='bold')

    # 6. Engagement Rate by Title Length
    ax6 = plt.subplot(3, 3, 6)
    # Create bins for title length
    df['length_bin'] = pd.cut(df['title_length'], bins=5, labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'])
    engagement_by_length = df.groupby('length_bin')['engagement_rate'].mean()

    bars = ax6.bar(range(len(engagement_by_length)), engagement_by_length.values, color='gold', alpha=0.8)
    ax6.set_xticks(range(len(engagement_by_length)))
    ax6.set_xticklabels(engagement_by_length.index, rotation=45)
    ax6.set_ylabel('Avg Engagement Rate')
    ax6.set_title('Engagement by Title Length', fontweight='bold')

    # 7. Most Common Words in Titles
    ax7 = plt.subplot(3, 3, 7)
    # Extract words from titles
    all_words = ' '.join(df['title']).lower()
    # Remove common stop words and punctuation
    import re
    words = re.findall(r'\b[a-z]{4,}\b', all_words)  # Words 4+ characters
    stop_words = {'that', 'this', 'with', 'from', 'they', 'have', 'will', 'your', 'about',
                  'what', 'when', 'where', 'most', 'best', 'more', 'make', 'like', 'just', 'know'}
    filtered_words = [word for word in words if word not in stop_words]

    word_freq = Counter(filtered_words).most_common(12)
    words, counts = zip(*word_freq)

    bars = ax7.bar(range(len(words)), counts, color='plum')
    ax7.set_xticks(range(len(words)))
    ax7.set_xticklabels([w.capitalize() for w in words], rotation=45, ha='right')
    ax7.set_ylabel('Frequency')
    ax7.set_title('Most Common Words in Titles', fontweight='bold')

    # 8. Correlation Heatmap
    ax8 = plt.subplot(3, 3, 8)
    numeric_cols = ['views', 'likes', 'dislikes', 'comment_count', 'title_length', 'engagement_rate']
    corr_matrix = df[numeric_cols].corr()

    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, ax=ax8, cbar_kws={"shrink": .8}, fmt='.2f')
    ax8.set_title('Metrics Correlation Matrix', fontweight='bold')

    # 9. Views by Category (Top categories)
    ax9 = plt.subplot(3, 3, 9)
    category_views = df.groupby('category_id')['views'].mean().nlargest(8)

    bars = ax9.bar(range(len(category_views)), category_views.values, color='lightblue')
    ax9.set_xticks(range(len(category_views)))
    ax9.set_xticklabels([f'Cat {cat}' for cat in category_views.index])
    ax9.set_ylabel('Average Views')
    ax9.set_title('Top Categories by Avg Views', fontweight='bold')
    ax9.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))

    plt.tight_layout()
    plt.subplots_adjust(top=0.94)
    plt.show()

    # Print detailed insights
    print(f"\n🔍 Detailed Analysis Results:")
    print("=" * 60)

    print(f"📊 Title Analysis:")
    print(f"   • Average title length: {df['title_length'].mean():.1f} ± {df['title_length'].std():.1f} chars")
    print(f"   • Titles with CAPS words: {df['contains_capitalized'].sum()} ({df['contains_capitalized'].mean()*100:.1f}%)")
    print(f"   • Longest title: {df['title_length'].max()} characters")
    print(f"   • Shortest title: {df['title_length'].min()} characters")

    print(f"\n📈 Performance Metrics:")
    print(f"   • Total views across all videos: {df['views'].sum():,}")
    print(f"   • Average views: {df['views'].mean():,.0f}")
    print(f"   • Median views: {df['views'].median():,.0f}")
    print(f"   • Most viewed video: {df['views'].max():,} views")
    print(f"   • Average engagement rate: {df['engagement_rate'].mean():.4f}")

    print(f"\n🏆 Top Performing Content:")
    top_video = df.loc[df['views'].idxmax()]
    print(f"   • Most viewed: '{top_video['title'][:60]}...'")
    print(f"     Views: {top_video['views']:,}, Likes: {top_video['likes']:,}")

    print(f"\n📺 Channel Insights:")
    print(f"   • Total unique channels: {df['channel_title'].nunique()}")
    print(f"   • Average videos per channel: {len(df) / df['channel_title'].nunique():.1f}")

    top_channel = channel_stats.index[0]
    print(f"   • Top channel: {top_channel}")
    print(f"     Average views: {channel_stats.loc[top_channel, 'avg_views']:,.0f}")

    # Show sample of the generated data
    print(f"\n📋 Sample Generated Data:")
    print(df[['title', 'channel_title', 'views', 'likes', 'title_length', 'contains_capitalized']].head(8).to_string(index=False))

    return df

# Run the complete analysis
if __name__ == "__main__":
    df_analysis = analyze_youtube_data()

    print(f"\n✅ Analysis Complete!")
    print(f"Generated clean dataset saved as 'df_analysis' with {len(df_analysis)} rows")
    print(f"All visualizations displayed above show realistic YouTube trending video patterns")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
import random

# Configuration
warnings.filterwarnings('ignore')
np.random.seed(42)
random.seed(42)

def generate_youtube_data(n_rows=1500):
    """Generate clean YouTube dataset"""

    title_templates = [
        "HOW TO {} IN 2024 (WORKS!)",
        "SHOCKING {} You Won't Believe",
        "TOP 10 {} That Will BLOW Your Mind",
        "Why {} Is Taking Over",
        "Ultimate {} Guide for Beginners",
        "24 HOURS of {} Challenge",
        "INSANE {} Transformation",
        "{} That Changed My Life"
    ]

    topics = [
        "AI Technology", "Gaming", "Cooking", "Fitness", "Travel",
        "Music Production", "Art Tutorial", "Business Tips",
        "Photography", "Coding", "Life Hacks", "Investment"
    ]

    channels = [
        "TechReview Central", "LifeHacker Pro", "Creative Studio",
        "Fitness Guru", "Travel Adventures", "Gaming Zone",
        "Business Insights", "Art Academy", "Code Academy"
    ]

    data = []
    for i in range(n_rows):
        # Generate title
        template = random.choice(title_templates)
        topic = random.choice(topics)
        title = template.format(topic)

        # Add caps to some titles
        if random.random() < 0.4:
            words = title.split()
            for j in range(min(2, len(words))):
                idx = random.randint(0, len(words)-1)
                if len(words[idx]) > 3:
                    words[idx] = words[idx].upper()
            title = " ".join(words)

        # Generate realistic metrics
        views = max(1000, int(np.random.lognormal(11, 1.5)))
        likes = max(1, int(views * (0.02 + random.random() * 0.05)))
        comments = max(0, int(likes * random.uniform(0.1, 0.4)))

        data.append({
            'title': title,
            'channel': random.choice(channels),
            'views': views,
            'likes': likes,
            'comments': comments
        })

    return pd.DataFrame(data)

def contains_caps(title):
    """Check if title has capitalized words"""
    words = str(title).split()
    return any(len(word) > 1 and word.isupper() for word in words)

def analyze_youtube():
    """Main analysis function"""

    print("🎬 Generating YouTube Dataset...")
    df = generate_youtube_data(1500)

    # Add analysis columns
    df['title_length'] = df['title'].str.len()
    df['has_caps'] = df['title'].apply(contains_caps)
    df['engagement'] = (df['likes'] + df['comments']) / df['views']

    print(f"✅ Generated {len(df)} videos from {df['channel'].nunique()} channels")
    print(f"📊 Average views: {df['views'].mean():,.0f}")
    print(f"📝 Average title length: {df['title_length'].mean():.1f} chars")
    print(f"🔤 Videos with CAPS: {df['has_caps'].mean()*100:.1f}%")

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('🎥 YouTube Analysis Dashboard', fontsize=16, fontweight='bold')

    # 1. Capitalization Pie Chart
    caps_data = df['has_caps'].value_counts()
    axes[0,0].pie(caps_data.values, labels=['No Caps', 'Has CAPS'],
                  colors=['#ff6b6b', '#4ecdc4'], autopct='%1.1f%%', startangle=45)
    axes[0,0].set_title('Titles with Capitalized Words', fontweight='bold')

    # 2. Views vs Title Length Scatter
    sample = df.sample(300, random_state=42)
    colors = ['red' if x else 'blue' for x in sample['has_caps']]
    axes[0,1].scatter(sample['views'], sample['title_length'], c=colors, alpha=0.6, s=20)
    axes[0,1].set_xlabel('Views')
    axes[0,1].set_ylabel('Title Length')
    axes[0,1].set_title('Views vs Title Length', fontweight='bold')
    axes[0,1].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

    # 3. Top Channels
    channel_avg = df.groupby('channel')['views'].mean().nlargest(6)
    axes[1,0].barh(range(len(channel_avg)), channel_avg.values, color='lightcoral')
    axes[1,0].set_yticks(range(len(channel_avg)))
    axes[1,0].set_yticklabels([name[:12] + '...' if len(name) > 12 else name
                               for name in channel_avg.index])
    axes[1,0].set_xlabel('Average Views')
    axes[1,0].set_title('Top Channels', fontweight='bold')
    axes[1,0].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

    # 4. Word Frequency
    all_words = ' '.join(df['title']).lower()
    import re
    words = re.findall(r'\b[a-z]{4,}\b', all_words)
    stop_words = {'that', 'this', 'with', 'your', 'will', 'make', 'best'}
    filtered = [w for w in words if w not in stop_words]

    word_freq = Counter(filtered).most_common(10)
    words, counts = zip(*word_freq)

    axes[1,1].bar(range(len(words)), counts, color='plum')
    axes[1,1].set_xticks(range(len(words)))
    axes[1,1].set_xticklabels([w.capitalize() for w in words], rotation=45, ha='right')
    axes[1,1].set_ylabel('Frequency')
    axes[1,1].set_title('Common Words in Titles', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Summary insights
    print(f"\n🔍 Key Insights:")
    print(f"   • Most viewed video: {df['views'].max():,} views")
    print(f"   • Top channel: {channel_avg.index[0]} (avg: {channel_avg.iloc[0]:,.0f} views)")
    print(f"   • Engagement rate: {df['engagement'].mean():.4f}")

    # Sample data
    print(f"\n📋 Sample Data:")
    sample_cols = ['title', 'channel', 'views', 'likes', 'has_caps']
    print(df[sample_cols].head(5).to_string(index=False))

    return df

# Run analysis
if __name__ == "__main__":
    df_result = analyze_youtube()
    print(f"\n✅ Analysis complete! Dataset ready for further exploration.")