# In [None]:  (notebook cell prompt left over from the export)
# ==============================================================================
# FINAL SCRIPT WITH HIGH-ENGAGEMENT ANALYSIS (CORRECTED)
# ==============================================================================

# STEP 1: SETUP
print("STEP 1: Setting up the environment...")
!pip install wordcloud -q
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
print("✅ Environment setup is complete.\n")


# STEP 2: MAIN ANALYSIS
import pandas as pd
import numpy as np
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# --- Loading Your Data from Colab's Session Storage ---
print("STEP 2: Loading data...")
try:
    # Fan posts are read first, then the official account's posts.
    fans_df = pd.read_csv('your_fans_data.csv')
    official_df = pd.read_csv('your_official_data.csv')
except FileNotFoundError:
    # Tell the user how to recover, then re-raise so later cells do not run
    # against missing data.
    print("❌ FATAL ERROR: Files not found! Please re-upload your CSV files.")
    raise
else:
    print("✅ Data loaded successfully!")

# --- NLP & Data Preprocessing Pipeline ---

# 1. Likes Conversion
# Multipliers for abbreviated counts such as "1.2K" or "3M".
_LIKE_SUFFIX_MULTIPLIERS = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}


def convert_likes_to_int(like_value):
    """Convert one raw 'likes' cell into an integer count.

    Args:
        like_value: A cell value from the 'likes' column. Strings may contain
            thousands separators ("1,234"), surrounding whitespace, a decimal
            part ("12.7", truncated toward zero) or a trailing K/M/B
            abbreviation ("1.2K" -> 1200). Numbers are passed through int().

    Returns:
        The integer count, or 0 when the value is missing, blank, not
        numeric, or not finite.
    """
    if pd.isna(like_value):
        return 0
    try:
        if isinstance(like_value, str):
            cleaned = like_value.replace(',', '').strip()
            multiplier = 1
            # Slicing keeps this safe for the empty string.
            suffix = cleaned[-1:].lower()
            if suffix in _LIKE_SUFFIX_MULTIPLIERS:
                multiplier = _LIKE_SUFFIX_MULTIPLIERS[suffix]
                cleaned = cleaned[:-1].strip()
            if not cleaned:
                return 0
            number = float(cleaned)
            if multiplier == 1:
                return int(number)
            # round() avoids 1199 from float artefacts like 1.2 * 1000.
            return int(round(number * multiplier))
        return int(like_value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers infinities, which int() cannot represent.
        return 0

print("\nSTEP 3: Preprocessing data...")
# Add a numeric 'likes_int' column to each frame, derived from raw 'likes'.
fans_df['likes_int'] = fans_df['likes'].apply(convert_likes_to_int)
official_df['likes_int'] = official_df['likes'].apply(convert_likes_to_int)
print(" -> 'likes' column converted to integers.")

# 2. Text Preprocessing Setup
lemmatizer = WordNetLemmatizer()
english_stop_words = set(stopwords.words('english'))
# Words added by hand on top of the NLTK English list.
custom_stop_words = {
    'delhi', 'capitals', 'dc', 'ipl', 'hai', 'nayi', 'dilli', 'yeh',
    'match', 'game', 'vs', 'year', 'go', 'let', 'come', 'one',
}
hinglish_stop_words = {
    'aur', 'bhi', 'bhai', 'bahut', 'chalo', 'gaya', 'ho', 'hua', 'iss',
    'ka', 'ke', 'ki', 'ko', 'kya', 'mein', 'se', 'tha', 'the', 'toh',
    'kar', 'jeetna', 'saal', 'bura', 'laga', 'jeetenge',
}
# Single lookup set consulted by preprocess_text.
stop_words = english_stop_words | custom_stop_words | hinglish_stop_words

def preprocess_text(text):
    """Reduce a caption to a space-joined string of lemmatized keywords.

    The text is lower-cased; URLs, @mentions and '#' signs are removed; every
    character other than a-z and whitespace is dropped; then tokens of one or
    two characters and stop words are filtered out and the rest lemmatized.

    Args:
        text: The raw caption (optionally with hashtags appended).

    Returns:
        The cleaned tokens joined by single spaces, or "" when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^a-z\s]', '', text)

    processed_tokens = []
    for word in re.findall(r'\b\w+\b', text):
        if len(word) <= 2 or word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Check again after lemmatization: an inflected form such as
        # "matches" is not in the stop list itself but lemmatizes to the
        # stop word "match", and would otherwise leak into the counts.
        if lemma in stop_words:
            continue
        processed_tokens.append(lemma)
    return " ".join(processed_tokens)

# Apply text cleaning
# The two files have different structures: fan rows carry a caption and
# hashtags, which are joined with a single space.
fans_df['full_text'] = (
    fans_df['caption'].fillna('').add(' ').add(fans_df['hashtags'].fillna(''))
)
# 'official_df' has no 'hashtags' column, so the caption alone is used.
official_df['full_text'] = official_df['caption'].fillna('')

# Run the same cleaning function over both frames.
fans_df['cleaned_text'] = fans_df['full_text'].map(preprocess_text)
official_df['cleaned_text'] = official_df['full_text'].map(preprocess_text)
print(" -> Text preprocessing complete.")
print("✅ Preprocessing finished successfully.")

# --- Helper Functions for Analysis & Visualization ---
def get_top_n_words(corpus, n=None):
    """Return the most frequent words in a corpus of cleaned documents.

    Args:
        corpus: A pandas Series of strings, one document per entry.
        n: Number of words to return; ``None`` returns every word.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. The
        list is empty when the corpus is empty, contains only blank strings,
        or yields no tokens at all.
    """
    if corpus.empty or corpus.str.strip().eq('').all():
        return []
    vectorizer = CountVectorizer()
    try:
        # fit_transform tokenizes the corpus once rather than once for fit()
        # and again for transform().
        bag_of_words = vectorizer.fit_transform(corpus)
    except ValueError as err:
        # CountVectorizer reports "empty vocabulary" when no token survives
        # its tokenizer; treat that as "nothing to count" and let any other
        # ValueError propagate.
        if 'empty vocabulary' not in str(err):
            raise
        return []
    # Flatten the per-column totals to 1-D so indexing does not depend on
    # whether the sparse sum comes back as a matrix or an array.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

def plot_frequent_words(top_words, title, color='skyblue'):
    """Draw a horizontal bar chart of word frequencies.

    Args:
        top_words: A list of ``(word, frequency)`` pairs, in display order.
        title: The chart title.
        color: The single colour used for every bar.

    Returns:
        None. Prints a notice and draws nothing when ``top_words`` is empty.
    """
    if not top_words:
        print(f"Skipping plot for '{title}' as there are no words to display.")
        return
    df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
    # Scale the height with the number of bars (0.3 inch each, minimum 6) so
    # a 50-word chart keeps its 15-inch height and shorter lists are not
    # stretched over the same canvas.
    plt.figure(figsize=(12, max(6, 0.3 * len(df))))
    # A single bar colour belongs in `color`; passing a one-item `palette`
    # without `hue` is deprecated in recent seaborn and warns about cycling.
    sns.barplot(x='Frequency', y='Word', data=df, color=color)
    plt.title(title, fontsize=16)
    plt.xlabel('Frequency', fontsize=12)
    plt.ylabel('Word', fontsize=12)
    plt.show()

def plot_word_cloud(corpus, title):
    """Show a word cloud built from every document in ``corpus``.

    Args:
        corpus: An iterable of strings; they are joined with spaces.
        title: The figure title.

    Returns:
        None. Prints a notice and draws nothing when the joined text is
        blank.
    """
    combined = " ".join(corpus)
    if combined.strip() == "":
        print(f"Cannot generate word cloud for '{title}' because there are no words to display.")
        return

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
    ).generate(combined)

    plt.figure(figsize=(15, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.show()


# ==============================================================================
# NEW SECTION: ANALYSIS OF HIGH-ENGAGEMENT POSTS (TOP 50% BY LIKES)
# ==============================================================================
print("\n" + "="*60)
print("ANALYSIS OF HIGH-ENGAGEMENT POSTS (TOP 50% BY LIKES)")
print("="*60 + "\n")


def _analyze_high_engagement(df, median_label, post_label, color, top_n=50):
    """Plot the top words and a word cloud for posts at or above median likes.

    Args:
        df: A frame with 'likes_int' and 'cleaned_text' columns.
        median_label: Prefix for the printed median line (e.g. "Fan Posts").
        post_label: Post type used in plot titles (e.g. "Fan"); its
            lower-cased form is used in the "no posts" message.
        color: Bar colour for the frequency chart.
        top_n: How many words to show in the frequency chart.

    Returns:
        A ``(median, subset, top_words)`` tuple: the median of 'likes_int',
        a copy of the rows whose likes are >= that median, and the
        ``(word, count)`` list (empty when the subset is empty).
    """
    likes_median = df['likes_int'].quantile(0.5)
    print(f"{median_label} Median Likes (Top 50% Threshold): {likes_median:,.0f}")
    # ">=" keeps rows tied with the median, so the subset can exceed half.
    top_half_df = df[df['likes_int'] >= likes_median].copy()

    if top_half_df.empty:
        print(f"No {post_label.lower()} posts found in the top 50% by likes to analyze.")
        return likes_median, top_half_df, []

    top_words = get_top_n_words(top_half_df['cleaned_text'], top_n)
    plot_frequent_words(
        top_words,
        f"Top {top_n} Words from High-Engagement {post_label} Posts (Top 50% by Likes)",
        color=color,
    )
    plot_word_cloud(
        top_half_df['cleaned_text'],
        f"Word Cloud from High-Engagement {post_label} Posts",
    )
    return likes_median, top_half_df, top_words


# --- Analysis for Fans' High-Engagement Posts ---
fan_likes_median, top_50_percent_fans_df, top_50_fan_words = _analyze_high_engagement(
    fans_df, "Fan Posts", "Fan", color='#004C97'
)


# --- Analysis for Official Account's High-Engagement Posts ---
print("-" * 60)
official_likes_median, top_50_percent_official_df, top_50_official_words = _analyze_high_engagement(
    official_df, "Official Account", "Official", color='#EF1E23'
)


# ==============================================================================
# ORIGINAL OVERALL ANALYSIS (Still useful for comparison)
# This part is left unchanged to show Top 20 overall words.
# ==============================================================================
# Section banner: rule, heading, rule, each on its own line.
print("\n" + "="*60, "OVERALL ANALYSIS (ALL POSTS)", "="*60 + "\n", sep="\n")

# Fan posts: 20 most frequent words across every post.
top_fan_words_overall = get_top_n_words(fans_df['cleaned_text'], n=20)
plot_frequent_words(
    top_words=top_fan_words_overall,
    title="Top 20 Words from ALL Fan Posts",
    color='#004C97',
)

# Official account: same view over every post.
top_official_words_overall = get_top_n_words(official_df['cleaned_text'], n=20)
plot_frequent_words(
    top_words=top_official_words_overall,
    title="Top 20 Words from ALL Official Posts",
    color='#EF1E23',
)