In [2]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import os

import nest_asyncio
nest_asyncio.apply()

In [3]:

# === Influencer decision logic ===
def is_influencer(followers, verified, min_followers=10_000):
    """Decide whether an account looks like an influencer from its follower count.

    Args:
        followers: Follower count as scraped text, e.g. "8.8M followers",
            "12.5k" or "5,390 following". Non-string values are converted
            with ``str()``; ``None`` counts as zero followers.
        verified: Whether the profile shows a verified badge. Kept for
            interface compatibility: verified and unverified accounts are
            held to the same follower threshold, so it does not change the
            verdict.
        min_followers: The follower count must be strictly greater than this
            value for a positive verdict. Defaults to 10,000.

    Returns:
        A human-readable verdict string.
    """
    def convert_to_number(s):
        """Parse a follower label into a number; unparseable text yields 0."""
        if s is None:
            return 0
        s = str(s).lower().replace(",", "").strip()
        # Keep only digits, the decimal point and the k/m magnitude suffixes.
        s = re.sub(r'[^0-9.km]', '', s)

        # 'm' takes precedence over 'k', and only the matched suffix is removed.
        for suffix, multiplier in (("m", 1_000_000), ("k", 1_000)):
            if suffix in s:
                digits, scale = s.replace(suffix, ""), multiplier
                break
        else:
            digits, scale = s, 1

        # The conversion sits inside the try so that suffix-only or malformed
        # text ("k", "1.2.3m") falls back to 0 instead of raising ValueError.
        try:
            return float(digits) * scale
        except ValueError:
            return 0

    followers_num = convert_to_number(followers)

    if followers_num > min_followers:
        return "✅ This person is likely an INFLUENCER."
    return "❌ This person is NOT likely an influencer."


In [4]:

# === Settings ===
# Credentials are read from the environment so they never end up in the
# notebook file, its outputs or version control. Export INSTAGRAM_USERNAME and
# INSTAGRAM_PASSWORD before starting the kernel.
# SECURITY: a password that was previously hard-coded here must be treated as
# leaked and rotated.
USERNAME = os.environ.get("INSTAGRAM_USERNAME", "")
PASSWORD = os.environ.get("INSTAGRAM_PASSWORD", "")
if not (USERNAME and PASSWORD):
    print("⚠️ INSTAGRAM_USERNAME / INSTAGRAM_PASSWORD are not set; the Instagram login will fail.")

TARGET_USER = "mostlysane"  # Replace with actual username
POST_COUNT = 3  # Maximum number of posts whose comments are scraped
comments_data = []  # Scraped comment records: {"Post Number": int, "Comment": str}

async def _dismiss_login_dialogs(page):
    """Click through the optional post-login prompts; a missing prompt is not an error."""
    for button_text in ["Not Now", "Save Info", "Turn on Notifications"]:
        try:
            btn = await page.wait_for_selector(
                f'//button[contains(text(), "{button_text}")]', timeout=5000
            )
            await btn.click()
            await page.wait_for_timeout(2000)
        except Exception:
            # Best effort: the prompt did not show up in time or could not be
            # clicked. `Exception` (not a bare except) lets task cancellation
            # and KeyboardInterrupt propagate.
            continue


async def _is_private_profile(page):
    """Return True when the first <h2> on the page carries the private-account text."""
    try:
        heading = await page.query_selector("h2")
        heading_text = await heading.inner_text() if heading else ""
    except Exception:
        return False
    return "This Account is Private" in heading_text


async def _read_profile(page, username):
    """Scrape and print the profile header; return (followers, following, is_verified)."""
    followers, following, is_verified = "N/A", "N/A", False
    try:
        stats = await page.query_selector_all("header section ul li")
        if len(stats) >= 3:
            followers = await stats[1].inner_text()
            following = await stats[2].inner_text()
        is_verified = await page.query_selector("header svg[aria-label='Verified']") is not None
        bio_elem = await page.query_selector("header section div.-vDIg > span")
        about = await bio_elem.inner_text() if bio_elem else "N/A"

        print(f"\n📄 Profile: {username}")
        print(f"✅ Verified: {is_verified}")
        print(f"👥 Followers: {followers}")
        print(f"➡️ Following: {following}")
        print(f"📝 Bio: {about}")
    except Exception as e:
        print("❌ Error extracting profile data:", str(e))
        followers, following, is_verified = "0", "0", False
    return followers, following, is_verified


async def _collect_post_links(page, limit):
    """Scroll the profile once and return up to `limit` unique post URLs in page order."""
    await page.mouse.wheel(0, 3000)
    await page.wait_for_timeout(3000)

    post_links = []
    for anchor in await page.query_selector_all("a"):
        if len(post_links) >= limit:
            break
        href = await anchor.get_attribute("href")
        if not href or "/p/" not in href:
            continue
        # Only relative hrefs need the site origin prepended.
        full_url = href if href.startswith("http") else "https://www.instagram.com" + href
        if full_url not in post_links:
            post_links.append(full_url)
    return post_links


async def _scrape_post_comments(page, post_url, post_num):
    """Open one post, expand and scroll its comments, and return the comment records."""
    await page.goto(post_url)
    await page.wait_for_timeout(5000)

    # Expand the comment list; stop as soon as the button is gone or unclickable.
    for _ in range(10):
        try:
            more_btn = await page.query_selector('//button[contains(text(), "Load more comments")]')
            if not more_btn:
                break
            await more_btn.click()
            await page.wait_for_timeout(2000)
        except Exception:
            break

    for _ in range(10):
        await page.mouse.wheel(0, 1500)
        await page.wait_for_timeout(1000)

    records = []
    try:
        comment_blocks = await page.query_selector_all("ul ul div li span")
    except Exception as e:
        print(f"⚠️ Could not read comments on this post: {e}")
        return records

    for block in comment_blocks:
        try:
            text = (await block.inner_text()).strip()
        except Exception:
            # Reading this element failed; skip it and keep the rest.
            continue
        if text:
            records.append({"Post Number": post_num, "Comment": text})
            print(f"💬 {text}")

    if not records:
        print("⚠️ No comments found on this post.")
    return records


async def fetch_instagram_data():
    """Log in to Instagram, scrape TARGET_USER's profile and recent post comments.

    Uses the module-level USERNAME, PASSWORD, TARGET_USER and POST_COUNT
    settings. The module-level ``comments_data`` list is reset and then filled
    with this run's comments, so repeated calls do not mix results from
    earlier runs. The comments are also written to
    ``comments_postwise_<TARGET_USER>.csv``.

    Returns:
        dict with keys:
            "df": DataFrame of the scraped comments (empty for private accounts),
            "csv_path": path of the written CSV, or None for private accounts,
            "followers" / "following": the scraped stat strings ("0" when the
                account is private or the profile header could not be read),
            "verified": bool, whether a verified badge was found,
            "is_private": bool.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Login
            await page.goto("https://www.instagram.com/accounts/login/")
            await page.wait_for_timeout(5000)
            await page.fill("input[name='username']", USERNAME)
            await page.fill("input[name='password']", PASSWORD)
            await page.click("button[type='submit']")
            await page.wait_for_timeout(7000)

            await _dismiss_login_dialogs(page)

            await page.goto(f"https://www.instagram.com/{TARGET_USER}/")
            await page.wait_for_timeout(5000)

            if await _is_private_profile(page):
                print(f"❌ The account @{TARGET_USER} is PRIVATE.")
                return {
                    "df": pd.DataFrame(),
                    "csv_path": None,
                    "followers": "0",
                    "following": "0",
                    "verified": False,
                    "is_private": True
                }

            followers, following, is_verified = await _read_profile(page, TARGET_USER)

            post_links = await _collect_post_links(page, POST_COUNT)
            print(f"📌 {len(post_links)} post(s) found.\n")

            # Start from a clean slate so a re-run (or a changed TARGET_USER)
            # never inherits comments collected by a previous call.
            comments_data.clear()
            for post_num, post_url in enumerate(post_links, start=1):
                print(f"🔍 Scraping comments for Post {post_num}")
                comments_data.extend(await _scrape_post_comments(page, post_url, post_num))
        finally:
            # Close the browser on every exit path, including errors mid-scrape.
            await browser.close()

        df = pd.DataFrame(comments_data)
        csv_path = f"comments_postwise_{TARGET_USER}.csv"
        df.to_csv(csv_path, index=False)
        print(f"\n✅ Comments saved to: {csv_path}")

        return {
            "df": df,
            "csv_path": csv_path,
            "followers": followers,
            "following": following,
            "verified": is_verified,
            "is_private": False
        }

def clean_comment(comment):
    """Reduce a scraped comment span to plain ASCII text, or '' if it is noise.

    Returns '' when the text (after dropping non-ASCII characters) is a single
    handle-like token, or when any digit remains after the UI words
    "Reply", "like(s)" and "See translation" have been removed. Otherwise
    returns the text with whitespace collapsed and trimmed.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', comment)

    # A lone token of letters, digits, '_' and '.' is treated as a username echo.
    is_handle_like = re.match(r'^[A-Za-z0-9_.]+$', ascii_only) is not None
    if is_handle_like:
        return ''

    without_ui_words = re.sub(
        r'\b(?:Reply|likes?|See translation)\b',
        '',
        ascii_only,
        flags=re.IGNORECASE,
    )
    normalized = re.sub(r'\s+', ' ', without_ui_words).strip()

    # Anything still containing a digit (timestamps, like counts) is discarded.
    return '' if re.search(r'\d', normalized) else normalized

async def main():
    """Scrape the target account, run display-only sentiment analysis, print a verdict.

    Steps:
      1. Scrape the profile and comments via ``fetch_instagram_data()``.
      2. Reload the comments CSV, clean it with ``clean_comment`` and write
         ``cleaned_comments_no_numbers.csv``.
      3. Train a TF-IDF + Multinomial Naive Bayes model on
         ``instagram_sentiment_dataset.csv`` (columns ``Comment`` and ``Label``),
         print its accuracy, and write the predictions for the scraped
         comments to ``cleaned_comments_with_predictions.csv``.
      4. Print the final decision from ``is_influencer(followers, verified)``.

    The sentiment results are informational only; they do not affect the
    final decision.
    """
    result = await fetch_instagram_data()

    if result.get("is_private", False):
        print(f"\n🧠 Final Decision:\n❌ The account @{TARGET_USER} is PRIVATE and NOT an influencer.")
        return  # Stop further processing if private

    csv_path = result['csv_path']
    followers = result['followers']
    verified = result['verified']

    def stop_without_comments(reason):
        """Print why the analysis stops, followed by the negative verdict."""
        # NOTE(review): these exits declare "NOT an influencer" without looking
        # at the follower count, unlike the later exits below — confirm intent.
        print(f"\n⚠️ {reason}")
        print(f"\n🧠 Final Decision:\n❌ The account @{TARGET_USER} is NOT an influencer.")

    # Check if CSV path is valid and file exists
    if not csv_path or not os.path.exists(csv_path):
        stop_without_comments("Comments CSV not found, stopping analysis.")
        return

    # Check if CSV file is empty
    if os.path.getsize(csv_path) == 0:
        stop_without_comments("Comments CSV is empty. No comments found.")
        return

    # An empty DataFrame is written as a header-less file, which read_csv
    # rejects with EmptyDataError rather than returning an empty frame.
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        stop_without_comments("Comments CSV is empty or malformed.")
        return

    if df.empty:
        stop_without_comments("Comments CSV has no data rows.")
        return

    comments = df['Comment'].astype(str)

    cleaned_comments = comments.apply(clean_comment)
    cleaned_comments = cleaned_comments[cleaned_comments != '']
    cleaned_df = pd.DataFrame({'Comment': cleaned_comments})
    cleaned_df.to_csv("cleaned_comments_no_numbers.csv", index=False)

    # If cleaning removed every comment there is nothing to classify, and
    # predict() would raise on zero samples; skip straight to the verdict.
    if cleaned_df.empty:
        print("\n⚠️ No usable comments left after cleaning; skipping sentiment analysis.")
        print(f"\n🧠 Final Decision:\n{is_influencer(followers, verified)}")
        return

    # === Sentiment analysis (for display only) ===
    try:
        labeled_df = pd.read_csv("instagram_sentiment_dataset.csv")  # Must exist
        missing_columns = {'Comment', 'Label'} - set(labeled_df.columns)
        if missing_columns:
            raise ValueError(f"missing column(s): {sorted(missing_columns)}")
        # Rows without text or label cannot be vectorised or used for training.
        labeled_df = labeled_df.dropna(subset=['Comment', 'Label'])
        if labeled_df.empty:
            raise ValueError("no labelled rows")
    except Exception as e:
        print(f"❌ Sentiment dataset missing or error reading: {e}")
        print(f"\n🧠 Final Decision:\n{is_influencer(followers, verified)}")
        return

    X_train, X_test, y_train, y_test = train_test_split(
        labeled_df['Comment'].astype(str), labeled_df['Label'], test_size=0.2, random_state=42
    )

    model = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('classifier', MultinomialNB())
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n✅ Sentiment Model Accuracy: {accuracy:.2f}")
    print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

    # Predict on exactly the rows being annotated so the lengths always match.
    cleaned_df['Predicted_Sentiment'] = model.predict(cleaned_df['Comment'])
    cleaned_df.to_csv("cleaned_comments_with_predictions.csv", index=False)

    sentiment_counts = cleaned_df['Predicted_Sentiment'].value_counts()
    print("\n📊 Sentiment Distribution:\n", sentiment_counts)

    # === Final influencer decision ===
    print(f"\n🧠 Final Decision:\n{is_influencer(followers, verified)}")

# Run in Jupyter or interactive environment: top-level `await` is only accepted
# by IPython/Jupyter-style async REPLs, not by a plain Python script.
await main()


📄 Profile: mostlysane
✅ Verified: True
👥 Followers: 8.8M followers
➡️ Following: 5,390 following
📝 Bio: N/A
📌 3 post(s) found.

🔍 Scraping comments for Post 1
💬 adii.redkar
💬 adii.redkar
💬 AREY VARUN BHAIYA
💬 10h36 likesReply
💬 36 likes
💬 Reply
💬 prajakta__forever
💬 prajakta__forever
💬 Birthdayy countdown starts cutiee 👀🥳
💬 11h5 likesReply
💬 5 likes
💬 Reply
💬 _.mostlyprajakta._
💬 _.mostlyprajakta._
💬 🔥🙌🙌
💬 11h1 likeReply
💬 1 like
💬 Reply
💬 _chahathasija
💬 _chahathasija
💬 ❤️😍
💬 11h1 likeReply
💬 1 like
💬 Reply
💬 _ig.akash_
💬 _ig.akash_
💬 LoveLove 💕
💬 11h1 likeReply
💬 1 like
💬 Reply
💬 _.mostlyprajakta._
💬 _.mostlyprajakta._
💬 That freaking contagious smile of yours!!🥹🥹❤️
💬 11h1 likeReply
💬 1 like
💬 Reply
💬 mahima_chettri17
💬 mahima_chettri17
💬 😍❤️
💬 7hReply
💬 Reply
💬 _parth_0102
💬 _parth_0102
💬 Areee varun bhaiya
💬 11h10 likesReply
See translation
💬 10 likes
💬 Reply
💬 See translation
💬 prajakta__forever
💬 prajakta__forever
💬 Youuu happpy ,, we happpyyy 😭
💬 11h2 likesReply
💬 2 likes
💬 Rep