# Data Identification

In [209]:
# Basic libraries
import pandas as pd
import numpy as np

In [211]:
# NLP features (optional later)
from textblob import TextBlob   # for sentiment

In [212]:
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [None]:
#Import data
data=pd.read_csv(r"D:\Datathon\DatasetCombined\CombinedComments.csv")

In [214]:
#Take a preview
#Displays the first 5 rows of the dataset to get an initial look at the data structure and sample values.
data.head(5)

Unnamed: 0,Source.Name,kind,commentId,channelId,videoId,authorId,textOriginal,parentCommentId,likeCount,publishedAt,updatedAt
0,comments1.csv,youtube#comment,1781382,14492,74288,2032536,PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...,,0,16/8/2023 5:48,16/8/2023 5:48
1,comments1.csv,youtube#comment,289571,14727,79618,3043229,Apply mashed potato juice and mixed it with curd,3198066.0,0,2/10/2023 21:08,2/10/2023 21:08
2,comments1.csv,youtube#comment,569077,3314,51826,917006,69 missed calls from mars👽,,0,31/5/2024 20:03,31/5/2024 20:03
3,comments1.csv,youtube#comment,2957962,5008,58298,1853470,Baaa,,0,13/2/2024 23:48,13/2/2024 23:48
4,comments1.csv,youtube#comment,673093,21411,1265,2584166,you look like raven from phenomena raven no cap,,0,16/2/2020 6:28,16/2/2020 6:28


In [215]:
#Displays the last 5 rows of the dataset to check how the dataset ends and verify data consistency.
data.tail(5)

Unnamed: 0,Source.Name,kind,commentId,channelId,videoId,authorId,textOriginal,parentCommentId,likeCount,publishedAt,updatedAt
9990,comments5.csv,youtube#comment,921956,49359,1074,1636820,How she do it agar ma krugi meko sb face p he ...,,0,17/3/2025 12:27,17/3/2025 12:27
9991,comments5.csv,youtube#comment,1937406,18820,79136,1976114,Табий гузалроқ эди катта ёшликларга ухшади 😢😢,,0,28/3/2024 3:31,28/3/2024 3:31
9992,comments5.csv,youtube#comment,2639661,28652,25093,2644409,Beautiful 😍😍😍❤❤❤,,2,14/10/2021 15:46,14/10/2021 15:46
9993,comments5.csv,youtube#comment,1199495,1911,39559,1995432,Chloé eau de perfume is feminine and delicate,,1,1/5/2022 13:56,1/5/2022 13:56
9994,comments5.csv,youtube#comment,4656204,23924,90564,1528784,You can tell by her arms that she works out 😄💪🏼,,3,16/2/2023 19:28,16/2/2023 19:28


In [216]:
#General information
#Provides a summary of the dataset including column names, data types, non-null counts, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9995 entries, 0 to 9994
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Source.Name      9995 non-null   object 
 1   kind             9995 non-null   object 
 2   commentId        9995 non-null   int64  
 3   channelId        9995 non-null   int64  
 4   videoId          9995 non-null   int64  
 5   authorId         9995 non-null   int64  
 6   textOriginal     9995 non-null   object 
 7   parentCommentId  1088 non-null   float64
 8   likeCount        9995 non-null   int64  
 9   publishedAt      9995 non-null   object 
 10  updatedAt        9995 non-null   object 
dtypes: float64(1), int64(5), object(5)
memory usage: 859.1+ KB


In [217]:
# Descriptive stats for like_count column only
print(data["likeCount"].describe())

count     9995.000000
mean        15.876138
std        452.085664
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      32259.000000
Name: likeCount, dtype: float64


In [218]:
print(data["textOriginal"].sample(3, random_state=42), "\n")

3125    So stormi is gonna reproduce ta product ok 💀 o...
1441    Are tu bolti hui bilkul bhi achi  nahi lagti  ...
4510                             😱 your so beautiful ❤️❤️
Name: textOriginal, dtype: object 



# Data Preprocessing

In [219]:
#Sums up the number of missing (null) entries in each column.
missing_values = data.isnull().sum()

#Prints the count of missing values for all columns to decide if imputation is necessary.
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
Source.Name           0
kind                  0
commentId             0
channelId             0
videoId               0
authorId              0
textOriginal          0
parentCommentId    8907
likeCount             0
publishedAt           0
updatedAt             0
dtype: int64


In [None]:
#Detect comment beside English

from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # make results reproducible

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"  # if detection fails

# New column for language
data["language"] = data["textOriginal"].apply(detect_language)

print("Language distribution:")
print(data["language"].value_counts().head(10))  # see top 10 detected languages

# Filter for English only 
data = data[data["language"] == "en"]

Language distribution:
language
en         5241
unknown     947
id          430
so          258
fr          216
de          210
af          199
ro          194
tl          189
et          186
Name: count, dtype: int64


In [222]:
#Clean original text 

def clean_text(text):
    # normalize case
    text = str(text).lower()  
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  
    # remove mentions (@user) 
    text = re.sub(r'@\w+', '', text)                      
    # hashtags: remove "#" but keep the word
    text = text.replace("#", "")
    # keep emojis, remove other unwanted characters
    text = re.sub(r'[^a-zA-Z0-9\s\U0001F600-\U0001F64F]', '', text)
    # Remove repeated punctuation
    text = re.sub(r'([!?.,])\1{2,}', r'\1', text)
    # Remove filler words
    text = re.sub(r"\b(lol|omg|idk|lmao|haha|hehe)\b", "", text)
    
    return text.strip()

data["cleaned_comment"] = data["textOriginal"].apply(clean_text)

In [223]:
# Quick check
print(data[["textOriginal", "cleaned_comment"]].sample(5, random_state=25))

                                           textOriginal  \
7093  Next time it happens don't be afraid to ask. M...   
784                   It’s that time of the Ear again 😂   
3874                           THE HARRY STYLES SOUND ❤   
3534  Why do i think latina and baddie mameup is so ...   
4051  But u r looks different without makeup mam ......   

                                        cleaned_comment  
7093  next time it happens dont be afraid to ask men...  
784                    its that time of the ear again 😂  
3874                             the harry styles sound  
3534  why do i think latina and baddie mameup is so ...  
4051  but u r looks different without makeup mam m t...  


# Sentiment Analysis

In [224]:
# Emoji helper
def emoji_sentiment(text):
    if any(e in text for e in ["❤","💕","😘","😊","😂","😍"]):
        return "positive"
    if any(e in text for e in ["😡","😠","💔","😢"]):
        return "negative"
    return None

In [225]:
# Main sentiment function
def get_sentiment(text):
    # Check emoji sentiment first
    emoji_score = emoji_sentiment(text)
    if emoji_score:
        return emoji_score

    # Fallback to TextBlob
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return "positive"
    elif analysis.sentiment.polarity < 0:
        return "negative"
    else:
        return "neutral"

In [226]:
# Apply to dataset
data["sentiment"] = data["cleaned_comment"].apply(get_sentiment)

In [228]:
# Show some sample results
print("\nSample Sentiment Results:")
print(data[["cleaned_comment", "sentiment"]].sample(10, random_state=19))


Sample Sentiment Results:
                                        cleaned_comment sentiment
1806        the editing is toptier seriously impressive  positive
6831                              me too bro me too 😂😂😂  positive
1686          but payton is your best friend is it over  positive
9265  could you do a unicorn inspired one \nlove you...  positive
3937  o everyone who says shes a man this comment if...   neutral
4659  extremely short you made his eyebrows much sho...  negative
8501                              more like xqc haircut  positive
7441  you should use the ice cream hair glue brand i...  positive
3596                                            urqueen   neutral
9232                                        pinkand red   neutral


In [229]:
# Show overall distribution
print("\nSentiment Distribution (Counts):")
print(data["sentiment"].value_counts())

print("\nSentiment Distribution (Percentage):")
print(data["sentiment"].value_counts(normalize=True) * 100)


Sentiment Distribution (Counts):
sentiment
positive    2825
neutral     1795
negative     621
Name: count, dtype: int64

Sentiment Distribution (Percentage):
sentiment
positive    53.901927
neutral     34.249189
negative    11.848884
Name: proportion, dtype: float64


# Spam Detection

In [230]:
import re

def is_spam(text):
    text = str(text).lower().strip()
    
    # 1. Emoji handling
    emoji_list = "❤💕😘😊😂😍😡😠💔😢😮😱🥰🔥✨💯👍🙏"
    emoji_count = sum(1 for ch in text if ch in emoji_list)

    # Emoji-only (spammy)
    if emoji_count > 5 and len(text.split()) == 0:
        return True

    # Emojis dominate (>60% of content is emojis)
    if emoji_count > 0 and (emoji_count / max(1, len(text))) > 0.6:
        return True

    # 2. Suspicious links or promo
    if "http" in text or "www" in text or "bit.ly" in text or "t.me" in text:
        return True

    # 3. Very short meaningless comment (only 1-2 chars)
    if len(text.split()) == 1 and len(text) < 3:
        return True

    # 4. Nonsense/random short alphanumeric mix
    if re.fullmatch(r"[a-zA-Z]*[0-9]+[a-zA-Z]*", text) and len(text) <= 6:
        return True

    # 5. Spammy phrases (hard rule)
    spammy_phrases = [
        "follow me", "check my channel", "subscribe", "buy now", 
        "click here", "dm me", "promo code", "giveaway", "visit my page"
    ]
    if any(phrase in text for phrase in spammy_phrases):
        return True
    
    # 6. Numbers + symbols together (common spam pattern)
    if re.search(r"[0-9]+.*[$%&]+|[$%&]+.*[0-9]+", text):
        return True
    
    # 7. Excessive punctuation or symbols
    if re.search(r"[!?.]{3,}", text) or re.search(r"[$]{3,}", text):
        return True
    
    # 8. Foreign/unicode junk (but allow common emojis handled earlier)
    if re.search(r"[^\x00-\x7F]+", text):
        return True

    return False

# Apply spam detection
data["is_spam"] = data["cleaned_comment"].apply(is_spam)

# Show spam vs non-spam
print("\nSpam distribution:")
print(data["is_spam"].value_counts())


Spam distribution:
is_spam
False    4223
True     1018
Name: count, dtype: int64


In [231]:
# Show a random sample of 20 spam comments
print("\nExample Spam Comments:")
print(data[data["is_spam"] == True][["cleaned_comment"]].sample(20, random_state=17))


Example Spam Comments:
                                        cleaned_comment
705                                you look beautiful 😍
7267                   your eyes very much beautiful 😊😊
3352  she was the only one with the special paint on...
8855  first time i saw her she was an 80 year old wo...
1900           you did understand our india flag a fun😮
8695                      thank you dear 😘 please share
1465                   help the eybrow raise i cant 😭😭😭
4475                                       it was 1970😅
5656  dude thats it bro i was literally the girl the...
3998  shes naturally awe 😮 she dont need a wig or ma...
7294                        im a girl  what this is to😂
1832               why u fine ashl in every single one😂
1714  how is shrek fiona😂😂😂😂😂😂 do you still have a d...
628          night look is mind blowing stay blessed😊😊😊
6123  even i did the same thing 6 months back  regre...
1360                 imagine if henry reacted to this 😂
2492                    

# Categories

In [232]:
# Expanded keyword dictionary

phrase_keywords = {
    "skincare": ["face mask", "face wash", "vitamin c", "anti aging", "anti-aging", "sheet mask", "eye cream", "glow up"],
    "makeup":  ["eye shadow", "lip gloss", "beauty blender", "beautyblender", "eye brow pencil", "foundation shade"],
    "fragrance": ["eau de parfum", "eau de toilette", "top notes", "base notes"],
    "service": ["shipping", "delivery", "refund", "customer service", "order arrived", "tracking number"],
    "price": ["expensive", "cheap", "price", "cost", "discount", "sale", "promo"],
    "question": ["how to use", "how do i", "where can i", "what is", "which shade", "how much"],
    "praise": ["i love", "so good", "so beautiful", "amazing", "best product", "highly recommend"],
    "hair": ["hair care", "hair style", "hairstyle", "hair cut", "hair color", "dandruff"]
}

word_keywords = {
    "skincare": [
        "cream","moisturizer","skin","lotion","sunscreen","serum","toner","cleanser",
        "mask","acne","hydrating","oil","exfoliator","spf","retinol","moisturiser", "natural"
    ],
    "makeup": [
        "lipstick","foundation","eyeliner","mascara","blush","makeup", "make up", "concealer","primer",
        "powder","highlighter","brow","palette","bronzer","contour","lashes","lipgloss","eyelash","eyeshadow","shade"
    ],
    "fragrance": [
        "perfume","scent","fragrance","cologne","parfum","spray","aroma","notes","scented"
    ],
    "service": [
        "shipping","delivery","refund","return","order","tracking","customer","support","warehouse","arrived","late"
    ],
    "price": [
        "price","expensive","cheap","discount","sale","deal","cost","worth"
    ],
    "question": [
        "how","what","where","which","why","help","does","is","can"  
    ],
    "praise": [
        "love","amazing","beautiful","best","nice","wow","loveit","cute","perfect","recommend","thanks","thankyou", "gorgeous", "great", "cool", "pretty", "slay", "fabulous"
    ],
    "hair": [
        "hair care", "hair style", "hairstyle", "hair cut", "haircuts", "hair color", "dandruff", "hair", "scalp", "wig"
    ]
}

In [233]:
# Updated categorization function
def categorize_comment(text):
    text = str(text).lower()

    # 1. Check phrase-level keywords first
    for cat, phrases in phrase_keywords.items():
        for phrase in phrases:
            if phrase in text:
                return cat

    # 2. If no phrase found, check single-word keywords
    for cat, words in word_keywords.items():
        for word in words:
            if word in text.split():  # safer: match whole words
                return cat

    # 3. If nothing matches, label as "other"
    return "other"

In [234]:
# Apply categorization
data["category"] = data["cleaned_comment"].apply(categorize_comment)

In [235]:
# See distribution
print("\nCategory Distribution:")
print(data["category"].value_counts())


Category Distribution:
category
other        2069
question     1226
praise       1018
makeup        386
hair          238
skincare      231
price          40
service        21
fragrance      12
Name: count, dtype: int64


In [237]:
# Sample categorized comments
print("\nSample Categorized Comments:")
print(data[["cleaned_comment", "category"]].sample(10, random_state=17))


Sample Categorized Comments:
                                        cleaned_comment  category
9683                 you look amazing in all the styles    praise
9151                       can u you do curly next time  question
159   love your tutorial and how you apply makeup th...    makeup
98    truly a super lady how composed confident inte...  question
1743  i watched this too many times bcuz she is real...    praise
3267  you deserve it bestie btw what about the neck ...  question
2266  incredible ill have to try that \nshare more v...     other
3979                                           gorgeous    praise
7040  people out there acting like being fat is a cr...  question
7967  i literally laughed because it was you 😂  also...      hair


In [236]:
print(data[data["category"] == "other"][["cleaned_comment"]].sample(10, random_state=17))

                                        cleaned_comment
1724                                   bro 5 m on insta
6163               giving weird not a fashion statement
6279                       bro looks like carl weathers
4736           theyre not women you were born a man sir
253   of all the make up you have and then thats my ...
4775  she didnt do any thing wrong youre just being ...
5730            girl you are talking about boys not men
7037  she said it looked like she was punched by the...
5866                          346 😂 loved this reaction
4340               thats fed up i hope i never go blind


# Machine Learning Model (Comments Category)

In [238]:
# Prepare Features and Labels for ML
X = data["cleaned_comment"]
y = data["category"]  # <-- TARGET VARIABLE

In [239]:
# Split train-test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [240]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [241]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [247]:
# ==============================
# Machine Learning Models
# ==============================

# --- Naive Bayes ---
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train)
y_pred_nb = nb_clf.predict(X_test_tfidf)

print("=== Naive Bayes ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

=== Naive Bayes ===
Accuracy: 0.5919923736892279
              precision    recall  f1-score   support

   fragrance       0.00      0.00      0.00         2
        hair       0.00      0.00      0.00        48
      makeup       0.94      0.18      0.30        83
       other       0.60      0.93      0.73       420
      praise       0.70      0.77      0.73       202
       price       0.00      0.00      0.00         6
    question       0.38      0.25      0.30       232
     service       0.00      0.00      0.00         7
    skincare       1.00      0.06      0.12        49

    accuracy                           0.59      1049
   macro avg       0.40      0.24      0.24      1049
weighted avg       0.58      0.59      0.53      1049


Confusion Matrix:
 [[  0   0   0   2   0   0   0   0   0]
 [  0   0   0  13   6   0  29   0   0]
 [  0   0  15  34  24   0  10   0   0]
 [  0   0   0 389   6   0  25   0   0]
 [  0   0   1  23 156   0  22   0   0]
 [  0   0   0   5   0   0   1  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [248]:
# --- Logistic Regression ---
lr_clf = LogisticRegression(max_iter=1000, random_state=25)
lr_clf.fit(X_train_tfidf, y_train)
y_pred_lr = lr_clf.predict(X_test_tfidf)

print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

=== Logistic Regression ===
Accuracy: 0.6873212583412774
              precision    recall  f1-score   support

   fragrance       0.00      0.00      0.00         2
        hair       0.71      0.42      0.53        48
      makeup       0.88      0.73      0.80        83
       other       0.68      0.90      0.77       420
      praise       0.82      0.82      0.82       202
       price       0.00      0.00      0.00         6
    question       0.45      0.32      0.37       232
     service       0.00      0.00      0.00         7
    skincare       0.88      0.47      0.61        49

    accuracy                           0.69      1049
   macro avg       0.49      0.41      0.43      1049
weighted avg       0.67      0.69      0.67      1049


Confusion Matrix:
 [[  0   0   0   0   1   0   1   0   0]
 [  0  20   0   8   3   0  16   0   1]
 [  0   0  61  16   3   0   3   0   0]
 [  0   0   0 378   4   0  37   0   1]
 [  0   2   2   9 165   0  24   0   0]
 [  0   0   1   5   0  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [249]:
# --- Random Forest ---
rf_clf = RandomForestClassifier(n_estimators=200, random_state=25)
rf_clf.fit(X_train_tfidf, y_train)
y_pred_rf = rf_clf.predict(X_test_tfidf)

print("=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

=== Random Forest ===
Accuracy: 0.7435653002859867
              precision    recall  f1-score   support

   fragrance       0.00      0.00      0.00         2
        hair       0.67      0.77      0.72        48
      makeup       0.91      0.93      0.92        83
       other       0.72      0.92      0.81       420
      praise       0.80      0.90      0.85       202
       price       0.50      0.17      0.25         6
    question       0.56      0.25      0.35       232
     service       1.00      0.29      0.44         7
    skincare       0.93      0.76      0.83        49

    accuracy                           0.74      1049
   macro avg       0.68      0.55      0.57      1049
weighted avg       0.72      0.74      0.71      1049


Confusion Matrix:
 [[  0   0   0   0   1   0   1   0   0]
 [  0  37   0   3   3   0   4   0   1]
 [  0   1  77   1   4   0   0   0   0]
 [  0   0   2 386   3   1  28   0   0]
 [  0   3   2   4 182   0  10   0   1]
 [  0   0   1   4   0   1   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [250]:
# --- Decision Tree ---
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree classifier
dt_clf = DecisionTreeClassifier(random_state=25, max_depth=None, min_samples_split=2)
dt_clf.fit(X_train_tfidf, y_train)
y_pred_dt = dt_clf.predict(X_test_tfidf)

# Evaluation
print("=== Decision Tree ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

=== Decision Tree ===
Accuracy: 0.6949475691134414
              precision    recall  f1-score   support

   fragrance       0.00      0.00      0.00         2
        hair       0.70      0.73      0.71        48
      makeup       0.90      0.84      0.87        83
       other       0.74      0.81      0.77       420
      praise       0.73      0.86      0.79       202
       price       0.33      0.17      0.22         6
    question       0.40      0.31      0.35       232
     service       0.75      0.43      0.55         7
    skincare       0.88      0.76      0.81        49

    accuracy                           0.69      1049
   macro avg       0.60      0.54      0.56      1049
weighted avg       0.68      0.69      0.68      1049


Confusion Matrix:
 [[  0   0   0   0   1   0   1   0   0]
 [  0  35   0   2   2   0   8   0   1]
 [  0   1  70   2   7   1   2   0   0]
 [  1   0   3 339   4   1  71   1   0]
 [  0   2   3   3 173   0  20   0   1]
 [  0   0   1   2   0   1   1

In [251]:
# ==============================
# Predict new comments
# ==============================
def predict_category(new_comments):
    new_tfidf = vectorizer.transform(new_comments)
    # you can switch clf to nb_clf, lr_clf, or rf_clf
    return lr_clf.predict(new_tfidf)  # example using Logistic Regression

sample_comments = [
    "I love this moisturizer, it works wonders",
    "Your skin is literally glowing",
    "Your hair is so shiny and curly!"
]

print("Predicted Categories for Sample Comments:")
print(predict_category(sample_comments))

Predicted Categories for Sample Comments:
['praise' 'skincare' 'hair']


# Machine Learning Models (Comments Sentiment)

In [252]:
# ==============================
# Features and target
# ==============================
X = data["cleaned_comment"]
y = data["sentiment"]  # Target variable: positive, negative, neutral

In [253]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [254]:
# ==============================
# TF-IDF Vectorization
# ==============================
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [256]:
# ==============================
# Decision Tree Classifier
# ==============================
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train_tfidf, y_train)
y_pred_dt = dt_clf.predict(X_test_tfidf)

print("=== Decision Tree Sentiment Analysis ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

=== Decision Tree Sentiment Analysis ===
Accuracy: 0.784556720686368

Classification Report:
               precision    recall  f1-score   support

    negative       0.66      0.55      0.60       129
     neutral       0.71      0.84      0.77       335
    positive       0.87      0.80      0.83       585

    accuracy                           0.78      1049
   macro avg       0.75      0.73      0.73      1049
weighted avg       0.79      0.78      0.78      1049


Confusion Matrix:
 [[ 71  28  30]
 [  9 283  43]
 [ 27  89 469]]


In [257]:
# ==============================
# Random Forest Classifier
# ==============================
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train_tfidf, y_train)
y_pred_rf = rf_clf.predict(X_test_tfidf)

print("\n=== Random Forest Sentiment Analysis ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


=== Random Forest Sentiment Analysis ===
Accuracy: 0.7969494756911344

Classification Report:
               precision    recall  f1-score   support

    negative       0.79      0.38      0.51       129
     neutral       0.70      0.92      0.79       335
    positive       0.88      0.82      0.85       585

    accuracy                           0.80      1049
   macro avg       0.79      0.71      0.72      1049
weighted avg       0.81      0.80      0.79      1049


Confusion Matrix:
 [[ 49  39  41]
 [  3 307  25]
 [ 10  95 480]]


In [258]:
# ==============================
# Logistic Regression
# ==============================
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)

print("\n=== Logistic Regression Sentiment Analysis ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


=== Logistic Regression Sentiment Analysis ===
Accuracy: 0.784556720686368

Classification Report:
               precision    recall  f1-score   support

    negative       0.86      0.29      0.44       129
     neutral       0.71      0.86      0.77       335
    positive       0.83      0.85      0.84       585

    accuracy                           0.78      1049
   macro avg       0.80      0.67      0.69      1049
weighted avg       0.80      0.78      0.77      1049


Confusion Matrix:
 [[ 38  38  53]
 [  0 287  48]
 [  6  81 498]]


# KPI Summary

In [259]:
# ==============================
# KPI Summary Report
# ==============================

print("\n=== KPI Summary ===")

# 1. Spam vs Quality
spam_dist = data["is_spam"].value_counts(normalize=True) * 100
print("\n1. Spam vs Quality (%):")
print(spam_dist)

# 2. Distribution by Category
category_dist = data["category"].value_counts(normalize=True) * 100
print("\n2. Comment Distribution by Category (%):")
print(category_dist)

# 3. Sentiment Breakdown (overall)
sentiment_dist = data["sentiment"].value_counts(normalize=True) * 100
print("\n3. Overall Sentiment Distribution (%):")
print(sentiment_dist)

# 4. Sentiment within each Category
print("\n4. Sentiment Breakdown per Category (%):")
sentiment_per_cat = pd.crosstab(data["category"], data["sentiment"], normalize="index") * 100
print(sentiment_per_cat.round(2))


=== KPI Summary ===

1. Spam vs Quality (%):
is_spam
False    80.576226
True     19.423774
Name: proportion, dtype: float64

2. Comment Distribution by Category (%):
category
other        39.477199
question     23.392482
praise       19.423774
makeup        7.365007
hair          4.541118
skincare      4.407556
price         0.763213
service       0.400687
fragrance     0.228964
Name: proportion, dtype: float64

3. Overall Sentiment Distribution (%):
sentiment
positive    53.901927
neutral     34.249189
negative    11.848884
Name: proportion, dtype: float64

4. Sentiment Breakdown per Category (%):
sentiment  negative  neutral  positive
category                              
fragrance     33.33    16.67     50.00
hair          18.91    36.97     44.12
makeup        12.44    35.23     52.33
other         12.61    53.21     34.17
praise         1.87     0.98     97.15
price         22.50    30.00     47.50
question      15.91    32.54     51.55
service       14.29    33.33     52.38
ski