In [9]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import sys

In [10]:
print("Downloading NLTK data...")
nltk.download('stopwords')
nltk.download('wordnet')
print("Downloads complete.")

Downloading NLTK data...
Downloads complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dviss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dviss\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import pandas as pd
import sys

print("--- Starting Dataset Creation (v3, with 7-Category Augmentation) ---")

# --- 1. Load Source Data ---
try:
    df_existing = pd.read_csv("comments.csv")
    df_toxic_source = pd.read_csv("train.csv")
    print("Successfully loaded 'comments.csv' and 'train.csv'.")
except FileNotFoundError:
    print("ERROR: Make sure 'comments.csv' and 'train.csv' are in the same folder.")

# --- 2. Map Your 0-6 Labels to the New Categories ---
def map_label_to_category(label):
    if label == 1:  # 1:joy
        return 'praise'
    if label == 2:  # 2:love
        return 'support'
    if label in [3, 6]:  # 3:anger, 6:toxic
        return 'hate_abuse'
    if label in [0, 4, 5]:  # 0:sadness, 4:fear, 5:surprise
        return 'emotional'
    return None

print("Remapping existing labels...")
df_remapped = pd.DataFrame()
df_remapped['text'] = df_existing['text']
df_remapped['category'] = df_existing['label'].apply(map_label_to_category)
df_remapped = df_remapped.dropna(subset=['category'])

# --- 3. Add Missing Categories & Augment ALL Categories ---

# Category 1: Threat (from train.csv)
df_threat = df_toxic_source[df_toxic_source['threat'] == 1].copy()
df_threat_to_add = pd.DataFrame({'text': df_threat['comment_text'], 'category': 'threat'})

# --- !! NEW !! Using your high-quality examples ---

# 1. Praise
praise_texts = [
    "This is brilliant! You explained a complex topic so simply.",
    "Flawless execution. That was incredibly well done.",
    "Best video I've seen this week. Subscribed!"
] * 20
df_praise_to_add = pd.DataFrame({'text': praise_texts, 'category': 'praise'})

# 2. Support
support_texts = [
    "Don't let the negative comments get to you. This is valuable content.",
    "I really appreciate the amount of work you must have put into this.",
    "So excited to see your next video!"
] * 20
df_support_to_add = pd.DataFrame({'text': support_texts, 'category': 'support'})

# 3. Constructive Criticism
crit_texts = [
    "Great video, but the audio quality was a bit echoey. A different mic might help.",
    "I liked the overall point, but the pacing felt a bit slow in the middle.",
    "The code is good, but you could make it more efficient by using a different data structure for that part."
] * 20
df_crit_to_add = pd.DataFrame({'text': crit_texts, 'category': 'constructive_criticism'})

# 4. Hate/Abuse
hate_texts = [
    "You have no idea what you're talking about.",
    "This is the stupidest thing I've ever read.",
    "Just delete your channel already."
] * 20
df_hate_to_add = pd.DataFrame({'text': hate_texts, 'category': 'hate_abuse'})

# 5. Threat (Augmenting the ones from train.csv)
threat_texts = [
    "If you don't take this down, I'm going to flag all your videos.",
    "I've screenshotted this and I'm sending it to your boss.",
    "You're going to regret posting this."
] * 20
df_threat_aug_to_add = pd.DataFrame({'text': threat_texts, 'category': 'threat'})

# 6. Emotional
emo_texts = [
    "This actually made me tear up a bit. So powerful.",
    "I was having a really bad day, and this made me laugh out loud. Thank you.",
    "Wow, this is deeply moving. It really makes you think."
] * 20
df_emo_to_add = pd.DataFrame({'text': emo_texts, 'category': 'emotional'})

# 7. Irrelevant/Spam
spam_texts = [
    "Check out my profile for amazing deals!",
    "First!",
    "www. buy-this-scam. com",
    "sub for sub? I subscribed to you."
] * 20
df_spam_to_add = pd.DataFrame({'text': spam_texts, 'category': 'spam_irrelevant'})

# (We'll also keep the question category from before)
question_texts = [
    "What software did you use to make this?",
    "Can you make a video on topic X next?",
] * 20
df_question_to_add = pd.DataFrame({'text': question_texts, 'category': 'question_suggestion'})


# --- 4. Combine Everything into One DataFrame ---
print("Combining all datasets...")
df = pd.concat([
    df_remapped,            # Your main 400k+ rows
    df_threat_to_add,       # Original threat data
    df_praise_to_add,       # Your new praise examples
    df_support_to_add,      # Your new support examples
    df_crit_to_add,         # Your new constructive criticism examples
    df_hate_to_add,         # Your new hate examples
    df_threat_aug_to_add,   # Your new threat examples
    df_emo_to_add,          # Your new emotional examples
    df_spam_to_add,         # Your new spam examples
    df_question_to_add      # The question examples
], ignore_index=True)

# --- 5. VERIFY: Check Your New Categories ---
print("\n--- SUCCESS! Your model is now training on your new examples ---")
print(df['category'].value_counts())

--- Starting Dataset Creation ---
Successfully loaded 'comments.csv' and 'train.csv'.
Remapping existing labels...
Adding 'threat' category...
Creating 'constructive_criticism' category...
Creating 'spam_irrelevant' category...
Combining all datasets...

--- SUCCESS! Your 7 Categories Are Ready ---
category
emotional                 183871
praise                    141067
hate_abuse                 57317
support                    34554
threat                       478
constructive_criticism        60
spam_irrelevant               40
Name: count, dtype: int64


In [12]:
# --- Define preprocessing components ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text) # Remove punctuation/numbers
    tokens = text.split()               # Use simple .split()
    
    # Lemmatize and remove stopwords
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    return " ".join(cleaned_tokens)

# --- Apply the preprocessing ---
print("Starting text preprocessing...")
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("Preprocessing complete. 'cleaned_text' column is now in the DataFrame.")

Starting text preprocessing...
Preprocessing complete. 'cleaned_text' column is now in the DataFrame.


In [13]:
print("Defining X and y...")
# This will now work, because 'df' has the correct columns
X = df['cleaned_text']
y = df['category']

# Split the data
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize
print("Vectorizing text...")
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train
print("Training the model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

print("Model training complete.")

Defining X and y...
Splitting data...
Vectorizing text...
Training the model...
Model training complete.


In [14]:
print("Evaluating the model...")
y_pred = model.predict(X_test_tfidf)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

Evaluating the model...

--- Classification Report ---
                        precision    recall  f1-score   support

constructive_criticism       1.00      1.00      1.00        12
             emotional       0.95      0.96      0.95     36774
            hate_abuse       0.91      0.90      0.90     11463
                praise       0.92      0.93      0.93     28214
       spam_irrelevant       1.00      1.00      1.00         8
               support       0.81      0.77      0.79      6911
                threat       1.00      0.46      0.63        96

              accuracy                           0.92     83478
             macro avg       0.94      0.86      0.89     83478
          weighted avg       0.92      0.92      0.92     83478



In [15]:
# These functions use the 'model' and 'vectorizer' from Cell 5
def classify_comment(comment):
    cleaned_comment = preprocess_text(comment)
    comment_tfidf = vectorizer.transform([cleaned_comment])
    prediction = model.predict(comment_tfidf)
    return prediction[0]

def get_reply_suggestion(category):
    templates = {
        'praise_support': "Thank you so much for the kind words!",
        'hate_abuse': "[Action: Monitor user or escalate to moderation.]",
        'emotional': "Thank you for sharing that with us. It means a lot.",
        'threat': "[Action: Escalate to security/legal team immediately.]",
        'constructive_criticism': "That's valuable feedback. We'll pass it to the team.",
        'spam_irrelevant': "[Action: Remove comment and monitor user.]",
        'question_suggestion': "That's a great question! We'll look into it."
    }
    return templates.get(category, "No suggestion available.")

print("Tool functions 'classify_comment' and 'get_reply_suggestion' are ready.")

Tool functions 'classify_comment' and 'get_reply_suggestion' are ready.


In [16]:
# Test 1: Constructive
test_comment_1 = "I loved the video, but the audio was really hard to hear."
category_1 = classify_comment(test_comment_1)
print(f"Comment: '{test_comment_1}'")
print(f"Category: {category_1}")
print(f"Suggested Reply: {get_reply_suggestion(category_1)}\n")

# Test 2: Praise
test_comment_2 = "This is amazing! You guys are the best!"
category_2 = classify_comment(test_comment_2)
print(f"Comment: '{test_comment_2}'")
print(f"Category: {category_2}")
print(f"Suggested Reply: {get_reply_suggestion(category_2)}\n")

# Test 3: Threat
test_comment_3 = "I'll report you if this continues."
category_3 = classify_comment(test_comment_3)
print(f"Comment: '{test_comment_3}'")
print(f"Category: {category_3}")
print(f"Suggested Reply: {get_reply_suggestion(category_3)}\n")

# Test 4: Spam
test_comment_4 = "check out my website www.buy-stuff.com"
category_4 = classify_comment(test_comment_4)
print(f"Comment: '{test_comment_4}'")
print(f"Category: {category_4}")
print(f"Suggested Reply: {get_reply_suggestion(category_4)}\n")

Comment: 'I loved the video, but the audio was really hard to hear.'
Category: support
Suggested Reply: No suggestion available.

Comment: 'This is amazing! You guys are the best!'
Category: emotional
Suggested Reply: Thank you for sharing that with us. It means a lot.

Comment: 'I'll report you if this continues.'
Category: emotional
Suggested Reply: Thank you for sharing that with us. It means a lot.

Comment: 'check out my website www.buy-stuff.com'
Category: emotional
Suggested Reply: Thank you for sharing that with us. It means a lot.

