In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc
)



# 1) LOAD DATA
# Read the train / validation / test CSVs in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train_df, val_df, test_df = map(pd.read_csv, (
    '/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/train_dataset.csv',
    '/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/val_dataset.csv',
    '/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/test_dataset.csv',
))

# 2) ADVANCED PREPROCESSING (PORTER STEMMER + NEGATION)
# Make sure the NLTK data used below exists before it is first touched:
# stopwords.words() needs the 'stopwords' corpus and word_tokenize() needs the
# Punkt models ('punkt' on older NLTK releases, 'punkt_tab' on newer ones).
# Without this guard a fresh kernel fails here with LookupError.
for _resource, _package in (
    ("corpora/stopwords", "stopwords"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        # Best effort: with default arguments nltk.download() reports a
        # failure (e.g. no internet) by returning False instead of raising.
        nltk.download(_package, quiet=True)

stemmer = PorterStemmer()
# Negation cues carry sentiment, so they are taken out of the stop-word list
# and therefore survive the stop-word filter in advanced_preprocess().
negation_words = {"no", "not", "never", "none", "nobody", "nothing", "nowhere", "nor", "without"}
stop_words = set(stopwords.words('english')) - negation_words

def advanced_preprocess(text):
    """
    Clean one raw text into a space-separated string of stemmed tokens.

    Steps:
    1) Lowercases text
    2) Removes URLs/@mentions and strips '#' (the hashtag text is kept)
    3) Normalizes elongated words (3+ repeated characters -> 2)
    4) Handles negations: tokens between a negation word (or an "n't"
       clitic) and the next clause punctuation get a "_NEG" suffix
    5) Removes stopwords (except negations) and non-alphabetic tokens
    6) Applies PorterStemmer to the bare word, then appends the marker

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN for a missing cell) are
        treated as empty text.

    Returns
    -------
    str
        Processed tokens joined by single spaces ("" if nothing survives).
    """
    # A missing value reaches us as a float NaN / None; re.sub would raise.
    if not isinstance(text, str):
        return ""

    # Lowercase first so the URL patterns also catch "HTTP://..." / "WWW.".
    text = text.lower()
    # Remove URLs, @mentions. "www" must start a token and be followed by a
    # dot; a bare r"www\S+" also eats the tail of words such as "awwww".
    text = re.sub(r"http\S+|\bwww\.\S+|@\S+", " ", text)
    # Remove '#' but keep its text
    text = text.replace("#", " ")
    # Remove repeated chars
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    # Tokenize
    tokens = word_tokenize(text)

    # Any token made only of these characters closes the negation scope.
    # Checking character-wise also covers "..", which is what the
    # repeated-char normalization above turns "..." into.
    clause_punct = set(".,!?;:")

    kept_tokens = []
    negate = False
    for token in tokens:
        # word_tokenize splits contractions ("don't" -> "do", "n't"), so the
        # clitic has to open a negation scope too; it is emitted as "not".
        is_clitic = token == "n't"
        if is_clitic or token in negation_words:
            negate = True
            kept_tokens.append(stemmer.stem("not" if is_clitic else token))
            continue
        if all(ch in clause_punct for ch in token):
            negate = False
            continue
        # Filter on the bare token, *before* the marker is attached, so that
        # negated stop words and negated punctuation/numbers are dropped.
        if not token.isalpha() or token in stop_words:
            continue
        # Stem the bare word so "liked" and its negated form share the stem.
        stem = stemmer.stem(token)
        kept_tokens.append(stem + "_NEG" if negate else stem)

    return " ".join(kept_tokens)

# Apply preprocessing
# Every split gets a 'cleaned_text' column computed from its raw 'Text' column.
for split_df in (train_df, val_df, test_df):
    split_df['cleaned_text'] = split_df['Text'].apply(advanced_preprocess)

# 3) TF-IDF VECTORIZATION
tfidf_options = dict(
    ngram_range=(1, 2),  # unigrams + bigrams
    min_df=2,            # ignore terms seen in fewer than 2 documents
    sublinear_tf=True,   # log-scaling term frequency
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training split only;
# validation and test are projected onto that fitted vocabulary.
X_train = vectorizer.fit_transform(train_df['cleaned_text'])
X_val, X_test = (
    vectorizer.transform(frame['cleaned_text']) for frame in (val_df, test_df)
)

# The test split is only transformed here; no label column is read from it.
y_train, y_val = train_df['Label'], val_df['Label']

# 4) LOGISTIC REGRESSION WITH TUNED HYPERPARAMS
logreg_params = dict(
    penalty='l2',
    C=0.5,
    solver='liblinear',
    max_iter=200,
    random_state=42,
)
# fit() returns the estimator itself, so construction and training can chain.
model = LogisticRegression(**logreg_params).fit(X_train, y_train)

# 5) VALIDATION METRICS
# Hard class predictions on the held-out validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("\n=== Validation Performance ===")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Classification Report:\n", val_report)

# 5.1) Confusion Matrix
# Rows are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_val, y_val_pred)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix (Validation)")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# 5.2) ROC Curve
# ROC needs a continuous score: take the probability column at index 1.
# NOTE(review): this treats the second entry of model.classes_ as the
# positive class — confirm the labels are binary 0/1.
val_proba = model.predict_proba(X_val)
y_val_probs = val_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_val_probs)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})", lw=2)
# Diagonal reference line for a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", color="r", label="Random Guess")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Validation")
ax.legend(loc="lower right")
plt.show()
print(f"Validation AUC: {roc_auc:.4f}")

# 5.3) LEARNING CURVE
# Evaluate how model performance changes as we vary the training set size.
# learning_curve is already imported at the top of this cell, so it is not
# imported again here. It is called with the fitted `model` as the estimator
# template; the curves come from separate fits on each CV split.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,            # 5-fold CV
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 5),
    # random_state is only used when shuffle=True. Without shuffling, each
    # training subset is a prefix of the fold in file order, which is
    # unrepresentative if the CSV happens to be sorted.
    shuffle=True,
    random_state=42
)

# One row per training size, one column per CV fold: average over the folds.
train_means = np.mean(train_scores, axis=1)
val_means   = np.mean(val_scores, axis=1)

plt.figure()
plt.plot(train_sizes, train_means, 'o-', label='Training Accuracy')
plt.plot(train_sizes, val_means, 'o-', label='Validation Accuracy')
plt.title("Learning Curve (Logistic Regression)")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# 6) FINAL SUBMISSION
# Predict the test split and pair each prediction with its row ID.
test_preds = model.predict(X_test)
submission_df = test_df[['ID']].assign(Label=test_preds)

# index=False keeps the file to the two columns ID and Label.
submission_df.to_csv('submission.csv', index=False)
print("\n=> 'submission.csv' created for Kaggle.")


In [None]:
# NLTK data for this notebook. On a fresh kernel this cell has to run before
# the preprocessing code, which calls stopwords.words() and word_tokenize().
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize() (which one is needed depends
#     on the installed NLTK release)
#   - 'wordnet' / 'omw-1.4' are kept from the original list; nothing visible
#     in this notebook uses them
for nltk_package in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_package)

In [None]:
# Shell escape: long listing, with human-readable sizes, of /kaggle/working
# (presumably where 'submission.csv' was written — verify the kernel's cwd).
!ls -lh /kaggle/working
