In [3]:
# Step 1 : Import Libraries 
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
# Download NLTK data
nltk.download("stopwords")
nltk.download("wordnet")

# Step 2 : Load Data
train = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/train.csv")
test = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/test.csv")
sample_sub = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv")
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Sample submission shape:", sample_sub.shape)

# Step 3 : Define Features & Labels 
text_col = "QuestionText"

# Clean text
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

train[text_col] = train[text_col].fillna("").apply(clean_text)
test[text_col] = test[text_col].fillna("").apply(clean_text)

# Split multi-labels
train['Misconception'] = train['Misconception'].fillna("").apply(lambda x: x.split(','))

# Encode labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train['Misconception'])

# Features
X = train[text_col]
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Labels:", mlb.classes_)

# Step 4 : Train-Validation-Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape[0], "Validation size:", X_val.shape[0])
print("y_train shape:", y_train.shape, "y_val shape:", y_val.shape)

# Step 5 : Text Vectorization
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,3))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(test[text_col].fillna(""))
print("TF-IDF shapes:", X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape)

# Step 6 : Define & Train Model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000,solver="saga"))
model.fit(X_train_tfidf, y_train)

# Step 7 : Validate Model
y_val_pred = model.predict(X_val_tfidf)
f1 = f1_score(y_val, y_val_pred, average='micro')
print("Validation F1 score:", f1)

# Step 8 : Predict test Data
y_test_pred = model.predict(X_test_tfidf)

# Step 9 : Submission
submission = pd.DataFrame(y_test_pred, columns=mlb.classes_)
if 'id' in test.columns:
    submission.insert(0, 'id', test['id'])
else:
    submission.insert(0, 'id', range(len(test)))

submission.to_csv("submission.csv", index=False)
print("✅ Submission file created successfully!")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Train shape: (36696, 7)
Test shape: (3, 5)
Sample submission shape: (3, 2)
X shape: (36696,)
y shape: (36696, 36)
Labels: ['' 'Adding_across' 'Adding_terms' 'Additive' 'Base_rate' 'Certainty'
 'Definition' 'Denominator-only_change' 'Division' 'Duplication'
 'Firstterm' 'FlipChange' 'Ignores_zeroes' 'Incomplete'
 'Incorrect_equivalent_fraction_addition' 'Interior' 'Inverse_operation'
 'Inversion' 'Irrelevant' 'Longer_is_bigger' 'Mult' 'Multiplying_by_4'
 'Not_variable' 'Positive' 'Scale' 'Shorter_is_bigger' 'Subtraction'
 'SwapDividend' 'Tacking' 'Unknowable' 'WNB' 'Whole_numbers_larger'
 'Wrong_Fraction' 'Wrong_Operation' 'Wrong_fraction' 'Wrong_term']
Train size: 29356 Validation size: 7340
y_train shape: (29356, 36) y_val shape: (7340, 36)
TF-IDF shapes: (29356, 267) (7340, 267) (3, 267)
Validation F1 score: 0.7189373297002724
✅ Submission file created successfully!
