In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# Load data
train = pd.read_csv("P1_train.csv")
test = pd.read_csv("P1_test.csv")

# Combine train and test for consistent preprocessing
train["is_train"] = 1
test["is_train"] = 0
test["Consumer disputed?"] = np.nan
df = pd.concat([train, test], axis=0)

# Convert date columns
df["Date received"] = pd.to_datetime(df["Date received"])
df["Date sent to company"] = pd.to_datetime(df["Date sent to company"])
df["received_month"] = df["Date received"].dt.month
df["received_day"] = df["Date received"].dt.day
df["received_dow"] = df["Date received"].dt.dayofweek
df["days_to_send"] = (df["Date sent to company"] - df["Date received"]).dt.days

# Add missing value indicators
for col in ['Consumer complaint narrative', 'Company public response', 'Tags', 'Consumer consent provided?']:
    df[f"{col}_missing"] = df[col].isna().astype(int)

# Fill missing values
df.fillna("missing", inplace=True)

# Encode categorical columns
categorical_cols = [
    "Product", "Sub-product", "Sub-issue", "Company", "State",
    "Tags", "Consumer consent provided?", "Submitted via",
    "Company response to consumer", "Timely response?"
]
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# TF-IDF on narrative and issue
tfidf_narr = TfidfVectorizer(max_features=500, stop_words="english")
tfidf_issue = TfidfVectorizer(max_features=100, stop_words="english")
narr_tfidf = tfidf_narr.fit_transform(df["Consumer complaint narrative"])
issue_tfidf = tfidf_issue.fit_transform(df["Issue"])

# Final feature matrix
drop_cols = [
    "Date received", "Date sent to company", "Consumer complaint narrative", "Issue",
    "Complaint ID", "Consumer disputed?", "is_train"
]
X_structured = df.drop(columns=drop_cols).copy()

# Encode any remaining object columns
for col in X_structured.select_dtypes(include="object").columns:
    X_structured[col] = LabelEncoder().fit_transform(X_structured[col])

X_structured_sparse = csr_matrix(X_structured.astype(float))
X_final = hstack([X_structured_sparse, narr_tfidf, issue_tfidf])

# Split back
X_train = X_final[df["is_train"] == 1]
X_test = X_final[df["is_train"] == 0]
y_train = (train["Consumer disputed?"] == "Yes").astype(int)

In [2]:
# Stratified K-Fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds_accum = np.zeros(X_test.shape[0])
val_auc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = LogisticRegression(
        solver="liblinear",
        max_iter=1000,
        class_weight="balanced",
        random_state=42
    )
    model.fit(X_tr, y_tr)
    val_probs = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_probs)
    val_auc_scores.append(auc)
    print(f"Fold {fold+1} AUC: {auc:.4f}")
    test_preds_accum += model.predict_proba(X_test)[:, 1]

# Average test predictions
test_preds_avg = test_preds_accum / kf.n_splits
y_test_pred = np.where(test_preds_avg >= 0.5, "Yes", "No")

print(f"\nMean Validation AUC: {np.mean(val_auc_scores):.4f}")

Fold 1 AUC: 0.6013
Fold 2 AUC: 0.5969
Fold 3 AUC: 0.5977
Fold 4 AUC: 0.6003
Fold 5 AUC: 0.5954

Mean Validation AUC: 0.5983


In [16]:
#Create submission
submission = pd.DataFrame({
    "Complaint ID": test["Complaint ID"],
    "Consumer disputed": y_test_pred
})
submission.to_csv("submission1.csv", index=False)