In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, hstack


# Load data
train = pd.read_csv("P1_train.csv")
test = pd.read_csv("P1_test.csv")

# Combine for consistent processing
train['is_train'] = 1
test['is_train'] = 0
test['Consumer disputed?'] = np.nan
df = pd.concat([train, test], axis=0)

# Dates
df['Date received'] = pd.to_datetime(df['Date received'])
df['Date sent to company'] = pd.to_datetime(df['Date sent to company'])
df['received_month'] = df['Date received'].dt.month
df['received_day'] = df['Date received'].dt.day
df['received_dow'] = df['Date received'].dt.dayofweek
df['days_to_send'] = (df['Date sent to company'] - df['Date received']).dt.days

# Missing indicators
for col in ['Consumer complaint narrative', 'Company public response', 'Tags', 'Consumer consent provided?']:
    df[f'{col}_missing'] = df[col].isna().astype(int)

df.fillna('missing', inplace=True)

In [2]:
# Label encode
categorical_cols = [
    'Product', 'Sub-product', 'Sub-issue', 'Company',
    'State', 'Tags', 'Consumer consent provided?', 'Submitted via',
    'Company response to consumer', 'Timely response?'
]
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# TF-IDF
tfidf_narr = TfidfVectorizer(max_features=2000, stop_words='english')
tfidf_issue = TfidfVectorizer(max_features=100, stop_words='english')
narr_tfidf = tfidf_narr.fit_transform(df['Consumer complaint narrative'])
issue_tfidf = tfidf_issue.fit_transform(df['Issue'])

# Final features
drop_cols = ['Date received', 'Date sent to company', 'Consumer complaint narrative',
             'Issue', 'Complaint ID', 'Consumer disputed?', 'is_train']
X_structured = df.drop(columns=drop_cols).copy()

for col in X_structured.select_dtypes(include='object').columns:
    X_structured[col] = LabelEncoder().fit_transform(X_structured[col])

X_sparse = csr_matrix(X_structured.astype(float))
X_final = hstack([X_sparse, narr_tfidf, issue_tfidf])

# Split back
X_train = X_final[df['is_train'] == 1]
X_test = X_final[df['is_train'] == 0]
y_train = (train['Consumer disputed?'] == 'Yes').astype(int)

# Validation split
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [3]:
# Model

model = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)
model.fit(X_dev, y_dev)

# AUC
val_preds = model.predict_proba(X_val)[:, 1]
print("Validation AUC:", roc_auc_score(y_val, val_preds))

Validation AUC: 0.5995961954321247


In [4]:
# Predict + Submit
#y_test_probs = model.predict_proba(X_test)[:, 1]
#y_test_pred = np.where(y_test_probs >= 0.5, "Yes", "No")

#submission = pd.DataFrame({
#    "Complaint ID": test["Complaint ID"],
#    "Consumer disputed": y_test_pred
#})
#submission.to_csv("submission.csv", index=False)