In [1]:
pip install xgboost scikit-learn pandas


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Read the raw records from the CSV file.
print("Loading dataset...")
df = pd.read_csv("PIO_Final_Synthetic_60000_fresh.csv")

# Build one text field per row: description and keywords joined by a single
# space (missing values become empty strings), then passed through
# preprocess_text, which is defined outside this section.
description_text = df["Problem_Description"].fillna("")
keyword_text = df["Keywords"].fillna("")
df["Combined_Text"] = (description_text + " " + keyword_text).apply(preprocess_text)

# One shared feature column, two separate classification targets.
X = df["Combined_Text"]
y_dept = df["Department"]
y_auth = df["Authority_Name"]

print("Splitting...")

# Split the features and both targets in a single call so the rows stay
# aligned across all three; 20% is held out for testing, seeded for repeatability.
(
    X_train, X_test,
    y_train_dept, y_test_dept,
    y_train_auth, y_test_auth,
) = train_test_split(X, y_dept, y_auth, test_size=0.2, random_state=42)

# -----------------------------
# TF-IDF must come AFTER splitting
# (the vocabulary and IDF weights are learned from the training text only)
# -----------------------------
print("TF-IDF...")

# Vectorizer settings: unigrams and bigrams, vocabulary capped at 12000
# features, with min_df / max_df bounding how rare or common a term may be.
tfidf_settings = {
    "max_features": 12000,
    "ngram_range": (1, 2),
    "min_df": 2,
    "max_df": 0.90,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training split, then reuse the fitted vectorizer on the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -----------------------------
# FIX: Convert labels to integers
# -----------------------------
# Each distinct label gets the integer position it holds in sorted order.
# The maps are built from the full (pre-split) target columns, so every label
# found in either split has a code.
dept_classes = sorted(y_dept.unique())
auth_classes = sorted(y_auth.unique())

dept_label_map = dict(zip(dept_classes, range(len(dept_classes))))
auth_label_map = dict(zip(auth_classes, range(len(auth_classes))))

# Replace the original labels in each split with their integer codes.
y_train_dept = y_train_dept.map(dept_label_map)
y_test_dept = y_test_dept.map(dept_label_map)

y_train_auth = y_train_auth.map(auth_label_map)
y_test_auth = y_test_auth.map(auth_label_map)

# -----------------------------
# XGBOOST MODELS
# -----------------------------
# Both classifiers use exactly the same hyperparameters; only the number of
# target classes differs. Keeping the shared settings in one place means a
# tuning change cannot be applied to one model and forgotten on the other.
XGB_SHARED_PARAMS = dict(
    objective="multi:softmax",
    eval_metric="mlogloss",
    n_estimators=300,
    max_depth=10,
    learning_rate=0.15,
    subsample=0.9,
    colsample_bytree=0.9,
    tree_method="hist",
)

# num_class is taken from the label maps, i.e. the number of distinct labels
# in each target column.
dept_model = XGBClassifier(num_class=len(dept_label_map), **XGB_SHARED_PARAMS)
auth_model = XGBClassifier(num_class=len(auth_label_map), **XGB_SHARED_PARAMS)

# Fit one classifier per target on the same TF-IDF training matrix.
print("Training Department model...")
dept_model.fit(X_train_tfidf, y_train_dept)

print("Training Authority model...")
auth_model.fit(X_train_tfidf, y_train_auth)

print("Evaluating...")

# Predictions are integer class codes, matching the encoded test labels.
pred_dept = dept_model.predict(X_test_tfidf)
pred_auth = auth_model.predict(X_test_tfidf)

print("\nAccuracy:")
print("Department:", accuracy_score(y_test_dept, pred_dept))
print("Authority:", accuracy_score(y_test_auth, pred_auth))

# The targets were mapped to integers, so without target_names the reports
# would list opaque codes. The label maps give each code its original name;
# passing labels explicitly keeps the rows aligned with target_names even if
# a class is missing from the test split. zero_division=0 reports 0 for such
# classes (the same value as the default) without emitting warnings.
print("\nDepartment Report:")
print(classification_report(
    y_test_dept,
    pred_dept,
    labels=list(dept_label_map.values()),
    target_names=[str(name) for name in dept_label_map],
    zero_division=0,
))

print("\nAuthority Report:")
print(classification_report(
    y_test_auth,
    pred_auth,
    labels=list(auth_label_map.values()),
    target_names=[str(name) for name in auth_label_map],
    zero_division=0,
))


Loading dataset...
Splitting...
TF-IDF...
Training Department model...
