In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, issparse
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [26]:
def prepare_imbalanced_data(df, target_col="hasrej103_x"):
    """
    1. Creates a binary rejection column ('reject_hasrej103').
    2. Splits data into 70% Train, 15% Validation, 15% Test (stratified).
    3. Downsamples the majority class in the training set to achieve a 50/50 balance.
    4. Ensures the validation and test sets mimic the original class distribution.

    :param df: The input DataFrame containing the target column.
    :param target_col: The column to use for the target label (e.g., "hasrej103").
    :return: Indices and labels for the balanced training set, and the real-world val/test sets.
    """

    # 1. Select relevant column and create the binary target
    #df_claims = df[[target_col]].copy()

    # Create the new binary column: reject_hasrej103 (1 if >= 1, else 0)
    df_claims["reject_hasrej103"] = (df_claims[target_col].fillna(0) >= 1).astype(int)

    # Separate index (X) and the new target label (y)
    X = df_claims.index.to_series() # Use index as 'X' for splitting
    y = df_claims["reject_hasrej103"]

    print("--- Initial Data Status ---")
    print(f"Total Samples: {len(df_claims)}")
    print(f"Class Distribution:")

    # 2. Split into Train (70%) and Temp (30%), stratified
    X_train_idx, X_temp_idx, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Split Temp (30%) into Validation (15% of total) and Test (15% of total), stratified
    X_val_idx, X_test_idx, y_val, y_test = train_test_split(
        X_temp_idx, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )

    # Function to print information
    def print_split_info(name, y_data):
        count = len(y_data)
        # Recalculate distribution directly from the balanced/split labels
        dist = y_data.value_counts(normalize=True).mul(100).round(2)
        # Use .get() for safety
        class_1_perc = dist.get(1, 0.0)
        class_0_perc = dist.get(0, 0.0)
        print(f"**{name}** (N={count}): {class_1_perc}% Class 1 | {class_0_perc}% Class 0")

    #print_split_info("Train (Balanced)", y_train_balanced, train_indices_balanced)
    # The val/test sets will still show the real-world distribution
    print_split_info("Train", y_train)
    print_split_info("Validation (Real-World)", y_val)
    print_split_info("Test (Real-World)", y_test)

   return X_train_idx, X_val_idx.index, X_test_idx.index, y_train, y_val, y_test

In [23]:
# --- 1. Define File Paths ---
# **UPDATE THESE PATHS TO YOUR ACTUAL FILE LOCATIONS**
from google.colab import drive
drive.mount('/content/drive')
LABEL_CSV = "all_features_df.csv"
TARGET_COLUMN = "hasrej103_x"
FEATURES_JOBLIB = "/content/drive/MyDrive/MSAI 339 - Data Science/tfidf_features.joblib"
SAVE_MODEL_PATH = "xgboost_103_model.joblib"

# --- 2. Load Data ---
print("--- Loading Raw Data for Splitting ---")
"""try:
  # Load the raw data to get the index and target column
  raw_df = pd.read_csv(LABEL_CSV, usecols=[TARGET_COLUMN])
except Exception as e:
  print(f"Error loading raw data from {LABEL_CSV}: {e}")
  # --- Fallback to Simulated Data (For testing if files aren't available) ---
  print("Using a simulated DataFrame for demonstration...")
  N_total = 1000
  raw_df = pd.DataFrame({
      TARGET_COLUMN: ([0] * 800) + ([1] * 200) # 80/20 split
  }).sample(frac=1, random_state=42).reset_index(drop=True)
  raw_df.index.name = 'original_index'
"""
#df = pd.read_csv('/content/drive/MyDrive/MSAI 339 - Data Science/pg_claims_tokenized.csv')
df = pd.read_csv('/content/drive/MyDrive/MSAI 339 - Data Science/combined_df.csv')
df = df.reset_index(drop=True)
df_claims = df[["claim_text", "hasrej101", "hasrej102", "hasrej103", "hasrej112"]]
# Perform the custom data split and downsampling
train_indices_b, val_indices, test_indices, y_train_b, y_val, y_test = prepare_imbalanced_data(
  None,
  target_col=TARGET_COLUMN
)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--- Loading Raw Data for Splitting ---


KeyboardInterrupt: 

In [27]:
TARGET_COLUMN = "hasrej103_x"
df_claims = df[["claim_text", "hasrej101_x", "hasrej102_x", "hasrej103_x", "hasrej112_x"]]
# Perform the custom data split and downsampling
print(df_claims.head(20))
train_indices_b, val_indices, test_indices, y_train_b, y_val, y_test = prepare_imbalanced_data(
  None,
  target_col=TARGET_COLUMN
)

                                           claim_text  hasrej101_x  \
0   ['nonamplifi', 'detect', 'polynucleotid', 'pro...          1.0   
1   ['compound', 'follow', 'formula', 'pharmaceut'...          0.0   
2   ['method', 'target', 'genom', 'modif', 'within...          0.0   
3   ['mammalian', 'cell', 'line', 'compris', 'firs...          0.0   
4   ['method', 'determin', 'amount', 'andor', 'con...          0.0   
5   ['isol', 'antibodi', 'antigen', 'bind', 'porti...          0.0   
6   ['insulin', 'receptor', 'aptam', 'compris', 'n...          0.0   
7   ['method', 'oper', 'electron', 'regul', 'retur...          0.0   
8   ['vaccin', 'composit', 'remov', 'boar', 'taint...          0.0   
9   ['cell', 'lack', 'function', 'express', 'mhc',...          0.0   
10  ['compstatin', 'analogu', 'repres', 'formula',...          NaN   
11  ['apparatu', 'compris', 'transceiv', 'commun',...          0.0   
12  ['heavi', 'chain', 'variabl', 'region', 'antib...          0.0   
13  ['imag', 'form',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_claims["reject_hasrej103"] = (df_claims[target_col].fillna(0) >= 1).astype(int)


--- Downsampling Training Set ---
Majority Class Label: 1 (Downsampled)
Minority Class Label: 0 (Retained)
Downsampled Count (50%): 876259
------------------------------
--- Final Set Sizes & Distributions ---
**Validation (Real-World)** (N=586488): 67.98% Class 1 | 32.02% Class 0
**Test (Real-World)** (N=586489): 67.98% Class 1 | 32.02% Class 0


In [6]:
df.columns

Index(['application_number', 'entity_size_MICRO', 'entity_size_SMALL',
       'entity_size_UNDISCOUNTED', 'aia_first_to_file_False',
       'aia_first_to_file_True', 'has_rej101', 'has_rej102', 'has_rej103',
       'has_rej112', 'final_outcome_x', 'Rejected', 'pgpub_id_x',
       'patent_id_x', 'claim_text', 'hasrej101_x', 'hasrej102_x',
       'hasrej103_x', 'hasrej112_x', 'submissionDate_x',
       'groupartunitnumber_x', 'Unnamed: 0', 'application_invention_type',
       'examiner_full_name', 'examiner_art_unit', 'uspc_class',
       'uspc_subclass', 'confirm_number', 'atty_docket_number',
       'appl_status_desc', 'appl_status_date', 'file_location',
       'earliest_pgpub_number', 'earliest_pgpub_date', 'patent_number',
       'patent_issue_date', 'invention_title', 'small_entity_indicator',
       'aia_first_to_file', 'applicant_organization', 'geographical_region',
       'pgpub_id_y', 'application_id', 'patent_id_y',
       'patentApplicationNumber', 'hasrej101_y', 'hasrej102_

In [28]:
# Load the sparse features
print("\n--- Loading Sparse TF-IDF Features ---")
try:
    X_all = joblib.load(FEATURES_JOBLIB)
except Exception as e:
    print(f"Error loading features from {FEATURES_JOBLIB}: {e}")
    # Exit if features can't be loaded, as the classifier requires them
    exit()

# Ensure X is a sparse matrix
if not issparse(X_all):
    X_all = csr_matrix(X_all)
    print("Features converted to sparse matrix.")

print(f"Sparse matrix shape loaded: {X_all.shape}")

# Map the indices to the sparse feature matrix
X_train = X_all[train_indices_b, :]
X_test = X_all[test_indices, :]


--- Loading Sparse TF-IDF Features ---
Sparse matrix shape loaded: (3909923, 637906)


In [16]:
print(X_train.shape)
print(X_test.shape)

(1752518, 637906)
(586489, 637906)


In [19]:
# Load the sparse features
print("\n--- Loading Sparse TF-IDF Features ---")
try:
    X_all = joblib.load(FEATURES_JOBLIB)
except Exception as e:
    print(f"Error loading features from {FEATURES_JOBLIB}: {e}")
    # Exit if features can't be loaded, as the classifier requires them
    exit()

# Ensure X is a sparse matrix
if not issparse(X_all):
    X_all = csr_matrix(X_all)
    print("Features converted to sparse matrix.")

print(f"Sparse matrix shape loaded: {X_all.shape}")

# Map the indices to the sparse feature matrix
X_train = X_all[train_indices_b, :]
X_test = X_all[test_indices, :]
X_val = X_all[val_indices, :]

# --- 3. Train XGBoost Model ---
print("\n--- Training XGBoost Classifier ---")

# Calculate the scale_pos_weight for handling imbalanced classes
num_pos = (y_train_b == 1).sum()
num_neg = (y_train_b == 0).sum()
# Since we explicitly balanced the training set to 50/50, scale_pos_weight will be 1.0,
# but we calculate it here for robustness if the split ratio changes.
scale_pos_weight_value = num_neg / num_pos
print(f"Positive class weight (scale_pos_weight) set to: {scale_pos_weight_value:.2f}")




--- Loading Sparse TF-IDF Features ---
Sparse matrix shape loaded: (3909923, 637906)

--- Training XGBoost Classifier ---
Positive class weight (scale_pos_weight) set to: 1.00


In [30]:
# **XGBoost Classifier Definition**
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
    #scale_pos_weight=scale_pos_weight_value
)

# Train the model using the balanced feature set
model.fit(X_train, y_train_b)

# --- 4. Save Model ---
joblib.dump(model, SAVE_MODEL_PATH)
print(f"\nModel saved to: {SAVE_MODEL_PATH}")

# --- 5. Evaluate Model ---
print("\n--- Evaluating XGBoost Model on Test Set (Real-World Distribution) ---")
predictions = model.predict(X_test)

# Print the full classification report
print(classification_report(y_test, predictions))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model saved to: xgboost_103_model.joblib

--- Evaluating XGBoost Model on Test Set (Real-World Distribution) ---
              precision    recall  f1-score   support

           0       0.63      0.15      0.24    187770
           1       0.71      0.96      0.81    398719

    accuracy                           0.70    586489
   macro avg       0.67      0.55      0.53    586489
weighted avg       0.68      0.70      0.63    586489



In [32]:
predictions_val = model.predict(X_val)
print(classification_report(y_val, predictions_val))
print('---------')
predictions_train = model.predict(X_train)
print(classification_report(y_train_b, predictions_train))

              precision    recall  f1-score   support

           0       0.63      0.15      0.24    187769
           1       0.71      0.96      0.81    398719

    accuracy                           0.70    586488
   macro avg       0.67      0.56      0.53    586488
weighted avg       0.68      0.70      0.63    586488

---------
              precision    recall  f1-score   support

           0       0.66      0.16      0.25    876259
           1       0.71      0.96      0.82   1860687

    accuracy                           0.70   2736946
   macro avg       0.68      0.56      0.53   2736946
weighted avg       0.69      0.70      0.64   2736946



In [29]:

  # --- Optional: Naive Bayes implementation ---
  # To run Naive Bayes, uncomment the imports at the top and the code below.
  # print("\n--- Optional: Multinomial Naive Bayes Classifier ---")
  nb_model = MultinomialNB()
  nb_model.fit(X_train, y_train_b) # Train on the same balanced data
  nb_predictions = nb_model.predict(X_test)
  print(classification_report(y_test, nb_predictions))
  print('------------')
  nb_val_predictions = nb_model.predict(X_val)
  print(classification_report(y_val, nb_val_predictions))

              precision    recall  f1-score   support

           0       0.60      0.08      0.15    187770
           1       0.69      0.97      0.81    398719

    accuracy                           0.69    586489
   macro avg       0.65      0.53      0.48    586489
weighted avg       0.66      0.69      0.60    586489

------------
              precision    recall  f1-score   support

           0       0.60      0.08      0.15    187769
           1       0.69      0.97      0.81    398719

    accuracy                           0.69    586488
   macro avg       0.65      0.53      0.48    586488
weighted avg       0.66      0.69      0.60    586488

