In [5]:
# Import libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [7]:
# Start timing
start_time = time.time()

In [8]:

# Load datasets
train_data = pd.read_csv("C:/Users/shahp/OneDrive/Desktop/train.csv")
test_data = pd.read_csv("C:/Users/shahp/OneDrive/Desktop/test.csv")

print(f"Train shape: {train_data.shape}, Test shape: {test_data.shape}")

Train shape: (260753, 299), Test shape: (173836, 298)


In [9]:
# Prepare features and target
X = train_data.drop(columns=['QuoteConversion_Flag'])
y = train_data['QuoteConversion_Flag'].fillna(train_data['QuoteConversion_Flag'].mode()[0])

print("Target distribution:\n", y.value_counts())

Target distribution:
 QuoteConversion_Flag
0    211859
1     48894
Name: count, dtype: int64


In [10]:
# Train/Validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify common columns
common_cols = X_train.columns.intersection(test_data.columns)

# Separate numeric and non-numeric columns
numeric_cols = X_train[common_cols].select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = X_train[common_cols].select_dtypes(exclude=['int64', 'float64']).columns

print(f"Numeric columns: {len(numeric_cols)} | Non-numeric columns: {len(non_numeric_cols)}")

Numeric columns: 270 | Non-numeric columns: 28


In [11]:
# Impute missing values
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute numeric
X_train[numeric_cols] = num_imputer.fit_transform(X_train[numeric_cols])
X_valid[numeric_cols] = num_imputer.transform(X_valid[numeric_cols])
test_data[numeric_cols] = num_imputer.transform(test_data[numeric_cols])

# Impute non-numeric
X_train[non_numeric_cols] = cat_imputer.fit_transform(X_train[non_numeric_cols])
X_valid[non_numeric_cols] = cat_imputer.transform(X_valid[non_numeric_cols])
test_data[non_numeric_cols] = cat_imputer.transform(test_data[non_numeric_cols])

In [12]:
# Convert potential date columns to features
for col in non_numeric_cols:
    try:
        X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
        X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
        test_data[col] = pd.to_datetime(test_data[col], errors='coerce')

        for df in [X_train, X_valid, test_data]:
            df[f"{col}_year"] = df[col].dt.year
            df[f"{col}_month"] = df[col].dt.month
            df[f"{col}_day"] = df[col].dt.day
            df.drop(columns=col, inplace=True)

        print(f"Processed datetime column: {col}")

    except Exception:
        continue

  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Original_Quote_Date


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field6


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field10


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field12


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: CoverageField8


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: CoverageField9


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: SalesField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField16


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField17


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField18


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField19


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField3


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField4


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField5


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField14


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField28


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField30


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField31


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField32


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField33


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField34


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField36


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField37


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField38


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: GeographicField63


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: GeographicField64


In [13]:
# Drop columns with all NaN values
def drop_all_nan(df_list):
    nan_cols = df_list[0].columns[df_list[0].isnull().all()]
    print(f"Dropping all-NaN columns: {list(nan_cols)}")
    for df in df_list:
        df.drop(columns=nan_cols, inplace=True)

drop_all_nan([X_train, X_valid, test_data])

Dropping all-NaN columns: ['Field6_year', 'Field6_month', 'Field6_day', 'Field10_year', 'Field10_month', 'Field10_day', 'Field12_year', 'Field12_month', 'Field12_day', 'CoverageField8_year', 'CoverageField8_month', 'CoverageField8_day', 'CoverageField9_year', 'CoverageField9_month', 'CoverageField9_day', 'SalesField7_year', 'SalesField7_month', 'SalesField7_day', 'PersonalField7_year', 'PersonalField7_month', 'PersonalField7_day', 'PersonalField16_year', 'PersonalField16_month', 'PersonalField16_day', 'PersonalField17_year', 'PersonalField17_month', 'PersonalField17_day', 'PersonalField18_year', 'PersonalField18_month', 'PersonalField18_day', 'PersonalField19_year', 'PersonalField19_month', 'PersonalField19_day', 'PropertyField3_year', 'PropertyField3_month', 'PropertyField3_day', 'PropertyField4_year', 'PropertyField4_month', 'PropertyField4_day', 'PropertyField5_year', 'PropertyField5_month', 'PropertyField5_day', 'PropertyField7_year', 'PropertyField7_month', 'PropertyField7_day', '

In [14]:
# Re-impute numeric columns after feature engineering
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
X_train[numeric_cols] = num_imputer.fit_transform(X_train[numeric_cols])
X_valid[numeric_cols] = num_imputer.transform(X_valid[numeric_cols])
test_data[numeric_cols] = num_imputer.transform(test_data[numeric_cols])

In [15]:
# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_valid_scaled = scaler.transform(X_valid[numeric_cols])
X_test_scaled = scaler.transform(test_data[numeric_cols])

print("Scaling complete.")

Scaling complete.


In [17]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, sampling_strategy='minority', n_jobs=-1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Post-SMOTE class distribution:\n", pd.Series(y_train_resampled).value_counts())




Post-SMOTE class distribution:
 QuoteConversion_Flag
0    169487
1    169487
Name: count, dtype: int64


In [18]:
# Timer end
end_time = time.time()
print(f"Pipeline completed in {(end_time - start_time):.2f} seconds.")


Pipeline completed in 178.31 seconds.


MODEL TRAINING

In [19]:
pip install imblearn scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Import libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [21]:
# Timer start
start_time = time.time()

In [22]:
# Load data
train_data = pd.read_csv("C:/Users/shahp/OneDrive/Desktop/train.csv")
test_data = pd.read_csv("C:/Users/shahp/OneDrive/Desktop/test.csv")

print(f"Train shape: {train_data.shape}, Test shape: {test_data.shape}")

Train shape: (260753, 299), Test shape: (173836, 298)


In [23]:
# Prepare features and target
X = train_data.drop(columns=['QuoteConversion_Flag'])
y = train_data['QuoteConversion_Flag'].fillna(train_data['QuoteConversion_Flag'].mode()[0])

In [24]:
# Train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [25]:
# Common columns
common_cols = X_train.columns.intersection(test_data.columns)

# Separate numeric and non-numeric
numeric_cols = X_train[common_cols].select_dtypes(include=['int64', 'float64']).columns
non_numeric_cols = X_train[common_cols].select_dtypes(exclude=['int64', 'float64']).columns

In [26]:
# Impute missing values
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Numeric imputation
X_train[numeric_cols] = num_imputer.fit_transform(X_train[numeric_cols])
X_valid[numeric_cols] = num_imputer.transform(X_valid[numeric_cols])
test_data[numeric_cols] = num_imputer.transform(test_data[numeric_cols])

# Categorical imputation
X_train[non_numeric_cols] = cat_imputer.fit_transform(X_train[non_numeric_cols])
X_valid[non_numeric_cols] = cat_imputer.transform(X_valid[non_numeric_cols])
test_data[non_numeric_cols] = cat_imputer.transform(test_data[non_numeric_cols])

In [27]:
# Process potential date columns
for col in non_numeric_cols:
    try:
        X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
        X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
        test_data[col] = pd.to_datetime(test_data[col], errors='coerce')

        for df in [X_train, X_valid, test_data]:
            df[f"{col}_year"] = df[col].dt.year
            df[f"{col}_month"] = df[col].dt.month
            df[f"{col}_day"] = df[col].dt.day
            df.drop(columns=col, inplace=True)

        print(f"Processed datetime column: {col}")

    except Exception:
        continue

# Drop all-NaN columns
def drop_all_nan(df_list):
    nan_cols = df_list[0].columns[df_list[0].isnull().all()]
    for df in df_list:
        df.drop(columns=nan_cols, inplace=True)

drop_all_nan([X_train, X_valid, test_data])

  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Original_Quote_Date


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field6


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field10


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: Field12


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: CoverageField8


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: CoverageField9


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: SalesField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField16


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField17


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField18


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PersonalField19


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField3


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField4


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField5


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField7


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField14


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField28


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField30


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField31


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField32


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField33


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField34


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField36


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField37


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: PropertyField38


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: GeographicField63


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_valid[col] = pd.to_datetime(X_valid[col], errors='coerce')
  test_data[col] = pd.to_datetime(test_data[col], errors='coerce')
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day
  df[f"{col}_year"] = df[col].dt.year
  df[f"{col}_month"] = df[col].dt.month
  df[f"{col}_day"] = df[col].dt.day


Processed datetime column: GeographicField64


In [28]:
# Re-impute numeric columns
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
X_train[numeric_cols] = num_imputer.fit_transform(X_train[numeric_cols])
X_valid[numeric_cols] = num_imputer.transform(X_valid[numeric_cols])
test_data[numeric_cols] = num_imputer.transform(test_data[numeric_cols])

In [29]:
# Scaling numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_valid_scaled = scaler.transform(X_valid[numeric_cols])
X_test_scaled = scaler.transform(test_data[numeric_cols])

print("Scaling complete.")

Scaling complete.


In [30]:
# Dimensionality reduction to speed up
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("PCA complete. Reduced to 50 components.")

PCA complete. Reduced to 50 components.


In [31]:
# SMOTE for class imbalance
smote = SMOTE(random_state=42, sampling_strategy='minority', n_jobs=-1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pca, y_train)

print("Post-SMOTE class distribution:\n", pd.Series(y_train_resampled).value_counts())



Post-SMOTE class distribution:
 QuoteConversion_Flag
0    169487
1    169487
Name: count, dtype: int64


In [32]:
# Model Training — Option 1: Logistic Regression (with Regularization)
log_reg = LogisticRegression(max_iter=1000, penalty='l2', C=1.0, solver='lbfgs')
log_reg.fit(X_train_resampled, y_train_resampled)

# Evaluation
y_valid_pred = log_reg.predict(X_valid_pca)
y_valid_proba = log_reg.predict_proba(X_valid_pca)[:, 1]

print("\nLogistic Regression Classification Report:")
print(classification_report(y_valid, y_valid_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_valid, y_valid_proba):.4f}")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.74      0.83     42372
           1       0.41      0.78      0.54      9779

    accuracy                           0.75     52151
   macro avg       0.67      0.76      0.68     52151
weighted avg       0.84      0.75      0.77     52151

ROC-AUC Score: 0.8379


In [33]:
# Model Training — Option 2: Random Forest 
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train_resampled, y_train_resampled)

# Evaluation
y_valid_rf_pred = rf.predict(X_valid_pca)
y_valid_rf_proba = rf.predict_proba(X_valid_pca)[:, 1]

print("\nRandom Forest Classification Report:")
print(classification_report(y_valid, y_valid_rf_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_valid, y_valid_rf_proba):.4f}")


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.72      0.81     42372
           1       0.40      0.81      0.53      9779

    accuracy                           0.73     52151
   macro avg       0.67      0.76      0.67     52151
weighted avg       0.84      0.73      0.76     52151

ROC-AUC Score: 0.8427


In [34]:
# Predict on Test Data using best model
test_predictions = rf.predict(X_test_pca)

# Export predictions
submission = pd.DataFrame({
    "Id": test_data.index,
    "QuoteConversion_Flag": test_predictions
})

submission.to_csv("C:/Users/shahp/OneDrive/Desktop/test_predictions.csv", index=False)
print("\nPredictions exported to test_predictions.csv")


Predictions exported to test_predictions.csv


In [35]:
# Timer end
end_time = time.time()
print(f"\nPipeline completed in {(end_time - start_time):.2f} seconds.")


Pipeline completed in 244.91 seconds.
