In [1]:
import sqlite3

import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Step 1: Load and Preprocess Data
# Read the SQL text first, then run it against the SQLite database.
# Both the file handle and the connection are released deterministically,
# even when reading the file or executing the query raises.
database_path = "data/data.db"
with open('data/fetch_all.sql', 'r') as sql_file:
    query = sql_file.read()

conn = sqlite3.connect(database_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Always close the connection, including on a failed query.
    conn.close()

# Print the column / dtype summary. info() writes directly to stdout, so it
# shows up even though it is not the last expression of the cell.
df.info()

# Encode the target column: 'no' -> 0, 'yes' -> 1, 'unknown' -> missing (NaN).
# Note that this overwrites the 'y' column of `df` itself.
target_codes = {'no': 0, 'yes': 1, 'unknown': None}
df['y'] = df['y'].map(target_codes)

# Build the modelling table without the columns treated as irrelevant.
irrelevant_columns = ['client_id', 'account_id', 'campaign_id', 'outcome_id']
data_cleaned = df.drop(columns=irrelevant_columns)

# One-hot encode every object-typed column, dropping the first level of each
# so that a feature with k categories yields k-1 indicator columns.
categorical_features = data_cleaned.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = one_hot_encoder.fit_transform(data_cleaned[categorical_features])
encoded_column_names = one_hot_encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_column_names)

# Swap the raw categorical columns for their encoded counterparts. The index is
# reset to 0..n-1 so the two frames line up row-by-row in the concat.
non_categorical = data_cleaned.drop(columns=categorical_features).reset_index(drop=True)
data_cleaned = pd.concat([non_categorical, encoded_df], axis=1)

# Standardise every int64/float64 column except the target 'y'.
# NOTE(review): the selection is by dtype, so the one-hot indicator columns
# added above are presumably scaled as well -- confirm this is intended.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in this cell -- confirm that is acceptable.
numeric_dtype_columns = data_cleaned.select_dtypes(include=['int64', 'float64']).columns
numerical_features = numeric_dtype_columns.drop('y', errors='ignore')
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_cleaned[numerical_features])
data_cleaned[numerical_features] = scaled_values

# Step 2: Train the Model
# Supervised training needs a known target, so keep only rows where 'y' is set.
has_label = data_cleaned['y'].notnull()
data_train = data_cleaned[has_label]
y_train = data_train['y']
X_train = data_train.drop(columns=['y'])

# Hold out 20% of the labelled rows for evaluation. The split is stratified on
# the target so both parts keep the same class ratio, and seeded for repeatability.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
X_train_split, X_test_split, y_train_split, y_test_split = split_parts

# Step 3: Model Selection with LazyPredict
# Fit LazyPredict's set of classifiers on the training split and score each
# one on the held-out split; `models` is the resulting score table.
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_clf.fit(
    X_train_split,
    X_test_split,
    y_train_split,
    y_test_split,
)

# Rank the score table by accuracy, best first.
# NOTE(review): the training log reports 417 positive vs 3199 negative rows,
# so plain accuracy may be a weak ranking criterion here -- confirm.
models = models.sort_values(by='Accuracy', ascending=False)

# Show floats with ten decimals. This changes the pandas display option
# globally, so it also affects anything displayed afterwards.
pd.set_option('display.float_format', '{:.10f}'.format)
print(models)

# Identify the best model based on accuracy (first row after sorting).
best_model_name = models.index[0]
print(f"Best model from LazyPredict: {best_model_name}")

# Train a RandomForestClassifier on the training split.
# The LazyPredict ranking above is informational only: a random forest is
# fitted here regardless of which model ranked first.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_split, y_train_split)

# Step 4: Predict Probabilities for the Full Dataset
# Score every row, labelled or not, and store the result as a new
# 'Probability' column on `data_cleaned`.
# Column 1 of predict_proba is the second entry of model.classes_
# (presumably y == 1 -- TODO confirm).
# NOTE(review): this includes the rows the model was trained on; confirm
# whether already-labelled clients should be eligible for selection.
X_full = data_cleaned.drop(columns=['y'])
positive_proba = model.predict_proba(X_full)[:, 1]
data_cleaned['Probability'] = positive_proba

# Select the top 5,000 clients by predicted probability.
ranked_clients = data_cleaned.sort_values(by='Probability', ascending=False)
selected_clients = ranked_clients.head(5000)

# Step 5: Save Selected Client IDs to CSV
# `data_cleaned` no longer has client_id, so look it up in `df` by row label.
# This relies on both frames sharing the same 0..n-1 index.
selected_client_ids = df[['client_id']].loc[selected_clients.index]
selected_client_ids.to_csv("second_phase_target.csv", index=False, header=['target'])

print("Selected client IDs saved to 'second_phase_target.csv'.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   client_id    45211 non-null  int64  
 1   age          45211 non-null  int64  
 2   job          45211 non-null  object 
 3   marital      45211 non-null  object 
 4   education    45211 non-null  object 
 5   account_id   45211 non-null  int64  
 6   in_default   45211 non-null  object 
 7   balance      45211 non-null  float64
 8   housing      45211 non-null  object 
 9   loan         45211 non-null  object 
 10  contact      45211 non-null  object 
 11  campaign_id  45211 non-null  int64  
 12  day          45211 non-null  int64  
 13  month        45211 non-null  object 
 14  duration     45211 non-null  int64  
 15  campaign     45211 non-null  int64  
 16  pdays        45211 non-null  int64  
 17  previous     45211 non-null  int64  
 18  outcome_id   45211 non-null  int64  
 19  pout

100%|██████████| 31/31 [00:03<00:00,  9.05it/s]

[LightGBM] [Info] Number of positive: 417, number of negative: 3199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 956
[LightGBM] [Info] Number of data points in the train set: 3616, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115321 -> initscore=-2.037507
[LightGBM] [Info] Start training from score -2.037507
                                  Accuracy  Balanced Accuracy      ROC AUC  \
Model                                                                        
RidgeClassifierCV             0.8950276243       0.6227552098 0.6227552098   
RidgeClassifier               0.8950276243       0.6227552098 0.6227552098   
LGBMClassifier                0.8928176796       0.6591580236 0.6591580236   
LinearSVC                     0.8928176796       0.629873




Selected client IDs saved to 'second_phase_target.csv'.
