In [None]:
# ================
# 1. IMPORTS
# ================
import os
import logging
import warnings
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Interpretability
import shap
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# Optimization
import optuna

In [None]:
# ================
# 2. CONFIGURATION
# ================
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_SPLITS_CV = 5
SCORING_METRIC = 'roc_auc'
VERBOSE = 1

CPU_COUNT = os.cpu_count()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Preprocessing

In [None]:
# ================
# 3. Import core waves (including 2024)
# ================

# Specify the directory path (auto-detect WSL/Linux vs Windows UNC)
directory_candidates = [
    r"\\wsl.localhost\Ubuntu\home\siyang\work\vaping_project_data\original_all_core",
    os.path.expanduser('~/work/vaping_project_data/original_all_core'),
    '/home/siyang/work/vaping_project_data/original_all_core',
]

directory_path = next((p for p in directory_candidates if os.path.exists(p)), None)
if directory_path is None:
    raise FileNotFoundError(
        "Could not find data directory. Tried: " + ", ".join(directory_candidates)
    )

# List all files ending with '0810.tsv' in the specified directory
files = [f for f in os.listdir(directory_path) if f.endswith('0810.tsv')]

# Ensure 2024 wave is included even if it wasn't picked up by directory listing
extra_file_candidates = [
    # User-provided WSL UNC path (useful when running from Windows/Jupyter that can see WSL via UNC)
    r"\\wsl.localhost\Ubuntu\home\siyang\work\vaping_project_data\original_all_core\original_core_2024_0810.tsv",
    # Native WSL/Linux path (useful when running inside WSL)
    "/home/siyang/work/vaping_project_data/original_all_core/original_core_2024_0810.tsv",
]

extra_file_path = next((p for p in extra_file_candidates if os.path.exists(p)), None)
if extra_file_path is None:
    warnings.warn(
        "Could not find original_core_2024_0810.tsv at expected paths. "
        "Proceeding with files discovered via directory listing."
    )
else:
    extra_file_name = os.path.basename(extra_file_path)
    if extra_file_name not in files:
        files.append(extra_file_name)

# Create a dictionary to store individual dataframes
dataframes = {}

# Read each file into a separate dataframe
for file in sorted(files):
    # If this is the 2024 file, prefer the explicit path we resolved above
    if file == 'original_core_2024_0810.tsv' and extra_file_path is not None:
        file_path = extra_file_path
    else:
        file_path = os.path.join(directory_path, file)

    try:
        # Use the file name (without extension) as the key
        df_name = file.replace('.tsv', '')  # Remove .tsv from the filename
        # Read the file with low_memory=False to handle mixed types
        dataframes[df_name] = pd.read_csv(file_path, sep='\t', low_memory=False)
        print(f"Successfully read: {file} into dataframe '{df_name}'")
    except Exception as e:
        print(f"Error reading {file}: {e}")

# Example: Accessing a specific dataframe
# df_2024 = dataframes['original_core_2024_0810']

Successfully read: original_core_2017_0810.tsv into dataframe 'original_core_2017_0810'
Successfully read: original_core_2018_0810.tsv into dataframe 'original_core_2018_0810'
Successfully read: original_core_2019_0810.tsv into dataframe 'original_core_2019_0810'
Successfully read: original_core_2020_0810.tsv into dataframe 'original_core_2020_0810'
Successfully read: original_core_2021_0810.tsv into dataframe 'original_core_2021_0810'
Successfully read: original_core_2022_0810.tsv into dataframe 'original_core_2022_0810'
Successfully read: original_core_2023_0810.tsv into dataframe 'original_core_2023_0810'
Successfully read: original_core_2024_0810.tsv into dataframe 'original_core_2024_0810'


In [None]:
# ================
# 4. Basic Info of each dataset
# ================

# Loop through each dataframe in the dictionary
for df_name, df in dataframes.items():
    print(f"=== Basic Information for {df_name} ===")
    
    # Display the first few rows
    print("\nFirst 5 Rows:")
    print(df.head())
    
    # Display the last few rows
    print("\nLast 5 Rows:")
    print(df.tail())
    
    # Get dataset shape
    print(f"\nDataset Shape: {df.shape}")
    
    # Get column names
    print(f"\nColumn Names: {df.columns.tolist()}")
    
    # Get data types
    print(f"\nData Types:\n{df.dtypes}")
    
    # Check for missing values
    print(f"\nMissing Values:\n{df.isnull().sum()}")
    
    # Get summary statistics for numerical columns
    print(f"\nSummary Statistics:\n{df.describe(include='all')}")
    
    # Count unique values in each column
    print(f"\nUnique Values per Column:\n{df.nunique()}")
    
    # Check for duplicate rows
    print(f"\nNumber of Duplicate Rows: {df.duplicated().sum()}")
    
    # Print a separator for readability
    print("\n" + "=" * 50 + "\n")

=== Basic Information for original_core_2017_0810 ===

First 5 Rows:
   CASEID    V1  V3     V4        V5  V501  V507  V508  V509  V7101  ...  \
0       1  2017   4  54591  2.427542     8     2     0     0      1  ...   
1       2  2017   1  12278  0.594924    10     4     0     1      1  ...   
2       3  2017   3  39863  0.110906    10     2     1     1      1  ...   
3       4  2017   2  30554  0.722531     8     4     0     0      1  ...   
4       5  2017   2  25277  0.549932    10     4     1     1     -9  ...   

   V7099D  V7133D  V7134D  V7135D  V7142D  V7143D  V7144D  V7139D  V7140D  \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0      -8      -8      -8      -8      -8   
2       0       0       0       0       0       0       0       0       0   
3      -8       0       0       0       0       0       0      -8      -8   
4      -8       1       1       1       1       1       1      -8      -8   

   V7141D  

In [19]:
# ================
# 5. Inner Join all 7 waves
# ================

# Step 1: Find common columns across all dataframes
common_columns = set(dataframes[next(iter(dataframes))].columns)  # Initialize with columns from the first dataframe
for df in dataframes.values():
    common_columns.intersection_update(df.columns)  # Keep only columns present in all dataframes

# Convert the set of common columns to a list
common_columns = list(common_columns)
print(f"Common Columns: {common_columns}")

# Step 2: Filter each dataframe to keep only the common columns
filtered_dataframes = {}
for df_name, df in dataframes.items():
    filtered_dataframes[df_name] = df[common_columns]
    print(f"Filtered {df_name} to keep common columns.")

# Step 3: Concatenate all filtered dataframes into a single dataframe
merged_df = pd.concat(filtered_dataframes.values(), ignore_index=True)

# Display basic info of the merged dataframe
# Get dataset shape
print(f"\nDataset Shape: {df.shape}")
    
# Get column names
print(f"\nColumn Names: {df.columns.tolist()}")
    
# Get data types
print(f"\nData Types:\n{df.dtypes}")
    
# Check for missing values
print(f"\nMissing Values:\n{df.isnull().sum()}")
    
# Get summary statistics for numerical columns
print(f"\nSummary Statistics:\n{df.describe(include='all')}")
    
# Count unique values in each column
print(f"\nUnique Values per Column:\n{df.nunique()}")
    
# Check for duplicate rows
print(f"\nNumber of Duplicate Rows: {df.duplicated().sum()}")
    
# Print a separator for readability
print("\n" + "=" * 50 + "\n")

Common Columns: ['V7329', 'V7181', 'V8564', 'V7642', 'V7327', 'V7106', 'V7127', 'V7261', 'V8536', 'V7108', 'V7713', 'V7166', 'V7428', 'V7512', 'V501', 'V7602', 'V7391', 'V7390', 'V8542', 'V7543', 'V7241', 'V7114', 'V7319', 'V8508', 'V7704', 'V7425', 'V8482', 'V7229', 'V7455', 'V8555', 'V8444', 'V7426', 'V7417', 'V8519', 'V7499', 'V8501', 'V7408', 'V7234', 'V7387', 'V7312', 'V7471', 'V7240', 'V7214', 'V7311', 'V7415', 'V7233', 'V7353', 'V8509', 'V7445', 'V7498', 'V7468', 'V7128', 'V7478', 'V7348', 'V7354', 'V7579', 'V7384', 'V8538', 'V7569', 'V8513', 'V7404', 'V7673', 'V7603', 'V7254', 'V7320', 'V8425', 'V7431', 'V7443', 'V7116', 'V7540', 'V8441', 'V7671', 'V7495', 'V7372', 'V7549', 'V7402', 'V7410', 'V7451', 'V7462', 'V7236', 'V7447', 'V7405', 'V7345', 'V8540', 'V7429', 'V8543', 'V7180', 'V7258', 'V7538', 'V7162', 'V8503', 'V7591', 'V7223', 'V7465', 'V8560', 'V7515', 'V7453', 'V7448', 'V8565', 'V7185', 'V7508', 'V7145', 'V7423', 'V7422', 'V7414', 'V7663', 'V7330', 'V8504', 'V7412', 'V8