In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import re
from ast import literal_eval
from datetime import datetime
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

In [None]:
###############################################################################
# TextProcessor
###############################################################################

In [8]:
class TextProcessor:
    def __init__(self, max_features=300):
        self.tfidf_models = {}
        self.count_models = {}
        self.svd_models = {}
        self.nmf_models = {}
        self.max_features = max_features
    
    def clean_text(self, text):
        if pd.isna(text):
            return ''
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', 'NUM', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def process_list(self, text):
        """Convert list-like strings to a list of cleaned tokens."""
        if pd.isna(text) or text == '':
            return []
        try:
            items = literal_eval(text)
            return [self.clean_text(item) for item in items]
        except:
            return [self.clean_text(item) for item in str(text).split(',')]

    def fit_transform_text(self, texts, feature_name):
        processed_texts = [
            ' '.join(self.process_list(text)) if isinstance(text, str) else ''
            for text in texts
        ]
        
        # TF-IDF
        self.tfidf_models[feature_name] = TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=(1, 3),
            stop_words='english'
        )
        tfidf_matrix = self.tfidf_models[feature_name].fit_transform(processed_texts)
        
        # Count
        self.count_models[feature_name] = CountVectorizer(
            max_features=self.max_features // 2,
            ngram_range=(1, 2),
            stop_words='english'
        )
        count_matrix = self.count_models[feature_name].fit_transform(processed_texts)
        
        # SVD
        self.svd_models[feature_name] = TruncatedSVD(n_components=50, random_state=42)
        svd_matrix = self.svd_models[feature_name].fit_transform(tfidf_matrix)
        
        # NMF
        self.nmf_models[feature_name] = NMF(n_components=30, random_state=42)
        nmf_matrix = self.nmf_models[feature_name].fit_transform(tfidf_matrix)
        
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])

    def transform_text(self, texts, feature_name):
        processed_texts = [
            ' '.join(self.process_list(text)) if isinstance(text, str) else ''
            for text in texts
        ]
        
        tfidf_matrix = self.tfidf_models[feature_name].transform(processed_texts)
        count_matrix = self.count_models[feature_name].transform(processed_texts)
        svd_matrix = self.svd_models[feature_name].transform(tfidf_matrix)
        nmf_matrix = self.nmf_models[feature_name].transform(tfidf_matrix)
        
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])

###############################################################################
# FeatureEngineer
###############################################################################
class FeatureEngineer:
    def __init__(self):
        self.text_processor = TextProcessor()
        self.scaler = StandardScaler()
        self.power_transformer = PowerTransformer(method='yeo-johnson', standardize=False)  
        # You can also try method='box-cox' if data > 0.

    def extract_years_experience(self, row):
        try:
            start_years = [int(y) for y in re.findall(r'\d{4}', str(row['start_dates']))]
            end_years = [int(y) for y in re.findall(r'\d{4}', str(row['end_dates']))]
            if not end_years:
                end_years = [2024]
            experiences = [e - s for s, e in zip(start_years, end_years)]
            return {
                'total_experience': sum(experiences),
                'max_experience': max(experiences) if experiences else 0,
                'num_positions': len(experiences)
            }
        except:
            return {'total_experience': 0, 'max_experience': 0, 'num_positions': 0}
    
    def clean_education_result(self, result_str):
        if pd.isna(result_str):
            return 0
        try:
            # Handle list-like strings
            if result_str.startswith('['):
                result_str = literal_eval(result_str)[0]
            
            result_str = str(result_str).upper()
            if result_str in ['N/A', 'NONE', 'NAN', '']:
                return 0
                
            # Remove % and convert to float
            result_str = result_str.replace('%', '')
            return float(result_str)
        except:
            return 0
    
    def extract_education_features(self, row):
        try:
            degree = str(row['degree_names']).lower() if not pd.isna(row['degree_names']) else ''
            result = self.clean_education_result(row['educational_results'])
            
            edu_score = 0
            if 'phd' in degree or 'doctorate' in degree:
                edu_score = 4
            elif 'master' in degree:
                edu_score = 3
            elif 'bachelor' in degree or 'bsc' in degree or 'ba' in degree:
                edu_score = 2
            elif 'diploma' in degree or 'certificate' in degree:
                edu_score = 1
                
            return {
                'education_score': edu_score,
                'education_result': result,
                'education_weight': edu_score * (result/100 if result > 0 else 1)
            }
        except:
            return {
                'education_score': 0,
                'education_result': 0,
                'education_weight': 0
            }

    def transform(self, df, is_train=True):
        # Numeric features from experience and education
        exp_features = df.apply(self.extract_years_experience, axis=1)
        edu_features = df.apply(self.extract_education_features, axis=1)
        
        feature_dict = {}
        for feat in ['total_experience', 'max_experience', 'num_positions']:
            feature_dict[feat] = [x[feat] for x in exp_features]
        for feat in ['education_score', 'education_result', 'education_weight']:
            feature_dict[feat] = [x[feat] for x in edu_features]
        
        # Basic numeric count features
        feature_dict['num_skills'] = df['skills'].fillna('').str.count(',') + 1
        feature_dict['has_certification'] = (~df['certification_skills'].isna()).astype(int)
        feature_dict['num_languages'] = df['languages'].fillna('').str.count(',') + 1

        # Additional *interaction* features to give the model more nuance
        feature_dict['experience_per_position'] = np.array(feature_dict['total_experience']) / (
            np.array(feature_dict['num_positions']) + 0.1
        )
        feature_dict['result_x_edu_score'] = (
            np.array(feature_dict['education_result']) * np.array(feature_dict['education_score'])
        )

        # Text features
        text_features = [
            'skills', 'career_objective', 'responsibilities',
            'educational_institution_name', 'certification_skills',
            'major_field_of_studies'
        ]
        
        all_text_features = {}
        for feature in text_features:
            if is_train:
                text_matrix = self.text_processor.fit_transform_text(df[feature], feature)
            else:
                text_matrix = self.text_processor.transform_text(df[feature], feature)
            
            # Keep dimension names stable
            for i in range(text_matrix.shape[1]):
                all_text_features[f'{feature}_text_{i}'] = text_matrix[:, i]
        
        # Skills match ratio
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        required_skills = df['skills_required'].apply(self.text_processor.process_list)
        candidate_skills = df['skills'].apply(self.text_processor.process_list)
        
        match_ratios = []
        for req, cand in zip(required_skills, candidate_skills):
            req_set = set(req)
            cand_set = set(cand)
            if len(req_set) == 0:
                match_ratios.append(0.0)
            else:
                match_ratios.append(len(req_set.intersection(cand_set)) / len(req_set))
        feature_dict['skills_match_ratio'] = match_ratios
        
        # Combine numeric features
        numeric_df = pd.DataFrame(feature_dict, index=df.index)
        
        # Power transform or scale numeric features
        # (You can also chain PowerTransformer -> StandardScaler, etc. to see if it helps)
        if is_train:
            numeric_arr = self.power_transformer.fit_transform(numeric_df)
            numeric_arr = StandardScaler().fit_transform(numeric_arr)  # extra standard scaling
        else:
            numeric_arr = self.power_transformer.transform(numeric_df)
            numeric_arr = StandardScaler().fit_transform(numeric_arr)
        
        numeric_df = pd.DataFrame(numeric_arr, columns=numeric_df.columns, index=numeric_df.index)
        
        # Combine with text features
        text_feature_df = pd.DataFrame(all_text_features, index=df.index)
        
        # Final combined feature matrix
        return pd.concat([numeric_df, text_feature_df], axis=1)

###############################################################################
# Training and Ensembling
###############################################################################
def train_and_ensemble():
    train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
    test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

    fe = FeatureEngineer()

    print("Transforming train data...")
    train_features = fe.transform(train_df, is_train=True)
    print("Transforming test data...")
    test_features = fe.transform(test_df, is_train=False)

    y = train_df['matched_score'].values

    # We will do a small manual hyperparam search for LightGBM
    # (Replace with Optuna or Hyperopt for a more thorough search)
    lgb_params_list = [
        {
            'objective': 'regression_l2',
            'metric': 'l2',
            'num_leaves': 31,
            'learning_rate': 0.005,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'reg_alpha': 0.1,
            'reg_lambda': 0.1,
            'min_child_samples': 20,
            'force_col_wise': True,
            'seed': 42
        },
        {
            'objective': 'regression_l2',
            'metric': 'l2',
            'num_leaves': 64,
            'learning_rate': 0.003,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9,
            'bagging_freq': 3,
            'reg_alpha': 0.2,
            'reg_lambda': 0.2,
            'min_child_samples': 25,
            'force_col_wise': True,
            'max_bin': 255,
            'seed': 42
        }
    ]

    # XGBoost parameters
    xgb_params = {
        'objective': 'reg:squarederror',
        'learning_rate': 0.005,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'lambda': 2,
        'alpha': 1,
        'seed': 42
    }

    # CatBoost parameters
    cat_params = {
        'iterations': 3000,
        'learning_rate': 0.006,
        'depth': 6,
        'eval_metric': 'MSE',
        'loss_function': 'MSE',
        'random_seed': 42,
        'use_best_model': True,
        'verbose': 100
    }

    # We’ll do repeated cross validation to get more robust estimates
    rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)  # total 10 folds

    # We'll store out-of-fold predictions for each model
    oof_preds_lgb_list = []
    oof_preds_xgb = np.zeros(len(train_df))
    oof_preds_cat = np.zeros(len(train_df))

    # We'll also keep an array for each test prediction to average them later
    test_preds_lgb_list = []
    test_preds_xgb = np.zeros(len(test_df))
    test_preds_cat = np.zeros(len(test_df))

    # Loop through different sets of LightGBM params
    for lgb_params in lgb_params_list:
        oof_preds_lgb = np.zeros(len(train_df))
        test_preds_lgb = np.zeros(len(test_df))
        
        # Cross-validation loop
        for fold, (train_idx, val_idx) in enumerate(rkf.split(train_features)):
            X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            train_data_lgb = lgb.Dataset(X_train, label=y_train)
            val_data_lgb = lgb.Dataset(X_val, label=y_val)

            model_lgb = lgb.train(
                lgb_params,
                train_data_lgb,
                num_boost_round=5000,
                valid_sets=[train_data_lgb, val_data_lgb],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=100),
                    lgb.log_evaluation(100)
                ]
            )

            oof_preds_lgb[val_idx] = model_lgb.predict(X_val)
            test_preds_lgb += model_lgb.predict(test_features) / rkf.get_n_splits()

        oof_preds_lgb_list.append(oof_preds_lgb)
        test_preds_lgb_list.append(test_preds_lgb)

    # Train a single XGBoost model with the same folds
    for fold, (train_idx, val_idx) in enumerate(rkf.split(train_features)):
        X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        xgb_train = xgb.DMatrix(X_train, y_train)
        xgb_val = xgb.DMatrix(X_val, y_val)
        xgb_test = xgb.DMatrix(test_features)

        model_xgb = xgb.train(
            xgb_params,
            xgb_train,
            num_boost_round=5000,
            evals=[(xgb_val, 'val')],
            early_stopping_rounds=100,
            verbose_eval=False
        )

        oof_preds_xgb[val_idx] = model_xgb.predict(xgb_val)
        test_preds_xgb += model_xgb.predict(xgb_test) / rkf.get_n_splits()

    # Train a single CatBoost model with the same folds
    for fold, (train_idx, val_idx) in enumerate(rkf.split(train_features)):
        X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model_cat = cb.CatBoostRegressor(**cat_params)
        model_cat.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            use_best_model=True,
            verbose=False  # set to True if you want to see the CatBoost logs
        )
        oof_preds_cat[val_idx] = model_cat.predict(X_val)
        test_preds_cat += model_cat.predict(test_features) / rkf.get_n_splits()

    # Evaluate each set of oof predictions for LGB
    for i, oof_lgb in enumerate(oof_preds_lgb_list):
        score = mean_squared_error(y, oof_lgb)
        print(f"LGB Model {i+1} CV MSE: {score:.6f}")

    # Evaluate XGB
    xgb_score = mean_squared_error(y, oof_preds_xgb)
    print(f"XGB CV MSE: {xgb_score:.6f}")

    # Evaluate CatBoost
    cat_score = mean_squared_error(y, oof_preds_cat)
    print(f"CATBoost CV MSE: {cat_score:.6f}")

    # Simple ensemble #1:
    # Average all LightGBM versions + XGB + Cat
    # If you had 2 LGB sets of oof predictions, you can average them
    avg_lgb = np.mean(np.column_stack(oof_preds_lgb_list), axis=1)
    oof_ensemble_1 = (avg_lgb + oof_preds_xgb + oof_preds_cat) / 3.0
    ensemble_1_score = mean_squared_error(y, oof_ensemble_1)
    print(f"Ensemble #1 (mean of LGBs + XGB + CAT) CV MSE: {ensemble_1_score:.6f}")

    # Predict on the test set with Ensemble #1
    avg_lgb_test = np.mean(np.column_stack(test_preds_lgb_list), axis=1)
    test_preds_ensemble_1 = (avg_lgb_test + test_preds_xgb + test_preds_cat) / 3.0

    # You can also experiment with different weighting strategies
    # For instance: if LGB is strong, do something like
    # ensemble_pred = 0.5*avg_lgb + 0.25*xgb + 0.25*cat
    # Or pick weights based on CV performance of each model.
    
    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_preds_ensemble_1
    })
    submission.to_csv('submission_ensemble.csv', index=False)
    print("Ensemble submission saved to submission_ensemble.csv")

In [9]:
if __name__ == "__main__":
    train_and_ensemble()

Transforming train data...


  if pd.isna(text):


Transforming test data...


  if pd.isna(text):


[LightGBM] [Info] Total Bins 66483
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 2631
[LightGBM] [Info] Start training from score 0.658422
Training until validation scores don't improve for 100 rounds
[100]	training's l2: 0.0189821	valid_1's l2: 0.0190193
[200]	training's l2: 0.0145127	valid_1's l2: 0.0157731
[300]	training's l2: 0.0120032	valid_1's l2: 0.0140275
[400]	training's l2: 0.0103249	valid_1's l2: 0.0129199
[500]	training's l2: 0.0091082	valid_1's l2: 0.0120948
[600]	training's l2: 0.00819705	valid_1's l2: 0.0114649
[700]	training's l2: 0.00752525	valid_1's l2: 0.0110112
[800]	training's l2: 0.00698283	valid_1's l2: 0.0106508
[900]	training's l2: 0.00655994	valid_1's l2: 0.0103949
[1000]	training's l2: 0.00618909	valid_1's l2: 0.0101748
[1100]	training's l2: 0.00588546	valid_1's l2: 0.0100136
[1200]	training's l2: 0.00561879	valid_1's l2: 0.00987969
[1300]	training's l2: 0.00537029	valid_1's l2: 0.00974495
[1400]	training's l2: 0.005