In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import re
from ast import literal_eval
from datetime import datetime
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [2]:
###############################################################################
# TextProcessor
###############################################################################

In [3]:
class TextProcessor:
    def __init__(self, max_features=300):
        self.tfidf_models = {}
        self.count_models = {}
        self.svd_models = {}
        self.nmf_models = {}
        self.max_features = max_features

    def clean_text(self, text):
        if pd.isna(text):
            return ''
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', 'NUM', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def process_list(self, text):
        if pd.isna(text) or text == '':
            return []
        try:
            items = literal_eval(text)
            return [self.clean_text(item) for item in items]
        except:
            return [self.clean_text(item) for item in str(text).split(',')]

    def fit_transform_text(self, texts, feature_name):
        processed_texts = [
            ' '.join(self.process_list(text)) if isinstance(text, str) else ''
            for text in texts
        ]
        self.tfidf_models[feature_name] = TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=(1, 3),
            stop_words='english'
        )
        tfidf_matrix = self.tfidf_models[feature_name].fit_transform(processed_texts)
        self.count_models[feature_name] = CountVectorizer(
            max_features=self.max_features // 2,
            ngram_range=(1, 2),
            stop_words='english'
        )
        count_matrix = self.count_models[feature_name].fit_transform(processed_texts)
        self.svd_models[feature_name] = TruncatedSVD(n_components=50, random_state=42)
        svd_matrix = self.svd_models[feature_name].fit_transform(tfidf_matrix)
        self.nmf_models[feature_name] = NMF(n_components=30, random_state=42)
        nmf_matrix = self.nmf_models[feature_name].fit_transform(tfidf_matrix)
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])

    def transform_text(self, texts, feature_name):
        processed_texts = [
            ' '.join(self.process_list(text)) if isinstance(text, str) else ''
            for text in texts
        ]
        tfidf_matrix = self.tfidf_models[feature_name].transform(processed_texts)
        count_matrix = self.count_models[feature_name].transform(processed_texts)
        svd_matrix = self.svd_models[feature_name].transform(tfidf_matrix)
        nmf_matrix = self.nmf_models[feature_name].transform(tfidf_matrix)
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])


In [4]:
FEATURE_WEIGHTS = {
    'skills_required': 0.12,
    'locations': 0.10,
    'experience_requirement': 0.09,
    'job_position_name': 0.08,
    'educational_requirements': 0.07,
    'major_field_of_studies': 0.06,
    'responsibilities.1': 0.05,
    'passing_years': 0.05,
    'career_objective': 0.04,
    'skills': 0.03,
    'age_requirement': 0.03,
    'start_dates': 0.02,
    'responsibilities': 0.02
}

In [5]:
class FeatureEngineer:
    def __init__(self):
        self.text_processor = TextProcessor()
        self.scaler = StandardScaler()
        self.power_transformer = PowerTransformer(method='yeo-johnson', standardize=False)
        self.categorical_encoders = {}

    def extract_years_experience(self, row):
        try:
            start_years = [int(y) for y in re.findall(r'\d{4}', str(row['start_dates']))]
            end_years = [int(y) for y in re.findall(r'\d{4}', str(row['end_dates']))]
            if not end_years:
                end_years = [2024]
            experiences = [e - s for s, e in zip(start_years, end_years)]
            return {
                'total_experience': sum(experiences),
                'max_experience': max(experiences) if experiences else 0,
                'num_positions': len(experiences)
            }
        except:
            return {'total_experience': 0, 'max_experience': 0, 'num_positions': 0}

    def clean_education_result(self, result_str):
        if pd.isna(result_str):
            return 0
        try:
            if result_str.startswith('['):
                result_str = literal_eval(result_str)[0]
            result_str = str(result_str).upper()
            if result_str in ['N/A', 'NONE', 'NAN', '']:
                return 0
            result_str = result_str.replace('%', '')
            return float(result_str)
        except:
            return 0

    def extract_education_features(self, row):
        try:
            degree = str(row['degree_names']).lower() if not pd.isna(row['degree_names']) else ''
            result = self.clean_education_result(row['educational_results'])
            edu_score = 0
            if 'phd' in degree or 'doctorate' in degree:
                edu_score = 4
            elif 'master' in degree:
                edu_score = 3
            elif 'bachelor' in degree or 'bsc' in degree or 'ba' in degree:
                edu_score = 2
            elif 'diploma' in degree or 'certificate' in degree:
                edu_score = 1
            return {
                'education_score': edu_score,
                'education_result': result,
                'education_weight': edu_score * (result / 100 if result > 0 else 1)
            }
        except:
            return {
                'education_score': 0,
                'education_result': 0,
                'education_weight': 0
            }

    def calculate_weighted_feature(self, df):
        """Calculates a composite feature using FEATURE_WEIGHTS."""
        df['composite_feature'] = 0  # Initialize composite feature
        for feature, weight in FEATURE_WEIGHTS.items():
            if feature in df.columns:
                numeric_feature = pd.to_numeric(df[feature], errors='coerce').fillna(0)
                df['composite_feature'] += numeric_feature * weight
        return df

    def encode_categorical_features(self, df, categorical_columns, is_train=True):
        """
        Encodes categorical columns using One-Hot or Label Encoding.
        """
        encoded_dfs = []
        for col in categorical_columns:
            if is_train:
                if df[col].nunique() <= 50:  # Threshold for OneHotEncoding
                    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
                    encoded = encoder.fit_transform(df[[col]].fillna('Unknown'))
                    self.categorical_encoders[col] = encoder
                    df_encoded = pd.DataFrame(
                        encoded,
                        columns=[f"{col}_{cat}" for cat in encoder.categories_[0]],
                        index=df.index
                    )
                else:
                    encoder = LabelEncoder()
                    encoded = encoder.fit_transform(df[col].fillna('Unknown'))
                    self.categorical_encoders[col] = encoder
                    df_encoded = pd.DataFrame(
                        {f"{col}_label": encoded}, index=df.index
                    )
            else:
                encoder = self.categorical_encoders.get(col)
                if encoder:
                    if isinstance(encoder, OneHotEncoder):
                        encoded = encoder.transform(df[[col]].fillna('Unknown'))
                        df_encoded = pd.DataFrame(
                            encoded,
                            columns=[f"{col}_{cat}" for cat in encoder.categories_[0]],
                            index=df.index
                        )
                    elif isinstance(encoder, LabelEncoder):
                        encoded = encoder.transform(df[col].fillna('Unknown'))
                        df_encoded = pd.DataFrame(
                            {f"{col}_label": encoded}, index=df.index
                        )
                    else:
                        raise ValueError(f"Unsupported encoder type for column '{col}'")
                else:
                    # Handle cases where the encoder is not available
                    print(f"Warning: No encoder found for column '{col}'. Using default encoding.")
                    df_encoded = pd.DataFrame(
                        {f"{col}_label": [0] * len(df)}, index=df.index
                    )
    
            # Append the encoded DataFrame to the list
            encoded_dfs.append(df_encoded)
        
        # Concatenate all encoded columns
        if encoded_dfs:
            return pd.concat(encoded_dfs, axis=1)
        else:
            # If no valid encoding was applied, return an empty DataFrame
            return pd.DataFrame(index=df.index)



    def sanitize_feature_names(self, df):
        """Clean feature names to remove special characters unsupported by LightGBM."""
        df.columns = [
            re.sub(r'[^\w\.-]', '_', col).replace('__', '_') for col in df.columns
        ]
        return df
    def transform(self, df, is_train=True):
        # Numeric features from experience and education
        exp_features = df.apply(self.extract_years_experience, axis=1)
        edu_features = df.apply(self.extract_education_features, axis=1)
    
        feature_dict = {}
        for feat in ['total_experience', 'max_experience', 'num_positions']:
            feature_dict[feat] = [x[feat] for x in exp_features]
        for feat in ['education_score', 'education_result', 'education_weight']:
            feature_dict[feat] = [x[feat] for x in edu_features]
    
        # Basic numeric count features
        feature_dict['num_skills'] = df['skills'].fillna('').str.count(',') + 1
        feature_dict['has_certification'] = (~df['certification_skills'].isna()).astype(int)
        feature_dict['num_languages'] = df['languages'].fillna('').str.count(',') + 1
    
        # Additional *interaction* features
        feature_dict['experience_per_position'] = np.array(feature_dict['total_experience']) / (
            np.array(feature_dict['num_positions']) + 0.1
        )
        feature_dict['result_x_edu_score'] = (
            np.array(feature_dict['education_result']) * np.array(feature_dict['education_score'])
        )
    
        numeric_df = pd.DataFrame(feature_dict, index=df.index)
    
        # Drop constant columns
        constant_cols = numeric_df.columns[numeric_df.nunique() <= 1]
        if len(constant_cols) > 0:
            print(f"Dropping constant columns: {list(constant_cols)}")
            numeric_df = numeric_df.drop(columns=constant_cols)
    
        # Power transform or scale numeric features
        if is_train:
            try:
                numeric_arr = self.power_transformer.fit_transform(numeric_df)
            except Exception as e:
                print(f"PowerTransformer failed: {e}. Falling back to StandardScaler.")
                numeric_arr = self.scaler.fit_transform(numeric_df)
        else:
            try:
                numeric_arr = self.power_transformer.transform(numeric_df)
            except Exception as e:
                print(f"PowerTransformer failed: {e}. Falling back to StandardScaler.")
                numeric_arr = self.scaler.transform(numeric_df)
    
        numeric_df = pd.DataFrame(numeric_arr, columns=numeric_df.columns, index=numeric_df.index)
    
        # Text features
        text_features = [
            'skills', 'career_objective', 'responsibilities',
            'educational_institution_name', 'certification_skills',
            'major_field_of_studies'
        ]
    
        all_text_features = {}
        for feature in text_features:
            if is_train:
                text_matrix = self.text_processor.fit_transform_text(df[feature], feature)
            else:
                text_matrix = self.text_processor.transform_text(df[feature], feature)
    
            for i in range(text_matrix.shape[1]):
                all_text_features[f'{feature}_text_{i}'] = text_matrix[:, i]
    
        text_feature_df = pd.DataFrame(all_text_features, index=df.index)
    
        # Handle categorical features
        categorical_columns = [
             'locations', 'result_types',
            'extra_curricular_activity_types', 'role_positions', 'proficiency_levels'
        ]
        categorical_df = self.encode_categorical_features(df, categorical_columns, is_train)
    
        # Calculate and add weighted composite feature
        df = self.calculate_weighted_feature(df)
        numeric_df['composite_feature'] = df['composite_feature']
    
        # Combine all features
        combined_features = pd.concat([numeric_df, text_feature_df, categorical_df], axis=1)
    
        # Sanitize feature names
        combined_features = self.sanitize_feature_names(combined_features)
    
        # Return the final feature DataFrame
        return combined_features


In [6]:
    train_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
    test_df = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import optuna
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class OptimizedLightGBM:
    def __init__(self, n_trials=50):
        self.n_trials = n_trials
        self.best_params = None
        
    def objective(self, trial, X, y):
        param = {
            'objective': 'regression_l2',
            'metric': 'l2',
            'boosting_type': 'gbdt',
            'device': 'gpu',
            'gpu_platform_id': 0,
            'gpu_device_id': 0,
            'verbose': -1,
            
            # GPU-compatible hyperparameters
            'num_leaves': trial.suggest_int('num_leaves', 31, 128),  # Reduced max leaves
            'max_depth': trial.suggest_int('max_depth', 5, 12),      # Reduced max depth
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.05),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 80),
            'max_bin': trial.suggest_int('max_bin', 63, 255),  # GPU-compatible bin size
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0)
        }
        
        # K-fold cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = []
        
        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val)
            
            try:
                model = lgb.train(
                    param,
                    train_data,
                    valid_sets=[val_data],
                    num_boost_round=1000,
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=50),
                        lgb.log_evaluation(period=0)
                    ]
                )
                
                preds = model.predict(X_val)
                fold_score = mean_squared_error(y_val, preds)
                scores.append(fold_score)
            except lgb.basic.LightGBMError as e:
                # Return a large error score if the parameters are invalid
                return float('inf')
        
        return np.mean(scores)

    def find_best_params(self, X, y):
        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: self.objective(trial, X, y),
                      n_trials=self.n_trials)
        
        self.best_params = study.best_params
        print("\nOptimization Results:")
        print("Best MSE:", study.best_value)
        print("Best parameters:", self.best_params)
        return self.best_params

In [8]:
def main():
    # Load datasets
    # train_df = pd.read_csv('/kaggle/input/kuet-dset-2/train.csv')
    # test_df = pd.read_csv('/kaggle/input/kuet-dset-2/test.csv')
    
    # Feature Engineering
    fe = FeatureEngineer()
    print("Transforming train data...")
    train_features = fe.transform(train_df, is_train=True)
    print("Transforming test data...")
    test_features = fe.transform(test_df, is_train=False)
    
    y = train_df['matched_score'].values
    
    # Find best parameters
    print("Finding optimal hyperparameters...")
    optimizer = OptimizedLightGBM(n_trials=15)
    best_params = optimizer.find_best_params(train_features, y)
    
    # Add fixed parameters to best_params
    best_params.update({
        'objective': 'regression_l2',
        'metric': 'l2',
        'boosting_type': 'gbdt',
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': -1,
        'max_bin': 255  # Ensure GPU compatibility
    })

    # params = {
    #     'objective': 'regression_l2',
    #     'metric': 'l2',
    #     'num_leaves': 96,
    #     'max_depth': 12,
    #     'learning_rate': 0.03199484792860085,
    #     'feature_fraction': 0.6706304075294632,
    #     'bagging_fraction': 0.958672143301124,
    #     'bagging_freq': 3,
    #     'min_child_samples': 65,
    #     'lambda_l1': 0.014964605006024168,
    #     'lambda_l2': 4.20761346366579e-08,
    #     'max_bin': 178,
    #     'gpu_use_dp': True,  # Enable multi-GPU training
    #     'tree_learner': 'data_parallel',  # Use multi-GPU setup
    #     'verbose': -1
        
    # }
    
    # Cross-validation and model training
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train_features))
    test_predictions = np.zeros(len(test_features))
    feature_importance_df = pd.DataFrame()
    fold_scores = []
    
    print("\nTraining final model with best parameters...")
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_features)):
        print(f"\nFold {fold + 1}/5")
        X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)
        
        model = lgb.train(
            best_params,
            # params,
            train_data,
            valid_sets=[train_data, val_data],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(100)
            ]
        )
        
        val_preds = model.predict(X_val)
        oof_predictions[val_idx] = val_preds
        test_predictions += model.predict(test_features) / kf.n_splits
        
        fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(fold_rmse)
        print(f"Fold {fold + 1} RMSE: {fold_rmse:.6f}")
        
        fold_importance = pd.DataFrame({
            "feature": train_features.columns,
            "importance": model.feature_importance(),
            "fold": fold + 1
        })
        feature_importance_df = pd.concat([feature_importance_df, fold_importance], axis=0)
    
    # Calculate final metrics
    final_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
    # final_r2 = r2_score(y, oof_predictions)
    
    print("\nFinal Model Performance:")
    print(f"Average RMSE: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
    print(f"Overall RMSE: {final_rmse:.6f}")
    # print(f"R2 Score: {final_r2:.6f}")
    
    # Save predictions
    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission saved to submission.csv")
    
    # Save feature importance
    feature_importance = (feature_importance_df.groupby('feature')['importance']
                        .mean()
                        .sort_values(ascending=False))
    feature_importance.to_csv('feature_importance.csv')
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return submission, feature_importance_df

from sklearn.model_selection import KFold  # Add this import
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import lightgbm as lgbm

from sklearn.preprocessing import LabelEncoder

# The rest of your code remains the same...

if __name__ == "__main__":
    main()

Transforming train data...
Transforming test data...


[I 2024-12-28 08:55:25,626] A new study created in memory with name: no-name-7effc769-38a5-4446-920a-4d012d3b6831


Finding optimal hyperparameters...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00977158
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00942166
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00892562
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00951987
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0088914


[I 2024-12-28 08:57:36,868] Trial 0 finished with value: 0.009306026182334485 and parameters: {'num_leaves': 54, 'max_depth': 9, 'learning_rate': 0.006154872287389939, 'feature_fraction': 0.7025492087368943, 'bagging_fraction': 0.7331398066744437, 'bagging_freq': 7, 'min_child_samples': 50, 'max_bin': 203, 'min_data_in_leaf': 29, 'lambda_l1': 2.0926831953352057e-06, 'lambda_l2': 0.8061711841850441}. Best is trial 0 with value: 0.009306026182334485.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0103808
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100749
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00940882
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00994871
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00939692


[I 2024-12-28 08:58:31,642] Trial 1 finished with value: 0.009842026222203487 and parameters: {'num_leaves': 57, 'max_depth': 6, 'learning_rate': 0.009514434641048085, 'feature_fraction': 0.6101690442389059, 'bagging_fraction': 0.6174219919768559, 'bagging_freq': 9, 'min_child_samples': 34, 'max_bin': 227, 'min_data_in_leaf': 50, 'lambda_l1': 2.4229861961031528e-06, 'lambda_l2': 6.176227450705077e-08}. Best is trial 0 with value: 0.009306026182334485.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[479]	valid_0's l2: 0.00921662
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[520]	valid_0's l2: 0.00909414
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[432]	valid_0's l2: 0.00876184
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[622]	valid_0's l2: 0.00916681
Training until validation scores don't improve for 50 rounds


[I 2024-12-28 08:59:57,522] Trial 2 finished with value: 0.008958070543431732 and parameters: {'num_leaves': 102, 'max_depth': 10, 'learning_rate': 0.03837075835582365, 'feature_fraction': 0.972429016039639, 'bagging_fraction': 0.6937236374765352, 'bagging_freq': 8, 'min_child_samples': 50, 'max_bin': 199, 'min_data_in_leaf': 31, 'lambda_l1': 1.1336164070058268e-05, 'lambda_l2': 6.608874850927742e-05}. Best is trial 2 with value: 0.008958070543431732.


Early stopping, best iteration is:
[320]	valid_0's l2: 0.00855094
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100944
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00987531
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00931224
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100015
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 0.00922058


[I 2024-12-28 09:02:06,518] Trial 3 finished with value: 0.009700815239765584 and parameters: {'num_leaves': 103, 'max_depth': 8, 'learning_rate': 0.0058822773916529705, 'feature_fraction': 0.8992751060460251, 'bagging_fraction': 0.9284808584056232, 'bagging_freq': 8, 'min_child_samples': 37, 'max_bin': 227, 'min_data_in_leaf': 44, 'lambda_l1': 1.497053737557557e-05, 'lambda_l2': 1.0788079943313457e-06}. Best is trial 2 with value: 0.008958070543431732.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00971898
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00956902
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00895379
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00944445
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00894684


[I 2024-12-28 09:03:44,354] Trial 4 finished with value: 0.009326615897315796 and parameters: {'num_leaves': 115, 'max_depth': 8, 'learning_rate': 0.007034944484598274, 'feature_fraction': 0.6618784543838787, 'bagging_fraction': 0.6525346178284571, 'bagging_freq': 10, 'min_child_samples': 40, 'max_bin': 216, 'min_data_in_leaf': 32, 'lambda_l1': 0.18270520915435076, 'lambda_l2': 0.025016991343771843}. Best is trial 2 with value: 0.008958070543431732.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0106407
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[996]	valid_0's l2: 0.0103327
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00975263
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0103535
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00970788


[I 2024-12-28 09:05:30,842] Trial 5 finished with value: 0.01015745579030512 and parameters: {'num_leaves': 34, 'max_depth': 7, 'learning_rate': 0.0053057959159686415, 'feature_fraction': 0.9788989140271582, 'bagging_fraction': 0.6119538417539857, 'bagging_freq': 4, 'min_child_samples': 70, 'max_bin': 234, 'min_data_in_leaf': 30, 'lambda_l1': 0.05460092252633384, 'lambda_l2': 0.00010606977239928292}. Best is trial 2 with value: 0.008958070543431732.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00926181
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 0.0091207
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[987]	valid_0's l2: 0.0086059
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00915875
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00845879


[I 2024-12-28 09:06:23,137] Trial 6 finished with value: 0.008921188739440877 and parameters: {'num_leaves': 79, 'max_depth': 5, 'learning_rate': 0.02189687463290465, 'feature_fraction': 0.6066665325178305, 'bagging_fraction': 0.6783333362716348, 'bagging_freq': 7, 'min_child_samples': 24, 'max_bin': 181, 'min_data_in_leaf': 39, 'lambda_l1': 0.43823926591852846, 'lambda_l2': 0.033790875751744484}. Best is trial 6 with value: 0.008921188739440877.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00876743
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[994]	valid_0's l2: 0.00878638
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00815204
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[994]	valid_0's l2: 0.00884232
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l2: 0.0082362


[I 2024-12-28 09:07:23,468] Trial 7 finished with value: 0.008556873779141756 and parameters: {'num_leaves': 89, 'max_depth': 6, 'learning_rate': 0.02415471063872619, 'feature_fraction': 0.6138157214350698, 'bagging_fraction': 0.8697311401025435, 'bagging_freq': 5, 'min_child_samples': 70, 'max_bin': 126, 'min_data_in_leaf': 15, 'lambda_l1': 1.1136883975473784e-08, 'lambda_l2': 2.5368014817540747e-06}. Best is trial 7 with value: 0.008556873779141756.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[981]	valid_0's l2: 0.00893985
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[807]	valid_0's l2: 0.00876505
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[871]	valid_0's l2: 0.00829361
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[972]	valid_0's l2: 0.00861123
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00821078


[I 2024-12-28 09:11:02,957] Trial 8 finished with value: 0.008564101702139833 and parameters: {'num_leaves': 78, 'max_depth': 12, 'learning_rate': 0.016763909441942452, 'feature_fraction': 0.9926220965624607, 'bagging_fraction': 0.9230760357766183, 'bagging_freq': 6, 'min_child_samples': 21, 'max_bin': 182, 'min_data_in_leaf': 26, 'lambda_l1': 0.02052710739650274, 'lambda_l2': 1.3754623882330091e-08}. Best is trial 7 with value: 0.008556873779141756.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0092877
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00919339
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0085524
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00912199
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0085655


[I 2024-12-28 09:15:59,633] Trial 9 finished with value: 0.008944195519157953 and parameters: {'num_leaves': 107, 'max_depth': 12, 'learning_rate': 0.006080464889033302, 'feature_fraction': 0.8492894417442789, 'bagging_fraction': 0.8384733870895533, 'bagging_freq': 10, 'min_child_samples': 76, 'max_bin': 186, 'min_data_in_leaf': 14, 'lambda_l1': 7.760554073236107e-05, 'lambda_l2': 3.9162312284768616e-08}. Best is trial 7 with value: 0.008556873779141756.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00911869
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00884449
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 0.00850701
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00913845
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[986]	valid_0's l2: 0.00853583


[I 2024-12-28 09:16:36,941] Trial 10 finished with value: 0.008828892739316276 and parameters: {'num_leaves': 126, 'max_depth': 5, 'learning_rate': 0.048320174979913616, 'feature_fraction': 0.7524165125343297, 'bagging_fraction': 0.9910751000294249, 'bagging_freq': 1, 'min_child_samples': 64, 'max_bin': 103, 'min_data_in_leaf': 12, 'lambda_l1': 1.2114152338714371e-08, 'lambda_l2': 3.246769602102152e-06}. Best is trial 7 with value: 0.008556873779141756.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[980]	valid_0's l2: 0.00856812
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[696]	valid_0's l2: 0.00867431
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[859]	valid_0's l2: 0.0081307
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[978]	valid_0's l2: 0.00854423
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[756]	valid_0's l2: 0.00813901


[I 2024-12-28 09:19:25,087] Trial 11 finished with value: 0.008411271907315278 and parameters: {'num_leaves': 81, 'max_depth': 12, 'learning_rate': 0.018528159776261856, 'feature_fraction': 0.7987037372929257, 'bagging_fraction': 0.8565989252784519, 'bagging_freq': 4, 'min_child_samples': 20, 'max_bin': 134, 'min_data_in_leaf': 21, 'lambda_l1': 0.004358449390820413, 'lambda_l2': 2.4609170104023255e-06}. Best is trial 11 with value: 0.008411271907315278.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[929]	valid_0's l2: 0.00861981
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[772]	valid_0's l2: 0.00872866
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[812]	valid_0's l2: 0.00820868
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00855804
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[944]	valid_0's l2: 0.00794114


[I 2024-12-28 09:21:38,638] Trial 12 finished with value: 0.008411267190948798 and parameters: {'num_leaves': 89, 'max_depth': 10, 'learning_rate': 0.025239837770155153, 'feature_fraction': 0.7905087614604197, 'bagging_fraction': 0.8071296449895503, 'bagging_freq': 4, 'min_child_samples': 61, 'max_bin': 119, 'min_data_in_leaf': 20, 'lambda_l1': 0.002331428802554355, 'lambda_l2': 3.2450392478802307e-06}. Best is trial 12 with value: 0.008411267190948798.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l2: 0.00895116
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00861326
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[996]	valid_0's l2: 0.00817434
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00870107
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l2: 0.00809238


[I 2024-12-28 09:23:55,819] Trial 13 finished with value: 0.008506441522299128 and parameters: {'num_leaves': 71, 'max_depth': 11, 'learning_rate': 0.011829931133338667, 'feature_fraction': 0.7957437994216536, 'bagging_fraction': 0.7654674115436348, 'bagging_freq': 3, 'min_child_samples': 57, 'max_bin': 70, 'min_data_in_leaf': 20, 'lambda_l1': 0.0013841463302205967, 'lambda_l2': 0.0009235846387554657}. Best is trial 12 with value: 0.008411267190948798.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[497]	valid_0's l2: 0.0105278
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[994]	valid_0's l2: 0.00986883
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[482]	valid_0's l2: 0.00961639
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[685]	valid_0's l2: 0.010077
Training until validation scores don't improve for 50 rounds


[I 2024-12-28 09:24:29,566] Trial 14 finished with value: 0.00991147880151353 and parameters: {'num_leaves': 90, 'max_depth': 10, 'learning_rate': 0.03161041745295098, 'feature_fraction': 0.8428728847677374, 'bagging_fraction': 0.8190375904773319, 'bagging_freq': 2, 'min_child_samples': 56, 'max_bin': 137, 'min_data_in_leaf': 21, 'lambda_l1': 4.799094658981475, 'lambda_l2': 6.747985543721152e-06}. Best is trial 12 with value: 0.008411267190948798.


Early stopping, best iteration is:
[593]	valid_0's l2: 0.0094667

Optimization Results:
Best MSE: 0.008411267190948798
Best parameters: {'num_leaves': 89, 'max_depth': 10, 'learning_rate': 0.025239837770155153, 'feature_fraction': 0.7905087614604197, 'bagging_fraction': 0.8071296449895503, 'bagging_freq': 4, 'min_child_samples': 61, 'max_bin': 119, 'min_data_in_leaf': 20, 'lambda_l1': 0.002331428802554355, 'lambda_l2': 3.2450392478802307e-06}

Training final model with best parameters...

Fold 1/5
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 0.00623303	valid_1's l2: 0.0110858
[200]	training's l2: 0.00383408	valid_1's l2: 0.00963383
[300]	training's l2: 0.00285192	valid_1's l2: 0.00919984
[400]	training's l2: 0.00217276	valid_1's l2: 0.00903814
[500]	training's l2: 0.00165702	valid_1's l2: 0.00890227
[600]	training's l2: 0.00127159	valid_1's l2: 0.00879977
[700]	training's l2: 0.000984445	valid_1's l2: 0.00874909
Early stopping, best iteration is:
[7

In [9]:
print(train_df.columns)
print(test_df.columns)


Index(['address', 'career_objective', 'skills', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'company_urls',
       'start_dates', 'end_dates', 'related_skils_in_job', 'positions',
       'locations', 'responsibilities', 'extra_curricular_activity_types',
       'extra_curricular_organization_names',
       'extra_curricular_organization_links', 'role_positions', 'languages',
       'proficiency_levels', 'certification_providers', 'certification_skills',
       'online_links', 'issue_dates', 'expiry_dates', '﻿job_position_name',
       'educationaL_requirements', 'experiencere_requirement',
       'age_requirement', 'responsibilities.1', 'skills_required',
       'matched_score', 'composite_feature'],
      dtype='object')
Index(['ID', 'address', 'career_objective', 'skills',
       'educational_institution_name', 'degree_names', 'passing_years',
       'educ

In [10]:
if __name__ == "__main__":
    main()

Transforming train data...
Transforming test data...


[I 2024-12-28 09:27:09,861] A new study created in memory with name: no-name-8ad44b6c-03e5-4ba8-ad4c-61817efe2e17


Finding optimal hyperparameters...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[703]	valid_0's l2: 0.0116467
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[685]	valid_0's l2: 0.0110215
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[840]	valid_0's l2: 0.0104434
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[849]	valid_0's l2: 0.0110018
Training until validation scores don't improve for 50 rounds


[I 2024-12-28 09:27:50,016] Trial 0 finished with value: 0.010909732021062503 and parameters: {'num_leaves': 71, 'max_depth': 8, 'learning_rate': 0.012894491887203617, 'feature_fraction': 0.997715990620524, 'bagging_fraction': 0.9730360917797842, 'bagging_freq': 4, 'min_child_samples': 25, 'max_bin': 137, 'min_data_in_leaf': 48, 'lambda_l1': 6.678670662880274, 'lambda_l2': 0.6956195935402643}. Best is trial 0 with value: 0.010909732021062503.


Early stopping, best iteration is:
[735]	valid_0's l2: 0.0104351
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[459]	valid_0's l2: 0.0088628
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[652]	valid_0's l2: 0.0085775
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[464]	valid_0's l2: 0.00825471
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[376]	valid_0's l2: 0.00886953
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[680]	valid_0's l2: 0.00825746


[I 2024-12-28 09:29:29,363] Trial 1 finished with value: 0.008564398806034606 and parameters: {'num_leaves': 81, 'max_depth': 12, 'learning_rate': 0.03412878379314154, 'feature_fraction': 0.7588588475769372, 'bagging_fraction': 0.8372294199572821, 'bagging_freq': 7, 'min_child_samples': 49, 'max_bin': 191, 'min_data_in_leaf': 36, 'lambda_l1': 0.0004524728228896782, 'lambda_l2': 3.5472939661627034e-05}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00984361
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00956792
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00899761
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00964465
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00897623


[I 2024-12-28 09:31:04,340] Trial 2 finished with value: 0.009406004471587969 and parameters: {'num_leaves': 67, 'max_depth': 9, 'learning_rate': 0.0070252168313446745, 'feature_fraction': 0.9068110285249762, 'bagging_fraction': 0.6500584040732887, 'bagging_freq': 4, 'min_child_samples': 33, 'max_bin': 94, 'min_data_in_leaf': 40, 'lambda_l1': 5.7442600503861094e-08, 'lambda_l2': 0.012290333943122329}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00915877
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00900164
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00837578
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's l2: 0.0091355
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00853987


[I 2024-12-28 09:32:17,463] Trial 3 finished with value: 0.008842310014038285 and parameters: {'num_leaves': 99, 'max_depth': 7, 'learning_rate': 0.014179745736947866, 'feature_fraction': 0.7927942767411263, 'bagging_fraction': 0.9544696460972223, 'bagging_freq': 10, 'min_child_samples': 60, 'max_bin': 78, 'min_data_in_leaf': 22, 'lambda_l1': 0.0005894686433975719, 'lambda_l2': 0.00016512072885908397}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0117048
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0113914
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0107838
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0115514
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0108063


[I 2024-12-28 09:33:07,106] Trial 4 finished with value: 0.01124754847855262 and parameters: {'num_leaves': 107, 'max_depth': 5, 'learning_rate': 0.005006856139904181, 'feature_fraction': 0.6512378214032367, 'bagging_fraction': 0.6760027636122246, 'bagging_freq': 8, 'min_child_samples': 68, 'max_bin': 149, 'min_data_in_leaf': 17, 'lambda_l1': 0.0018752115285757928, 'lambda_l2': 1.2718369899120523}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0103077
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100204
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00939614
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00994547
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00943673


[I 2024-12-28 09:33:58,570] Trial 5 finished with value: 0.009821297498622913 and parameters: {'num_leaves': 97, 'max_depth': 5, 'learning_rate': 0.015798400932130852, 'feature_fraction': 0.87362650989143, 'bagging_fraction': 0.7631162227745412, 'bagging_freq': 1, 'min_child_samples': 32, 'max_bin': 243, 'min_data_in_leaf': 45, 'lambda_l1': 3.3371179079691697, 'lambda_l2': 3.0923107674456376e-05}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's l2: 0.0103568
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100425
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0093838
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0100358
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00929355


[I 2024-12-28 09:35:14,953] Trial 6 finished with value: 0.009822515762811877 and parameters: {'num_leaves': 39, 'max_depth': 6, 'learning_rate': 0.008249050363887178, 'feature_fraction': 0.9792062943004343, 'bagging_fraction': 0.6453236415097331, 'bagging_freq': 3, 'min_child_samples': 75, 'max_bin': 248, 'min_data_in_leaf': 38, 'lambda_l1': 0.1300565926295599, 'lambda_l2': 0.0007748051044937304}. Best is trial 1 with value: 0.008564398806034606.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00879109
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00871348
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[993]	valid_0's l2: 0.00813337
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00872885
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00821351


[I 2024-12-28 09:36:15,449] Trial 7 finished with value: 0.008516058893730945 and parameters: {'num_leaves': 38, 'max_depth': 7, 'learning_rate': 0.021663465611820004, 'feature_fraction': 0.8057287499773529, 'bagging_fraction': 0.8094253610736333, 'bagging_freq': 3, 'min_child_samples': 74, 'max_bin': 65, 'min_data_in_leaf': 30, 'lambda_l1': 0.00889013253182949, 'lambda_l2': 8.934989856673769e-08}. Best is trial 7 with value: 0.008516058893730945.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00905044
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00903004
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00838696
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's l2: 0.00902832
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00848327


[I 2024-12-28 09:38:48,049] Trial 8 finished with value: 0.008795805703122364 and parameters: {'num_leaves': 122, 'max_depth': 9, 'learning_rate': 0.0128591747685039, 'feature_fraction': 0.9193599624778676, 'bagging_fraction': 0.9882690917866976, 'bagging_freq': 8, 'min_child_samples': 55, 'max_bin': 154, 'min_data_in_leaf': 21, 'lambda_l1': 9.539514899249026e-06, 'lambda_l2': 0.0006405800492528965}. Best is trial 7 with value: 0.008516058893730945.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0106995
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0103063
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0097591
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.0103176
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.00974607


[I 2024-12-28 09:39:33,923] Trial 9 finished with value: 0.010165710812169156 and parameters: {'num_leaves': 126, 'max_depth': 5, 'learning_rate': 0.008617430727077999, 'feature_fraction': 0.7098614438838488, 'bagging_fraction': 0.7067539437615035, 'bagging_freq': 2, 'min_child_samples': 65, 'max_bin': 126, 'min_data_in_leaf': 20, 'lambda_l1': 2.009639943270269e-07, 'lambda_l2': 0.3137165368746327}. Best is trial 7 with value: 0.008516058893730945.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[916]	valid_0's l2: 0.00870765
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[983]	valid_0's l2: 0.00839495
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[995]	valid_0's l2: 0.00788236
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[981]	valid_0's l2: 0.00844884
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[991]	valid_0's l2: 0.00796491


[I 2024-12-28 09:41:19,594] Trial 10 finished with value: 0.008279740739390553 and parameters: {'num_leaves': 33, 'max_depth': 11, 'learning_rate': 0.031569836374563, 'feature_fraction': 0.6041930574501404, 'bagging_fraction': 0.8628189085120157, 'bagging_freq': 6, 'min_child_samples': 80, 'max_bin': 199, 'min_data_in_leaf': 10, 'lambda_l1': 0.04553229986159843, 'lambda_l2': 1.7491843518391425e-08}. Best is trial 10 with value: 0.008279740739390553.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[804]	valid_0's l2: 0.00875698
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[894]	valid_0's l2: 0.00840571
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's l2: 0.00799974
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[897]	valid_0's l2: 0.00855821
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[784]	valid_0's l2: 0.00798819


[I 2024-12-28 09:42:45,619] Trial 11 finished with value: 0.008341766725837056 and parameters: {'num_leaves': 31, 'max_depth': 11, 'learning_rate': 0.03670920580969825, 'feature_fraction': 0.6370780450939477, 'bagging_fraction': 0.8509061360649933, 'bagging_freq': 6, 'min_child_samples': 80, 'max_bin': 202, 'min_data_in_leaf': 10, 'lambda_l1': 0.04084460024433476, 'lambda_l2': 1.3201087733978354e-08}. Best is trial 10 with value: 0.008279740739390553.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[731]	valid_0's l2: 0.00867257
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[806]	valid_0's l2: 0.00845325
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[751]	valid_0's l2: 0.00783133
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[995]	valid_0's l2: 0.00854051
Training until validation scores don't improve for 50 rounds


[I 2024-12-28 09:44:06,268] Trial 12 finished with value: 0.008256930198270522 and parameters: {'num_leaves': 31, 'max_depth': 11, 'learning_rate': 0.04797783252286095, 'feature_fraction': 0.6061207641214251, 'bagging_fraction': 0.8918973307974841, 'bagging_freq': 6, 'min_child_samples': 80, 'max_bin': 202, 'min_data_in_leaf': 12, 'lambda_l1': 0.09563916804287748, 'lambda_l2': 1.116224045481925e-08}. Best is trial 12 with value: 0.008256930198270522.


Early stopping, best iteration is:
[596]	valid_0's l2: 0.00778699
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[411]	valid_0's l2: 0.00868916
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[648]	valid_0's l2: 0.00845569
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[652]	valid_0's l2: 0.00804942
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[560]	valid_0's l2: 0.00853768
Training until validation scores don't improve for 50 rounds


[I 2024-12-28 09:45:33,775] Trial 13 finished with value: 0.008349227999261553 and parameters: {'num_leaves': 54, 'max_depth': 11, 'learning_rate': 0.04983672657162842, 'feature_fraction': 0.6030853880513323, 'bagging_fraction': 0.9013807499582697, 'bagging_freq': 5, 'min_child_samples': 50, 'max_bin': 197, 'min_data_in_leaf': 12, 'lambda_l1': 0.4572528846656661, 'lambda_l2': 6.005792204209516e-07}. Best is trial 12 with value: 0.008256930198270522.


Early stopping, best iteration is:
[453]	valid_0's l2: 0.00801419
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[995]	valid_0's l2: 0.00872933
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[939]	valid_0's l2: 0.00851946
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[934]	valid_0's l2: 0.00791002
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[986]	valid_0's l2: 0.00849103
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[774]	valid_0's l2: 0.00789618


[I 2024-12-28 09:47:39,654] Trial 14 finished with value: 0.0083092030624888 and parameters: {'num_leaves': 51, 'max_depth': 10, 'learning_rate': 0.02563679291982462, 'feature_fraction': 0.6908011195666701, 'bagging_fraction': 0.8906324921239008, 'bagging_freq': 6, 'min_child_samples': 79, 'max_bin': 223, 'min_data_in_leaf': 27, 'lambda_l1': 4.164718078240208e-05, 'lambda_l2': 1.920486453737221e-06}. Best is trial 12 with value: 0.008256930198270522.



Optimization Results:
Best MSE: 0.008256930198270522
Best parameters: {'num_leaves': 31, 'max_depth': 11, 'learning_rate': 0.04797783252286095, 'feature_fraction': 0.6061207641214251, 'bagging_fraction': 0.8918973307974841, 'bagging_freq': 6, 'min_child_samples': 80, 'max_bin': 202, 'min_data_in_leaf': 12, 'lambda_l1': 0.09563916804287748, 'lambda_l2': 1.116224045481925e-08}

Training final model with best parameters...

Fold 1/5
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 0.00636617	valid_1's l2: 0.0102368
[200]	training's l2: 0.00430577	valid_1's l2: 0.00936505
[300]	training's l2: 0.00318139	valid_1's l2: 0.00898776
[400]	training's l2: 0.00241155	valid_1's l2: 0.00879868
[500]	training's l2: 0.00184315	valid_1's l2: 0.00871847
[600]	training's l2: 0.00143102	valid_1's l2: 0.00865435
[700]	training's l2: 0.00112729	valid_1's l2: 0.00863165
Early stopping, best iteration is:
[653]	training's l2: 0.00126167	valid_1's l2: 0.00862327
Fold 1 RMSE: 0