In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import re
from ast import literal_eval
from datetime import datetime
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [None]:
###############################################################################
# TextProcessor
###############################################################################

In [None]:
class TextProcessor:
    """Convert list-like / free-text columns into dense numeric feature blocks.

    For every named feature the processor fits and stores four models (TF-IDF,
    raw term counts, truncated SVD of the TF-IDF matrix and NMF of the TF-IDF
    matrix) so the identical transformation can later be replayed on unseen
    rows through :meth:`transform_text`.

    Parameters
    ----------
    max_features : int, default 300
        Vocabulary cap of the TF-IDF vectorizer; the count vectorizer uses
        ``max_features // 2``.
    svd_components : int, default 50
        Requested number of SVD components (reduced automatically when the
        fitted vocabulary is too small).
    nmf_components : int, default 30
        Number of NMF components.
    """

    def __init__(self, max_features=300, svd_components=50, nmf_components=30):
        # Fitted models, keyed by feature name.
        self.tfidf_models = {}
        self.count_models = {}
        self.svd_models = {}
        self.nmf_models = {}
        self.max_features = max_features
        self.svd_components = svd_components
        self.nmf_components = nmf_components

    def clean_text(self, text):
        """Lower-case ``text``, blank out punctuation and mask digit runs.

        Returns ``''`` for missing values. Every run of digits is replaced by
        the literal token ``NUM`` and whitespace is collapsed to single spaces.
        """
        if pd.isna(text):
            return ''
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', 'NUM', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _flatten(self, items):
        """Yield the leaves of an arbitrarily nested list/tuple/set."""
        for item in items:
            if isinstance(item, (list, tuple, set, frozenset)):
                yield from self._flatten(item)
            else:
                yield item

    def process_list(self, text):
        """Split a cell value into a list of cleaned items.

        ``text`` is first parsed as a Python literal (e.g. ``"['a', 'b']"``).
        Collections are flattened and cleaned item by item; a scalar literal is
        treated as a single item instead of being iterated character by
        character. Anything that is not a valid literal is split on commas.

        Returns an empty list for missing or empty input.
        """
        if pd.isna(text) or text == '':
            return []
        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: fall back to a plain comma-separated list.
            return [self.clean_text(item) for item in str(text).split(',')]
        if isinstance(parsed, (list, tuple, set, frozenset, dict)):
            # Iterating a dict yields its keys.
            return [self.clean_text(item) for item in self._flatten(parsed)]
        return [self.clean_text(parsed)]

    def _prepare_documents(self, texts):
        """Build one whitespace-joined document per row; non-strings become ''."""
        return [
            ' '.join(self.process_list(text)) if isinstance(text, str) else ''
            for text in texts
        ]

    def fit_transform_text(self, texts, feature_name):
        """Fit all models for ``feature_name`` on ``texts`` and return the features.

        Returns a dense 2-D array whose columns are, in order: TF-IDF weights,
        raw counts, SVD components and NMF components.
        """
        documents = self._prepare_documents(texts)

        tfidf = TfidfVectorizer(
            max_features=self.max_features,
            ngram_range=(1, 3),
            stop_words='english'
        )
        tfidf_matrix = tfidf.fit_transform(documents)

        counter = CountVectorizer(
            max_features=self.max_features // 2,
            ngram_range=(1, 2),
            stop_words='english'
        )
        count_matrix = counter.fit_transform(documents)

        # TruncatedSVD rejects n_components larger than the number of terms
        # (and some scikit-learn versions require it to be strictly smaller),
        # so shrink the request when the fitted vocabulary is small.
        n_terms = tfidf_matrix.shape[1]
        svd = TruncatedSVD(
            n_components=max(1, min(self.svd_components, n_terms - 1)),
            random_state=42
        )
        svd_matrix = svd.fit_transform(tfidf_matrix)

        nmf = NMF(n_components=self.nmf_components, random_state=42)
        nmf_matrix = nmf.fit_transform(tfidf_matrix)

        # Register the models only once every fit has succeeded, so a failed
        # fit never leaves a half-initialised entry behind.
        self.tfidf_models[feature_name] = tfidf
        self.count_models[feature_name] = counter
        self.svd_models[feature_name] = svd
        self.nmf_models[feature_name] = nmf

        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])

    def transform_text(self, texts, feature_name):
        """Apply the models previously fitted for ``feature_name`` to ``texts``.

        Raises ``KeyError`` if :meth:`fit_transform_text` was never called for
        ``feature_name``. The column layout matches :meth:`fit_transform_text`.
        """
        documents = self._prepare_documents(texts)
        tfidf_matrix = self.tfidf_models[feature_name].transform(documents)
        count_matrix = self.count_models[feature_name].transform(documents)
        svd_matrix = self.svd_models[feature_name].transform(tfidf_matrix)
        nmf_matrix = self.nmf_models[feature_name].transform(tfidf_matrix)
        return np.hstack([
            tfidf_matrix.toarray(),
            count_matrix.toarray(),
            svd_matrix,
            nmf_matrix
        ])


In [None]:
# Hand-assigned per-column weights consumed by
# FeatureEngineer.calculate_weighted_feature to build `composite_feature`.
# Insertion order is highest weight first; the weights add up to 0.76, i.e.
# they are not normalised to 1.
FEATURE_WEIGHTS = dict([
    ('skills_required', 0.12),
    ('locations', 0.10),
    ('experience_requirement', 0.09),
    ('job_position_name', 0.08),
    ('educational_requirements', 0.07),
    ('major_field_of_studies', 0.06),
    ('responsibilities.1', 0.05),
    ('passing_years', 0.05),
    ('career_objective', 0.04),
    ('skills', 0.03),
    ('age_requirement', 0.03),
    ('start_dates', 0.02),
    ('responsibilities', 0.02),
])

In [None]:
class FeatureEngineer:
    """Build the model feature matrix from the raw input table.

    :meth:`transform` combines four groups of features: scaled numeric
    features derived from experience / education columns, dense text features
    from :class:`TextProcessor`, encoded categorical columns and a weighted
    composite feature. Call it once with ``is_train=True`` to fit every
    transformer, then with ``is_train=False`` to apply the fitted state.
    """

    # Year used as the end of employment when no end year can be parsed.
    REFERENCE_YEAR = 2024
    # Columns with at most this many distinct values are one-hot encoded;
    # columns with more are integer (label) encoded.
    ONE_HOT_MAX_CARDINALITY = 50
    # Integer code given at inference time to a category unseen during fitting.
    UNSEEN_LABEL = -1

    TEXT_FEATURES = [
        'skills', 'career_objective', 'responsibilities',
        'educational_institution_name', 'certification_skills',
        'major_field_of_studies'
    ]
    CATEGORICAL_FEATURES = [
        'locations', 'result_types',
        'extra_curricular_activity_types', 'role_positions', 'proficiency_levels'
    ]

    def __init__(self):
        self.text_processor = TextProcessor()
        self.scaler = StandardScaler()
        self.power_transformer = PowerTransformer(method='yeo-johnson', standardize=False)
        self.categorical_encoders = {}
        # State recorded by transform(is_train=True) and reused at inference.
        self.numeric_columns_ = None      # numeric columns kept after dropping constants
        self.numeric_transformer_ = None  # whichever of the two scalers was actually fitted

    def extract_years_experience(self, row):
        """Derive experience statistics from ``start_dates`` / ``end_dates``.

        Every 4-digit number found in the stringified cells is taken as a year.
        Start and end years are paired positionally; when no end year is found
        at all, ``REFERENCE_YEAR`` is used as the single end year.

        Returns a dict with ``total_experience``, ``max_experience`` and
        ``num_positions`` (all zero if the columns cannot be read).

        NOTE(review): pairing is purely positional, so a start year without a
        matching end year (for example an ongoing job) is dropped by ``zip`` --
        confirm this is acceptable for the data at hand.
        """
        empty = {'total_experience': 0, 'max_experience': 0, 'num_positions': 0}
        try:
            start_years = [int(y) for y in re.findall(r'\d{4}', str(row['start_dates']))]
            end_years = [int(y) for y in re.findall(r'\d{4}', str(row['end_dates']))]
        except (KeyError, IndexError, TypeError):
            return empty
        if not end_years:
            end_years = [self.REFERENCE_YEAR]
        experiences = [end - start for start, end in zip(start_years, end_years)]
        return {
            'total_experience': sum(experiences),
            'max_experience': max(experiences) if experiences else 0,
            'num_positions': len(experiences)
        }

    def clean_education_result(self, result_str):
        """Parse an education result (score / percentage) into a number.

        Accepts plain numbers, numeric strings (an optional ``%`` is removed)
        and list literals such as ``"['3.9', '4.0']"``, of which only the first
        entry is used. Returns ``0`` for missing, non-numeric or non-finite
        values.
        """
        try:
            if isinstance(result_str, (list, tuple)):
                result_str = result_str[0] if len(result_str) else None
            if result_str is None or pd.isna(result_str):
                return 0
            # Stringify first so that numeric cells are parsed rather than
            # failing on a missing `.startswith` and being reported as 0.
            text = str(result_str).strip()
            if text.startswith('['):
                text = str(literal_eval(text)[0])
            text = text.upper().strip()
            if text in ['N/A', 'NONE', 'NAN', '']:
                return 0
            value = float(text.replace('%', ''))
            return value if np.isfinite(value) else 0
        except (ValueError, SyntaxError, IndexError, TypeError, KeyError):
            return 0

    def extract_education_features(self, row):
        """Score the education level found in ``degree_names``.

        Levels: 4 = PhD/doctorate, 3 = master (including MBA), 2 = bachelor
        (``bachelor``, ``bsc`` or the stand-alone word ``ba``), 1 =
        diploma/certificate, 0 = anything else.

        Returns a dict with ``education_score``, ``education_result`` (see
        :meth:`clean_education_result`) and ``education_weight``, which is the
        score multiplied by ``result / 100`` when a positive result exists and
        by 1 otherwise.
        """
        try:
            raw_degree = row['degree_names']
            degree = str(raw_degree).lower() if not pd.isna(raw_degree) else ''
            result = self.clean_education_result(row['educational_results'])
        except (KeyError, IndexError, TypeError, ValueError):
            return {
                'education_score': 0,
                'education_result': 0,
                'education_weight': 0
            }

        edu_score = 0
        if 'phd' in degree or 'doctorate' in degree:
            edu_score = 4
        elif 'master' in degree or re.search(r'\bmba\b', degree):
            edu_score = 3
        # 'ba' must be matched as a whole word: as a bare substring it also
        # fires on words such as "banking" or "mba".
        elif 'bachelor' in degree or 'bsc' in degree or re.search(r'\bba\b', degree):
            edu_score = 2
        elif 'diploma' in degree or 'certificate' in degree:
            edu_score = 1
        return {
            'education_score': edu_score,
            'education_result': result,
            'education_weight': edu_score * (result / 100 if result > 0 else 1)
        }

    def calculate_weighted_feature(self, df):
        """Return a copy of ``df`` with a ``composite_feature`` column added.

        The composite is the sum over the FEATURE_WEIGHTS columns present in
        ``df`` of ``numeric value * weight``. Values that cannot be converted
        to a number contribute 0. The input frame itself is left untouched.
        """
        composite = np.zeros(len(df), dtype=float)
        for feature, weight in FEATURE_WEIGHTS.items():
            if feature in df.columns:
                numeric_feature = pd.to_numeric(df[feature], errors='coerce').fillna(0)
                composite += numeric_feature.to_numpy(dtype=float) * weight
        return df.assign(composite_feature=composite)

    @staticmethod
    def _make_one_hot_encoder():
        """Create a dense one-hot encoder on both old and new scikit-learn."""
        try:
            # scikit-learn >= 1.2 spells the option `sparse_output`.
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    @staticmethod
    def _one_hot_frame(encoded, encoder, col, index):
        """Wrap a one-hot matrix in a DataFrame with ``<col>_<category>`` names."""
        return pd.DataFrame(
            encoded,
            columns=[f"{col}_{cat}" for cat in encoder.categories_[0]],
            index=index
        )

    def encode_categorical_features(self, df, categorical_columns, is_train=True):
        """Encode categorical columns using One-Hot or Label Encoding.

        With ``is_train=True`` an encoder is fitted per column (one-hot for up
        to ``ONE_HOT_MAX_CARDINALITY`` distinct values, integer labels
        otherwise) and stored in ``self.categorical_encoders``. With
        ``is_train=False`` the stored encoders are applied: unknown categories
        become an all-zero one-hot row or ``UNSEEN_LABEL`` respectively. A
        column without a stored encoder yields a constant-zero column and a
        printed warning.

        Missing values are encoded as the category ``'Unknown'``.

        Returns a DataFrame indexed like ``df`` (empty if no columns given).
        Raises ``ValueError`` if a stored encoder has an unsupported type.
        """
        encoded_dfs = []
        for col in categorical_columns:
            values = df[col].fillna('Unknown')
            if is_train:
                if df[col].nunique() <= self.ONE_HOT_MAX_CARDINALITY:
                    encoder = self._make_one_hot_encoder()
                    encoded = encoder.fit_transform(values.to_frame())
                    df_encoded = self._one_hot_frame(encoded, encoder, col, df.index)
                else:
                    encoder = LabelEncoder()
                    encoded = encoder.fit_transform(values)
                    df_encoded = pd.DataFrame(
                        {f"{col}_label": encoded}, index=df.index
                    )
                self.categorical_encoders[col] = encoder
            else:
                encoder = self.categorical_encoders.get(col)
                if encoder is None:
                    # Handle cases where the encoder is not available
                    print(f"Warning: No encoder found for column '{col}'. Using default encoding.")
                    df_encoded = pd.DataFrame(
                        {f"{col}_label": [0] * len(df)}, index=df.index
                    )
                elif isinstance(encoder, OneHotEncoder):
                    encoded = encoder.transform(values.to_frame())
                    df_encoded = self._one_hot_frame(encoded, encoder, col, df.index)
                elif isinstance(encoder, LabelEncoder):
                    # LabelEncoder.transform raises on unseen labels, so map
                    # through the fitted classes and flag unknowns explicitly.
                    codes = {label: code for code, label in enumerate(encoder.classes_)}
                    encoded = values.map(codes).fillna(self.UNSEEN_LABEL).astype(int)
                    df_encoded = pd.DataFrame(
                        {f"{col}_label": encoded.to_numpy()}, index=df.index
                    )
                else:
                    raise ValueError(f"Unsupported encoder type for column '{col}'")

            encoded_dfs.append(df_encoded)

        if encoded_dfs:
            return pd.concat(encoded_dfs, axis=1)
        # No columns were requested: return an empty, correctly indexed frame.
        return pd.DataFrame(index=df.index)

    def sanitize_feature_names(self, df):
        """Clean feature names to remove special characters unsupported by LightGBM.

        Every character other than word characters, ``.`` and ``-`` becomes
        ``_``. Names that collide after cleaning get a numeric suffix
        (``name_1``, ``name_2``, ...) so the result has unique column names.
        The columns of ``df`` are renamed in place and ``df`` is returned.
        """
        taken = set()
        clean_names = []
        for col in df.columns:
            base = re.sub(r'[^\w\.-]', '_', str(col)).replace('__', '_')
            name = base
            suffix = 0
            while name in taken:
                suffix += 1
                name = f"{base}_{suffix}"
            taken.add(name)
            clean_names.append(name)
        df.columns = clean_names
        return df

    @staticmethod
    def _count_listed_items(series):
        """Count comma-separated items per cell; missing or empty cells count 0."""
        text = series.fillna('').astype(str).str.strip()
        return (text.str.count(',') + 1).where(text != '', 0)

    def transform(self, df, is_train=True):
        """Turn the raw frame ``df`` into the final feature DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw rows; ``df`` itself is not modified.
        is_train : bool, default True
            ``True`` fits all transformers and remembers which numeric columns
            were kept; ``False`` re-applies that fitted state so the output
            columns match the training output.

        Raises
        ------
        RuntimeError
            If called with ``is_train=False`` before any ``is_train=True`` call.
        """
        # Numeric features from experience and education
        exp_records = [self.extract_years_experience(row) for _, row in df.iterrows()]
        edu_records = [self.extract_education_features(row) for _, row in df.iterrows()]

        feature_dict = {}
        for feat in ['total_experience', 'max_experience', 'num_positions']:
            feature_dict[feat] = [rec[feat] for rec in exp_records]
        for feat in ['education_score', 'education_result', 'education_weight']:
            feature_dict[feat] = [rec[feat] for rec in edu_records]

        # Basic numeric count features
        feature_dict['num_skills'] = self._count_listed_items(df['skills']).to_numpy()
        feature_dict['has_certification'] = (~df['certification_skills'].isna()).astype(int).to_numpy()
        feature_dict['num_languages'] = self._count_listed_items(df['languages']).to_numpy()

        # Interaction features; the +0.1 avoids division by zero positions.
        feature_dict['experience_per_position'] = np.array(feature_dict['total_experience']) / (
            np.array(feature_dict['num_positions']) + 0.1
        )
        feature_dict['result_x_edu_score'] = (
            np.array(feature_dict['education_result']) * np.array(feature_dict['education_score'])
        )

        numeric_df = pd.DataFrame(feature_dict, index=df.index)

        # Decide which columns to keep on the training frame only, so the test
        # frame always has exactly the columns the scaler was fitted on.
        if is_train:
            constant_cols = [c for c in numeric_df.columns if numeric_df[c].nunique() <= 1]
            if constant_cols:
                print(f"Dropping constant columns: {constant_cols}")
            self.numeric_columns_ = [c for c in numeric_df.columns if c not in constant_cols]
        elif self.numeric_columns_ is None or self.numeric_transformer_ is None:
            raise RuntimeError("transform(is_train=True) must be called before is_train=False")
        numeric_df = numeric_df[self.numeric_columns_]

        # Power transform or scale numeric features
        if is_train:
            try:
                numeric_arr = self.power_transformer.fit_transform(numeric_df)
                self.numeric_transformer_ = self.power_transformer
            except Exception as e:
                print(f"PowerTransformer failed: {e}. Falling back to StandardScaler.")
                numeric_arr = self.scaler.fit_transform(numeric_df)
                self.numeric_transformer_ = self.scaler
        else:
            # Use whichever transformer was fitted on the training frame.
            numeric_arr = self.numeric_transformer_.transform(numeric_df)

        numeric_df = pd.DataFrame(numeric_arr, columns=numeric_df.columns, index=numeric_df.index)

        # Text features: one dense block per column, named <column>_text_<i>.
        text_blocks = []
        for feature in self.TEXT_FEATURES:
            if is_train:
                text_matrix = self.text_processor.fit_transform_text(df[feature], feature)
            else:
                text_matrix = self.text_processor.transform_text(df[feature], feature)
            block_names = [f'{feature}_text_{i}' for i in range(text_matrix.shape[1])]
            text_blocks.append(pd.DataFrame(text_matrix, columns=block_names, index=df.index))
        text_feature_df = pd.concat(text_blocks, axis=1)

        # Handle categorical features
        categorical_df = self.encode_categorical_features(df, self.CATEGORICAL_FEATURES, is_train)

        # Weighted composite feature, added unscaled after the numeric scaling.
        weighted = self.calculate_weighted_feature(df)
        numeric_df['composite_feature'] = weighted['composite_feature'].to_numpy()

        # Combine all features and make the names safe for LightGBM.
        combined_features = pd.concat([numeric_df, text_feature_df, categorical_df], axis=1)
        return self.sanitize_feature_names(combined_features)


In [None]:
    # Load the competition train and test tables from the Kaggle input directory.
    train_df, test_df = (
        pd.read_csv(csv_path)
        for csv_path in (
            '/kaggle/input/bitfest-datathon-2025/train.csv',
            '/kaggle/input/bitfest-datathon-2025/test.csv',
        )
    )

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import optuna
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

class OptimizedLightGBM:
    """Optuna-driven hyper-parameter search for a LightGBM L2 regressor.

    Parameters
    ----------
    n_trials : int, default 50
        Number of Optuna trials to run.
    device : str, default 'gpu'
        Value passed to LightGBM's ``device`` parameter.
    n_splits : int, default 5
        Number of cross-validation folds used to score one trial.
    random_state : int, default 42
        Seed for the fold shuffling and for the Optuna sampler, so repeated
        searches propose the same sequence of trials.
    """

    def __init__(self, n_trials=50, device='gpu', n_splits=5, random_state=42):
        self.n_trials = n_trials
        self.device = device
        self.n_splits = n_splits
        self.random_state = random_state
        self.best_params = None

    def _suggest_params(self, trial):
        """Build one LightGBM parameter dict from an Optuna ``trial``."""
        param = {
            'objective': 'regression_l2',
            'metric': 'l2',
            'boosting_type': 'gbdt',
            'device': self.device,
            'verbose': -1,

            'num_leaves': trial.suggest_int('num_leaves', 31, 128),
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            # `min_data_in_leaf` is an alias of `min_child_samples` in
            # LightGBM, so only one of the two is tuned.
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 80),
            'max_bin': trial.suggest_int('max_bin', 63, 255),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
        }
        if self.device == 'gpu':
            param.update({'gpu_platform_id': 0, 'gpu_device_id': 0})
        return param

    def objective(self, trial, X, y):
        """Return the mean cross-validated MSE of one trial.

        ``X`` must support ``.iloc`` row selection; ``y`` is converted to a
        NumPy array so fold indices are always applied positionally. Each fold
        trains for up to 1000 rounds with early stopping after 50 rounds
        without improvement. If LightGBM rejects the parameter combination the
        trial is scored ``inf``.
        """
        param = self._suggest_params(trial)
        targets = np.asarray(y)

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        scores = []

        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = targets[train_idx], targets[val_idx]

            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val)

            try:
                model = lgb.train(
                    param,
                    train_data,
                    valid_sets=[val_data],
                    num_boost_round=1000,
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=50),
                        lgb.log_evaluation(period=0)
                    ]
                )
            except lgb.basic.LightGBMError:
                # Invalid parameter combination: make the trial maximally bad.
                return float('inf')

            preds = model.predict(X_val)
            scores.append(mean_squared_error(y_val, preds))

        return np.mean(scores)

    def find_best_params(self, X, y):
        """Run the study, store and return the best tuned parameters.

        The returned dict contains only the tuned values; fixed settings such
        as ``objective`` and ``device`` are not included.
        """
        sampler = optuna.samplers.TPESampler(seed=self.random_state)
        study = optuna.create_study(direction='minimize', sampler=sampler)
        study.optimize(lambda trial: self.objective(trial, X, y),
                       n_trials=self.n_trials)

        self.best_params = study.best_params
        print("\nOptimization Results:")
        print("Best MSE:", study.best_value)
        print("Best parameters:", self.best_params)
        return self.best_params

In [None]:
def main():
    """Run the end-to-end pipeline: features, K-fold LightGBM, submission.

    Reads the module-level ``train_df`` and ``test_df`` frames, which must have
    been loaded before this function is called. Writes ``submission.csv`` and
    ``feature_importance.csv`` to the working directory.

    Returns
    -------
    tuple
        ``(submission, feature_importance_df)`` where ``submission`` holds the
        fold-averaged test predictions and ``feature_importance_df`` holds one
        row per feature and fold.
    """
    # Feature Engineering
    fe = FeatureEngineer()
    print("Transforming train data...")
    train_features = fe.transform(train_df, is_train=True)
    print("Transforming test data...")
    test_features = fe.transform(test_df, is_train=False)

    # The model is trained on the train columns, so the test matrix must carry
    # exactly those columns in the same order.
    if list(test_features.columns) != list(train_features.columns):
        print("Warning: test features differ from train features; aligning to train columns.")
        test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

    y = train_df['matched_score'].values

    # Fixed hyper-parameters. To re-tune them, run
    # OptimizedLightGBM(...).find_best_params(train_features, y) and merge the
    # result into this dict.
    # `gpu_use_dp` and `tree_learner='data_parallel'` were dropped: the first
    # only selects double precision for GPU training and the second targets
    # multi-machine training, so neither enables multi-GPU use and neither has
    # an effect for this single-machine run without `device='gpu'`.
    params = {
        'objective': 'regression_l2',
        'metric': 'l2',
        'num_leaves': 96,
        'max_depth': 12,
        'learning_rate': 0.03199484792860085,
        'feature_fraction': 0.6706304075294632,
        'bagging_fraction': 0.958672143301124,
        'bagging_freq': 3,
        'min_child_samples': 65,
        'lambda_l1': 0.014964605006024168,
        'lambda_l2': 4.20761346366579e-08,
        'max_bin': 178,
        'verbose': -1
    }

    # Cross-validation and model training
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train_features))
    test_predictions = np.zeros(len(test_features))
    fold_importances = []
    fold_scores = []

    print("\nTraining final model with best parameters...")
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_features), start=1):
        print(f"\nFold {fold}/{n_splits}")
        X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, val_data],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(100)
            ]
        )

        val_preds = model.predict(X_val)
        oof_predictions[val_idx] = val_preds
        # Average the test predictions of the fold models.
        test_predictions += model.predict(test_features) / n_splits

        fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(fold_rmse)
        print(f"Fold {fold} RMSE: {fold_rmse:.6f}")

        fold_importances.append(pd.DataFrame({
            "feature": train_features.columns,
            "importance": model.feature_importance(),
            "fold": fold
        }))

    # Concatenate once instead of growing a frame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    # Calculate final metrics on the out-of-fold predictions
    final_rmse = np.sqrt(mean_squared_error(y, oof_predictions))

    print("\nFinal Model Performance:")
    print(f"Average RMSE: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
    print(f"Overall RMSE: {final_rmse:.6f}")

    # Save predictions
    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission saved to submission.csv")

    # Save feature importance averaged over folds
    feature_importance = (feature_importance_df.groupby('feature')['importance']
                          .mean()
                          .sort_values(ascending=False))
    feature_importance.to_csv('feature_importance.csv')

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    return submission, feature_importance_df

from sklearn.model_selection import KFold  # Add this import
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import lightgbm as lgbm

from sklearn.preprocessing import LabelEncoder

# NOTE: the imports above repeat names already imported in earlier cells; the `lgbm` alias appears unused in this notebook.

# main() is deliberately not invoked in this cell: the notebook's final cell
# already calls it, and running it here as well would repeat the feature
# engineering and every training fold and rewrite the same output files.

In [None]:
# Show the column index of the train frame, then of the test frame.
for frame in (train_df, test_df):
    print(frame.columns)


In [None]:
# Entry point: run the full pipeline defined in main() (feature engineering,
# K-fold LightGBM training, and writing submission.csv / feature_importance.csv).
if __name__ == "__main__":
    main()