In [4]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
from sentence_transformers import SentenceTransformer
from ast import literal_eval
import re
from datetime import datetime

In [6]:
# Print the installed LightGBM version so the environment is recorded next to the results.
print(lgbm.__version__)

4.5.0


In [17]:
class TextProcessor:
    """Text utilities: sentence embeddings and parsing of list-like string cells."""

    def __init__(self):
        # Pretrained sentence-embedding model used by get_embeddings().
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def process_list(self, text):
        """Parse a cell that encodes a list of items into a Python list.

        Accepted inputs:
          * an actual list/tuple/set (returned as a list),
          * a missing value (None / NaN) or blank string -> ``[]``,
          * a Python-literal string such as ``"['a', 'b']"`` -> ``['a', 'b']``,
          * any other string -> split on commas, with surrounding whitespace
            stripped and empty tokens dropped.

        Args:
            text: Raw cell value.

        Returns:
            list: The parsed items. The result is always a list, so callers can
            safely build a ``set`` from it.
        """
        # Already a collection: nothing to parse.
        if isinstance(text, (list, tuple, set, frozenset)):
            return list(text)

        if not isinstance(text, str):
            if pd.isna(text):
                return []
            # Non-string scalars (e.g. numbers) are parsed from their text form.
            text = str(text)

        text = text.strip()
        if not text:
            return []

        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat it as a comma-separated list.
            return [part.strip() for part in text.split(',') if part.strip()]

        if parsed is None:
            return []
        if isinstance(parsed, (list, tuple, set, frozenset, dict)):
            return list(parsed)
        # A scalar literal (e.g. "42" or "'Python'") is a one-item list.
        return [parsed]

    def get_embeddings(self, texts):
        """Encode an iterable of texts with the sentence-transformer model.

        Missing values are replaced by the empty string and every other value
        is converted with ``str`` before encoding.

        Args:
            texts: Iterable of raw text values.

        Returns:
            Whatever ``self.model.encode`` returns for the cleaned list of strings.
        """
        texts = [str(t) if not pd.isna(t) else '' for t in texts]
        return self.model.encode(texts)

class FeatureEngineer:
    """Turns the raw input columns into model features.

    The instance is stateful: the first call to ``transform`` fits one
    ``LabelEncoder`` per categorical column, and later calls reuse those
    encoders. Call it on the training frame first, then on the test frame.
    """

    # Code given to categories that were not present when the encoder was fitted.
    UNSEEN_CATEGORY_CODE = -1

    def __init__(self):
        self.text_processor = TextProcessor()
        # Maps column name -> fitted LabelEncoder.
        self.label_encoders = {}

    def extract_year(self, date_str):
        """Return the first 4-digit run found in ``date_str`` as an int.

        The value is stringified first, so missing values (None, NaN) and
        list-like inputs are handled without raising.

        Args:
            date_str: Raw date cell (any type).

        Returns:
            int or None: The first 4-digit number, or None when there is none.
        """
        # str(None) / str(nan) contain no 4-digit run, so they fall through to None.
        match = re.search(r'\d{4}', str(date_str))
        return int(match.group()) if match else None

    def process_dates(self, df):
        """Add ``experience_years`` = year(end_dates) - year(start_dates).

        Rows where either year cannot be extracted get 0. The frame is
        modified in place and also returned.

        Args:
            df: DataFrame with ``start_dates`` and ``end_dates`` columns.

        Returns:
            pandas.DataFrame: ``df`` with the new column.
        """
        durations = []
        for start_raw, end_raw in zip(df['start_dates'], df['end_dates']):
            # Extract each year once per row.
            start_year = self.extract_year(start_raw)
            end_year = self.extract_year(end_raw)
            if start_year is None or end_year is None:
                durations.append(0)
            else:
                durations.append(end_year - start_year)
        df['experience_years'] = durations
        return df

    def process_categorical(self, df, col):
        """Label-encode ``col`` into a new ``{col}_encoded`` column.

        Missing values are replaced by the string ``'MISSING'``. On the first
        call for a column the encoder is fitted; on later calls the stored
        encoder is reused and values it has never seen receive
        ``UNSEEN_CATEGORY_CODE`` instead of raising ``ValueError``.

        Args:
            df: DataFrame containing ``col`` (modified in place).
            col: Name of the categorical column.

        Returns:
            pandas.DataFrame: ``df`` with the encoded column added.
        """
        values = df[col].fillna('MISSING')
        encoder = self.label_encoders.get(col)

        if encoder is None:
            encoder = LabelEncoder()
            df[f'{col}_encoded'] = encoder.fit_transform(values)
            # Register only after a successful fit so a failed fit cannot leave
            # an unfitted encoder behind.
            self.label_encoders[col] = encoder
        else:
            known = values.isin(encoder.classes_).to_numpy()
            codes = np.full(len(values), self.UNSEEN_CATEGORY_CODE, dtype=np.int64)
            if known.any():
                codes[known] = encoder.transform(values[known])
            df[f'{col}_encoded'] = codes
        return df

    @staticmethod
    def _normalize_skills(items):
        """Return the set of stripped, case-folded, non-empty skill tokens."""
        if items is None:
            return set()
        # A bare string or scalar is a single skill, not an iterable of characters.
        if isinstance(items, str) or not hasattr(items, '__iter__'):
            items = [items]
        tokens = set()
        for item in items:
            if item is None or (isinstance(item, float) and item != item):
                continue
            token = str(item).strip().casefold()
            if token:
                tokens.add(token)
        return tokens

    @staticmethod
    def _match_ratio(required, candidate):
        """Fraction of required skills present in the candidate's skills (0.0 if none required)."""
        required_set = FeatureEngineer._normalize_skills(required)
        if not required_set:
            return 0.0
        candidate_set = FeatureEngineer._normalize_skills(candidate)
        return len(required_set & candidate_set) / len(required_set)

    def transform(self, df):
        """Build all features and return them on a copy of ``df``.

        Adds ``experience_years``, ``*_encoded`` columns, one
        ``{feature}_emb_{i}`` column per embedding dimension of each text
        feature, and ``skills_match_ratio``. The caller's frame is left
        untouched.

        Args:
            df: Raw input DataFrame.

        Returns:
            pandas.DataFrame: A new frame with the original and engineered columns.
        """
        # Work on a copy so the caller's frame is not partially mutated.
        df = df.copy()

        # Process dates
        df = self.process_dates(df)

        # Process categorical
        for col in ['degree_names', 'result_types', 'major_field_of_studies']:
            df = self.process_categorical(df, col)

        # Get embeddings for text features
        text_features = ['skills', 'career_objective', 'responsibilities']
        embedding_cols = {}
        for feature in text_features:
            embeddings = self.text_processor.get_embeddings(df[feature])
            for i in range(embeddings.shape[1]):
                embedding_cols[f'{feature}_emb_{i}'] = embeddings[:, i]

        # Concatenate all embeddings at once to avoid fragmenting the frame.
        embedding_df = pd.DataFrame(embedding_cols, index=df.index)
        df = pd.concat([df, embedding_df], axis=1)

        # Skills matching score
        df['skills_required'] = df['skills_required'].fillna('')
        df['skills'] = df['skills'].fillna('')
        required_skills = df['skills_required'].apply(self.text_processor.process_list)
        candidate_skills = df['skills'].apply(self.text_processor.process_list)

        df['skills_match_ratio'] = [
            self._match_ratio(req, cand)
            for req, cand in zip(required_skills, candidate_skills)
        ]

        return df

def train_model(
    train_path='/kaggle/input/bitfest-datathon-2025/train.csv',
    test_path='/kaggle/input/bitfest-datathon-2025/test.csv',
    submission_path='submission.csv',
    n_splits=5,
    random_state=42,
):
    """Train a K-fold LightGBM regressor and write the averaged test predictions.

    Steps: read the train/test CSVs, build features with ``FeatureEngineer``
    (fitted on train, reused on test), train one model per fold with early
    stopping, print the mean/std of the fold MSEs, and save a submission CSV
    with columns ``ID`` and ``matched_score``.

    Args:
        train_path: CSV with the training rows, including ``matched_score``.
        test_path: CSV with the test rows, including ``ID``.
        submission_path: Where the submission CSV is written.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the shuffled K-fold split.

    Returns:
        None
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    fe = FeatureEngineer()

    print("Transforming train data...")
    train_df = fe.transform(train_df)
    print("Transforming test data...")
    test_df = fe.transform(test_df)

    # Select features
    emb_cols = [col for col in train_df.columns if 'emb_' in col]
    cat_cols = [col for col in train_df.columns if 'encoded' in col]
    num_cols = ['experience_years', 'skills_match_ratio']
    feature_cols = emb_cols + cat_cols + num_cols

    # Slice the feature matrices once; they do not change between folds.
    # Selecting the test columns here also fails fast if one is missing.
    X_all = train_df[feature_cols]
    y_all = train_df['matched_score']
    X_test = test_df[feature_cols]

    params = {
        'objective': 'regression_l2',
        'metric': 'l2',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'max_depth': 8,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1
    }

    # Cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = []
    test_preds = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f"Training fold {fold + 1}")
        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

        train_data = lgbm.Dataset(X_train, label=y_train)
        # Bind the validation set to the training set so both share the same binning.
        val_data = lgbm.Dataset(X_val, label=y_val, reference=train_data)

        # LightGBM 4.x configures early stopping and logging through callbacks.
        model = lgbm.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[train_data, val_data],
            callbacks=[
                lgbm.early_stopping(stopping_rounds=50),
                lgbm.log_evaluation(100)
            ]
        )

        val_preds = model.predict(X_val, num_iteration=model.best_iteration)
        cv_scores.append(mean_squared_error(y_val, val_preds))

        # Average the per-fold test predictions.
        test_preds += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    print(f"CV MSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

    submission = pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_preds
    })
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

In [18]:
# Entry point: run the full pipeline (feature engineering, cross-validated training,
# and writing the submission file) defined in train_model().
if __name__ == "__main__":
    train_model()



Transforming train data...


Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Transforming test data...


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187170
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 1156
[LightGBM] [Info] Start training from score 0.658422
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 0.00562275	valid_1's l2: 0.00982499
[200]	training's l2: 0.00365696	valid_1's l2: 0.00893094
[300]	training's l2: 0.00259898	valid_1's l2: 0.0086548
[400]	training's l2: 0.00187446	valid_1's l2: 0.00853791
[500]	training's l2: 0.00139779	valid_1's l2: 0.00845645
[600]	training's l2: 0.0010443	valid_1's l2: 0.00837287
[700]	training's l2: 0.000783498	valid_1's l2: 0.00832066
Early stopping, best iteration is:
[728]	training's l2: 0.000726093	valid_1's l2: 0.00831204
Training fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 