In [9]:
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import mean_squared_error

In [10]:
def MCRMSE(y_trues, y_preds):
    """
    Mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed separately for every column of ``y_trues`` and the
    per-column values are then averaged.

    Parameters:
    - y_trues: 2-D array-like of ground-truth values, one column per target.
    - y_preds: 2-D array-like of predictions, laid out like ``y_trues``.

    Returns:
    - mcrmse_score: mean of the per-column RMSE values.
    - scores: list of per-column RMSE values (plain floats), in column order.

    Raises:
    - ValueError: if a column pair differs in length, is empty, or contains
      NaN / infinite values.
    """
    # RMSE is computed with NumPy rather than
    # ``mean_squared_error(..., squared=False)``: the ``squared`` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises
    # a TypeError on current releases.
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)

    scores = []
    for i in range(y_trues.shape[1]):
        y_true = y_trues[:, i]
        y_pred = y_preds[:, i]
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"Column {i}: y_trues has {y_true.shape[0]} rows but y_preds has {y_pred.shape[0]}."
            )
        if y_true.size == 0:
            raise ValueError("At least one sample is required to compute RMSE.")
        # Fail loudly instead of letting NaN/inf leak into the reported score.
        if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
            raise ValueError(f"Column {i} contains NaN or infinite values.")
        scores.append(float(np.sqrt(np.mean((y_true - y_pred) ** 2))))  # RMSE
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores



def MCRMSE_SINGLE(y_trues, y_preds):
    """
    RMSE between ``y_trues`` and ``y_preds`` without a per-column breakdown.

    1-D inputs are treated as a single column.  For multi-column inputs the
    RMSE of each column is computed and the columns are averaged uniformly,
    which is what ``mean_squared_error(..., squared=False)`` returned for the
    runs logged in this notebook (the value matches the MCRMSE score).

    Parameters:
    - y_trues: 1-D or 2-D array-like of ground-truth values.
    - y_preds: array-like of predictions with the same layout.

    Returns:
    - mcrmse_score: the (column-averaged) RMSE as a float.
    - scores: always an empty list; kept so the return value has the same
      shape as the one from ``MCRMSE``.

    Raises:
    - ValueError: if the shapes differ, there are no samples, or the inputs
      contain NaN / infinite values.
    """
    # Computed with NumPy because the ``squared`` argument of
    # ``mean_squared_error`` was deprecated in scikit-learn 1.4 and removed in 1.6.
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    # Accept a flat vector on either side by viewing it as one column.
    if y_trues.ndim == 1:
        y_trues = y_trues.reshape(-1, 1)
    if y_preds.ndim == 1:
        y_preds = y_preds.reshape(-1, 1)
    if y_trues.shape != y_preds.shape:
        raise ValueError(
            f"Shape mismatch: y_trues {y_trues.shape} vs y_preds {y_preds.shape}."
        )
    if y_trues.shape[0] == 0:
        raise ValueError("At least one sample is required to compute RMSE.")
    if not (np.isfinite(y_trues).all() and np.isfinite(y_preds).all()):
        raise ValueError("Input contains NaN or infinite values.")

    scores = []
    column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))  # RMSE per column
    mcrmse_score = float(np.mean(column_rmse))
    return mcrmse_score, scores



def get_score(y_trues, y_preds, single=False):
    """
    Score predictions with either of the two metric helpers.

    Parameters:
    - y_trues: ground-truth values, forwarded unchanged to the scorer.
    - y_preds: predictions, forwarded unchanged to the scorer.
    - single: when True use ``MCRMSE_SINGLE``; otherwise use ``MCRMSE``.

    Returns:
    - mcrmse_score: the overall score reported by the chosen scorer.
    - scores: the second value returned by the chosen scorer.
    """
    scorer = MCRMSE_SINGLE if single else MCRMSE
    mcrmse_score, scores = scorer(y_trues, y_preds)
    return mcrmse_score, scores

In [16]:
# Ground-truth table; its columns become the `*_gt` columns when it is merged
# onto each OOF file by student_id in a later cell.
train_csv = "../data/raw/summaries_train.csv"
 

# Out-of-fold prediction files, one per model.  Their order defines the
# "model{N}" labels printed later and the Optuna weight names w1, w2, ...
csvs = [
    "../data/oofs/model236.csv",
]


train_df = pd.read_csv(train_csv)

In [17]:
# train_df

In [18]:
import numpy as np

def apply_smoothing(predictions, alpha=0.5):
    """
    Apply post-processing smoothing to a set of predictions.

    Every column is shrunk towards its own mean:
    ``smoothed = (1 - alpha) * value + alpha * column_mean``.

    Parameters:
    - predictions: 2-D array-like (samples x columns) with the model's raw predictions.
    - alpha: Smoothing parameter (0 <= alpha <= 1). Higher values increase smoothing;
      0 returns the values unchanged, 1 replaces each column by its mean.

    Returns:
    - smoothed_predictions: NumPy array with smoothed predictions.  Floating
      (or complex) inputs keep their dtype; any other dtype is promoted to float64.
    """
    predictions = np.asarray(predictions)
    _, num_columns = predictions.shape

    # The result is generally fractional, so it must not inherit an integer
    # dtype from the input (``np.zeros_like`` would silently truncate values).
    if np.issubdtype(predictions.dtype, np.inexact):
        out_dtype = predictions.dtype
    else:
        out_dtype = np.float64
    smoothed_predictions = np.empty(predictions.shape, dtype=out_dtype)

    # Apply smoothing to each column (prediction target) independently
    for col in range(num_columns):
        column_predictions = predictions[:, col]
        smoothed_predictions[:, col] = column_predictions * (1 - alpha) + alpha * column_predictions.mean()

    return smoothed_predictions



In [23]:
# One merged frame per entry of `csvs`: smoothed predictions (`*_pred`) next
# to the ground truth (`*_gt`), restricted to folds 0-3.
oofs = []

for csv in csvs:
    df = pd.read_csv(csv)
    # Light shrinkage of both prediction columns towards their column means.
    # Note the means are taken over every row of the file, i.e. before the
    # student_id filter below.
    df[['content', 'wording']] = apply_smoothing(df[['content', 'wording']].values, alpha=0.025)

    # Keep only predictions for students present in the ground-truth table.
    df = df.loc[df.student_id.isin(train_df.student_id.values)].reset_index(drop=True)    
    
    # Columns present in both frames (e.g. content / wording) are disambiguated
    # as `<name>_pred` (from the OOF file) and `<name>_gt` (from train_df).
    merged_df = df.merge(
        train_df,
        left_on='student_id',
        right_on='student_id',
        how='outer',
        suffixes=('_pred','_gt')
    ) 
    
    # NOTE(review): presumably `fold` exists only in the OOF file, so rows that
    # the outer merge adds from train_df alone have a NaN fold and are dropped
    # here — confirm against the CSV schemas.
    merged_df = merged_df.loc[merged_df.fold.isin([0,1,2,3])]    
    oofs.append(merged_df)



In [24]:
# Lowest MCRMSE seen so far for each fold, and the 1-based index of the model
# that produced it.  The running minimum starts at +inf (not 1) so that the
# first real score always replaces it, even if that score is >= 1.
best_f0, best_f1, best_f2, best_f3 = (float("inf"),) * 4
best_f0_exp, best_f1_exp, best_f2_exp, best_f3_exp = 1, 1, 1, 1


for index, oof in enumerate(oofs):
    # Ground truth and predictions over all folds of this model.  These names
    # stay bound after the loop (to the last OOF frame) and are read by
    # reduce_rmse / reduce_rmse_content / reduce_rmse_wording in later cells.
    true_labels_content = oof[['content_gt']].values
    true_labels_wording = oof[['wording_gt']].values
    
    true_labels_oof = oof[['content_gt', 'wording_gt']].values
    predictions_oof = oof[['content_pred', 'wording_pred']].values 
    
    # Per-fold score, keeping track of the best model for every fold.
    for fold in [0,1,2,3]:
        oof1 = oof.loc[oof.fold == fold]
        true_labels = oof1[['content_gt', 'wording_gt']].values
        predictions = oof1[['content_pred', 'wording_pred']].values    
        score, scores = get_score(true_labels, predictions)
        print(f"model{index+1}, fold {fold}", score, scores)
        if fold == 0 and score < best_f0:
            best_f0 = score
            best_f0_exp = index + 1
        elif fold == 1 and score < best_f1:
            best_f1 = score
            best_f1_exp = index + 1
        elif fold == 2 and score < best_f2:
            best_f2 = score
            best_f2_exp = index + 1
        elif fold == 3 and score < best_f3:
            best_f3 = score
            best_f3_exp = index + 1
            
            
    # Overall out-of-fold score of this model across all folds.
    score, scores = get_score(true_labels_oof, predictions_oof)
    print(csvs[index].split("/")[-1], f"model{index+1}", score, scores)
    print("*"*50)
    
    
print(best_f0, f"model{best_f0_exp}")
print(best_f1, f"model{best_f1_exp}")
print(best_f2, f"model{best_f2_exp}")
print(best_f3, f"model{best_f3_exp}") # 0.47563110509395745

model1, fold 0 0.4419974461268222 [0.3826441421407404, 0.501350750112904]
model1, fold 1 0.5057045284198259 [0.4337077574708537, 0.5777012993687981]
model1, fold 2 0.43217760493884116 [0.3944587569425775, 0.4698964529351048]
model1, fold 3 0.5432992682438758 [0.46101865376780243, 0.625579882719949]
model236.csv model1 0.4747572360399296 [0.4133297543496843, 0.5361847177301748]
**************************************************
0.4419974461268222 model1
0.5057045284198259 model1
0.43217760493884116 model1
0.5432992682438758 model1


### Optuna Weight Tuning

In [25]:
def reduce_rmse(params):
    """
    Score the weighted average of the OOF predictions for one set of weights.

    Parameters:
    - params: dict of weights; the k-th entry (in insertion order) weights
      ``oofs[k]``.

    Returns:
    - score: value returned by ``get_score(..., single=True)`` for the
      normalised blend of the ``content_pred`` / ``wording_pred`` columns
      against the global ``true_labels_oof``.
    """
    for position, weight in enumerate(params.values()):
        weighted = weight * oofs[position][['content_pred', 'wording_pred']].values
        if position:
            preds += weighted
        else:
            # First model starts the accumulator.
            preds = weighted

    # Normalise so the weights effectively sum to one.
    preds = preds / sum(params.values())

    blend_score, _ = get_score(true_labels_oof, preds, single=True)
    return blend_score



def objective(trial):
    """
    Optuna objective for the joint (content + wording) blend.

    Suggests one weight in [0, 1] per entry of ``oofs`` (named w1, w2, ...)
    and returns the blend score computed by ``reduce_rmse``.
    """
    params = {
        f"w{i+1}": trial.suggest_float(f'w{i+1}', 0, 1)
        for i in range(len(oofs))
    }
    return reduce_rmse(params)


# Search the blend weights that minimise the joint score.
# With a single entry in `csvs` the weight cancels out in reduce_rmse
# (w * p / w == p), so every trial reports the same value.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500)
best_params = study.best_params

[I 2023-10-12 12:01:49,933] A new study created in memory with name: no-name-ef521d64-6b0c-4fe6-a3be-d54cb3016850
[I 2023-10-12 12:01:49,938] Trial 0 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8846156258427612}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:49,940] Trial 1 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5164597922916226}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:49,941] Trial 2 finished with value: 0.4747572360399296 and parameters: {'w1': 0.4241199740897519}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:49,943] Trial 3 finished with value: 0.4747572360399296 and parameters: {'w1': 0.4320749704975453}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:49,945] Trial 4 finished with value: 0.4747572360399296 and parameters: {'w1': 0.3866896705570465}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:49,947] Trial 5 finished w

[I 2023-10-12 12:01:50,219] Trial 48 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5627646625187428}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,226] Trial 49 finished with value: 0.4747572360399296 and parameters: {'w1': 0.3588876982567428}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,233] Trial 50 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6214187858361596}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,240] Trial 51 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8731578871747775}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,248] Trial 52 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5286112264703438}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,255] Trial 53 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7310734026571893}. Best is trial 0 with value: 0.4747

[I 2023-10-12 12:01:50,574] Trial 97 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8249335643597075}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,581] Trial 98 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8590573242979047}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,588] Trial 99 finished with value: 0.4747572360399296 and parameters: {'w1': 0.47778863321425813}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,595] Trial 100 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7857910336393209}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,602] Trial 101 finished with value: 0.4747572360399296 and parameters: {'w1': 0.05868647136850763}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,610] Trial 102 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5010954813499466}. Best is trial 0 with value: 0

[I 2023-10-12 12:01:50,940] Trial 146 finished with value: 0.4747572360399296 and parameters: {'w1': 0.872831958276028}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,948] Trial 147 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6953044076438357}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,955] Trial 148 finished with value: 0.4747572360399296 and parameters: {'w1': 0.636830680662495}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,962] Trial 149 finished with value: 0.4747572360399296 and parameters: {'w1': 0.4204300866603982}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,969] Trial 150 finished with value: 0.4747572360399296 and parameters: {'w1': 0.295923092449259}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:50,977] Trial 151 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8112601433779112}. Best is trial 0 with value: 0.4

[I 2023-10-12 12:01:51,317] Trial 195 finished with value: 0.4747572360399296 and parameters: {'w1': 0.379389684870846}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,325] Trial 196 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5287008165137119}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,332] Trial 197 finished with value: 0.4747572360399296 and parameters: {'w1': 0.1876107903876132}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,340] Trial 198 finished with value: 0.4747572360399296 and parameters: {'w1': 0.9679296625121736}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,347] Trial 199 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7531947060305119}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,355] Trial 200 finished with value: 0.4747572360399296 and parameters: {'w1': 0.9826725080891124}. Best is trial 0 with value: 0

[I 2023-10-12 12:01:51,690] Trial 244 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8368715239653226}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,697] Trial 245 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6733626792198794}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,705] Trial 246 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6064776641782155}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,713] Trial 247 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5766089151207593}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,720] Trial 248 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6906522067633603}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:51,728] Trial 249 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5466023997689377}. Best is trial 0 with value: 

[I 2023-10-12 12:01:52,064] Trial 293 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5444805265005602}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,072] Trial 294 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6326000336886708}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,080] Trial 295 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7839712208602304}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,087] Trial 296 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8720866456820826}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,095] Trial 297 finished with value: 0.4747572360399296 and parameters: {'w1': 0.36103104405127184}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,102] Trial 298 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7262547131932285}. Best is trial 0 with value:

[I 2023-10-12 12:01:52,451] Trial 342 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8585692046939477}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,459] Trial 343 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6558201161147172}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,467] Trial 344 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7147880145817713}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,475] Trial 345 finished with value: 0.4747572360399296 and parameters: {'w1': 0.2190369574931248}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,483] Trial 346 finished with value: 0.4747572360399296 and parameters: {'w1': 0.062316875471861426}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,492] Trial 347 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5564678191147084}. Best is trial 0 with value

[I 2023-10-12 12:01:52,855] Trial 391 finished with value: 0.4747572360399296 and parameters: {'w1': 0.3850473874267253}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,863] Trial 392 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6413499083114732}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,872] Trial 393 finished with value: 0.4747572360399296 and parameters: {'w1': 0.04795607663202778}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,881] Trial 394 finished with value: 0.4747572360399296 and parameters: {'w1': 0.9267678490524579}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,889] Trial 395 finished with value: 0.4747572360399296 and parameters: {'w1': 0.48936819572984314}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:52,897] Trial 396 finished with value: 0.4747572360399296 and parameters: {'w1': 0.7516123277871962}. Best is trial 0 with value

[I 2023-10-12 12:01:53,288] Trial 440 finished with value: 0.4747572360399296 and parameters: {'w1': 0.6445647494152726}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,301] Trial 441 finished with value: 0.4747572360399296 and parameters: {'w1': 0.9563813257354888}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,310] Trial 442 finished with value: 0.4747572360399296 and parameters: {'w1': 0.30694449613500985}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,321] Trial 443 finished with value: 0.4747572360399296 and parameters: {'w1': 0.48870473544089804}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,329] Trial 444 finished with value: 0.4747572360399296 and parameters: {'w1': 0.11985176680263104}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,338] Trial 445 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8425495376299087}. Best is trial 0 with valu

[I 2023-10-12 12:01:53,735] Trial 489 finished with value: 0.4747572360399296 and parameters: {'w1': 0.4611075261852314}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,744] Trial 490 finished with value: 0.4747572360399296 and parameters: {'w1': 0.5834698059577061}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,753] Trial 491 finished with value: 0.4747572360399296 and parameters: {'w1': 0.3661465327272001}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,762] Trial 492 finished with value: 0.4747572360399296 and parameters: {'w1': 0.8136001940002842}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,771] Trial 493 finished with value: 0.4747572360399296 and parameters: {'w1': 0.3349196005519432}. Best is trial 0 with value: 0.4747572360399296.
[I 2023-10-12 12:01:53,780] Trial 494 finished with value: 0.4747572360399296 and parameters: {'w1': 0.221985672503286}. Best is trial 0 with value: 0

In [26]:
best_params

{'w1': 0.8846156258427612}

In [71]:
# best_params # 0.5210540444629375 | 

In [72]:
# Build the final blend from the tuned weights; the k-th weight applies to oofs[k].
weights = best_params

pred_columns = ['content_pred', 'wording_pred']
preds_final = None

for idx, (model_key, model_wt) in enumerate(weights.items()):
    print(model_key, model_wt)
    weighted_preds = oofs[idx][pred_columns].values * model_wt
    if preds_final is None:
        preds_final = weighted_preds
    else:
        preds_final += weighted_preds

# Normalise by the total weight so the blend is a weighted average.
weight_total = np.sum(list(weights.values()))
preds_final = preds_final/weight_total
print(weight_total)

# Copy of the first OOF frame with its prediction columns replaced by the blend.
model_ensemble = oofs[0].copy()
model_ensemble[pred_columns] = preds_final


score, scores = get_score(true_labels_oof, preds_final)
score, scores

w1 0.7039470880705468
0.7039470880705468


(0.4826655974659348, [0.4183766427459229, 0.5469545521859467])

In [73]:
# Replace the index inherited from the filtered OOF frame with a fresh 0..n-1 range.
model_ensemble = model_ensemble.reset_index(drop=True)

In [74]:
df

Unnamed: 0,content,wording,student_id,prompt_id,text,tokenize_length,fold
0,-1.282615,-1.338554,ad7245db300c,39c16e,[SUMMARY_START]The main person is a likable pe...,769,0
1,-1.338856,-1.126083,c4433d2e4905,39c16e,[SUMMARY_START]An ideal tragedy as described b...,770,0
2,-1.424251,-1.332204,ac8891e90289,39c16e,[SUMMARY_START]a complex twisting plot that ma...,770,0
3,-1.290172,-1.414644,e16cafc1509a,39c16e,[SUMMARY_START]At least 3 elements of an ideal...,771,0
4,-1.277248,-1.151478,551c0f2ac3de,39c16e,[SUMMARY_START]An ideal tragedy must have a c...,771,0
...,...,...,...,...,...,...,...
7160,2.488900,2.252955,a04bba18f7d2,814d6b,[SUMMARY_START]The way that the Third Wave exp...,1030,3
7161,2.084769,2.026452,11c3509b7b43,814d6b,[SUMMARY_START]The Third Wave developed over s...,1063,3
7162,2.816301,2.524373,9ecc3600af3f,814d6b,[SUMMARY_START]The Third Wave started as a exp...,1078,3
7163,2.845079,2.200934,e12feeaa31c0,814d6b,[SUMMARY_START]The Third Wave developed in suc...,1102,3


In [75]:
# Feature table for the LightGBM stacker, restricted to folds 0-3.
# NOTE(review): absolute, user-specific path — the other cells use paths
# relative to the notebook ("../data/..."); consider doing the same here.
train = pd.read_csv("/home/rohits/pv1/commonlit/data/raw/train_folds_processed.csv")
train = train.loc[train.fold.isin([0,1,2,3])].sort_values('student_id').reset_index(drop=True)

dfs = []
for csv in csvs:
    # e.g. "../data/oofs/model236.csv" -> "model236"
    model_name = csv.split("/")[-1].split(".")[0]
    df = pd.read_csv(csv)
    df = df.loc[df.fold.isin([0,1,2,3])]
    df = df.loc[df.student_id.isin(train_df.student_id.values)].sort_values('student_id').reset_index(drop=True)
    df = df[['student_id', 'content', 'wording']]

    # Attach the model's predictions by student_id rather than by row
    # position: positional assignment is only correct when both frames hold
    # exactly the same students, and misaligns silently otherwise.  Students
    # without a prediction get NaN; duplicate student_ids in the OOF file
    # make the lookup raise instead of guessing.
    preds_by_id = df.set_index('student_id')
    for target_col in ['content', 'wording']:
        train[f'{target_col}_{model_name}'] = train['student_id'].map(preds_by_id[target_col])
    
    
# # # ## add weighted ensemble
# model_ensemble = model_ensemble.loc[df.student_id.isin(train.student_id.values)].sort_values('student_id').reset_index(drop=True)
# model_ensemble = model_ensemble[['student_id', 'content_pred', 'wording_pred']]
# train[[f'content_ensemble', f'wording_ensemble']] = model_ensemble[['content_pred', 'wording_pred']]

In [76]:
train.head(2)

Unnamed: 0,student_id,prompt_id,text,content,wording,fold,summary_length,splling_err_num,prompt_question,prompt_title,prompt_text,prompt_length,length_ratio,word_overlap_count,bigram_overlap_count,trigram_overlap_count,quotes_count,bart-large-cnn,content_model250_avg,wording_model250_avg
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,3,69,5,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,671,0.102832,0,5,0,0,The Third Wave experiment took place at Cubbe...,-0.043972,0.889765
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,2,56,2,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1137,0.049252,0,22,10,0,"With one member trimming beef in a cannery, a...",-0.517902,-0.221838


## LGBM model

In [77]:
import lightgbm as lgb


# Columns to predict; one set of fold models is trained per target.
targets = ["content", "wording"]

# Columns removed from `train` before fitting: identifiers, the fold label,
# raw text fields and the targets themselves.  Whatever remains is used as
# the feature matrix.
drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text", "bart-large-cnn", # "quotes_count", "trigram_overlap_count", "bigram_overlap_count", "word_overlap_count"
               ] + targets

In [78]:
# 'summary_length', 'splling_err_num', 'prompt_length', 'length_ratio',
#        'word_overlap_count', 'bigram_overlap_count', 'trigram_overlap_count',
#        'quotes_count', 'content_model2', 'wording_model2', 'content_model201',
#        'wording_model201', 'content_ensemble', 'wording_ensemble'

In [79]:
# summary_length, splling_err_num, prompt_length, length_ratio, word_overlap_count

In [80]:
# target name -> list of fitted boosters, one per fold (in fold order 0..3).
model_dict = {}

# Hyper-parameters are the same for every target and fold, so define them once.
params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.045,
    'max_depth': 3,
    'lambda_l1': 0.1,
#     'lambda_l2': 0.01
}

for target in targets:
    models = []

    for fold in [0,1,2,3]:
        # Train on every fold except `fold`; validate on `fold`.
        train_rows = train[train["fold"] != fold]
        eval_rows = train[train["fold"] == fold]

        X_train_cv = train_rows.drop(columns=drop_columns)
        y_train_cv = train_rows[target]
        X_eval_cv = eval_rows.drop(columns=drop_columns)
        y_eval_cv = eval_rows[target]

        # DataFrame.info() prints its summary itself and returns None, so it
        # is called directly instead of being wrapped in print().
        X_train_cv.info()

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        evaluation_results = {}
        model = lgb.train(
            params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dval],
            # Only the hold-out fold is monitored, so it is the only name
            # given; the previous ['train', 'valid'] labelled the hold-out
            # metric as "train" in the logs and in evaluation_results.
            valid_names=['valid'],
            callbacks=[
                # Stop when the validation RMSE has not improved for 30 rounds.
                lgb.early_stopping(stopping_rounds=30, verbose=True),
                lgb.log_evaluation(10),
                lgb.callback.record_evaluation(evaluation_results)
            ],
        )
        models.append(model)

    model_dict[target] = models

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5108 entries, 0 to 7164
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   summary_length         5108 non-null   int64  
 1   splling_err_num        5108 non-null   int64  
 2   prompt_length          5108 non-null   int64  
 3   length_ratio           5108 non-null   float64
 4   word_overlap_count     5108 non-null   int64  
 5   bigram_overlap_count   5108 non-null   int64  
 6   trigram_overlap_count  5108 non-null   int64  
 7   quotes_count           5108 non-null   int64  
 8   content_model250_avg   5108 non-null   float64
 9   wording_model250_avg   5108 non-null   float64
dtypes: float64(3), int64(7)
memory usage: 439.0 KB
None
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1394
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 10
[LightGBM] [Info] Start trai

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6062 entries, 1 to 7164
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   summary_length         6062 non-null   int64  
 1   splling_err_num        6062 non-null   int64  
 2   prompt_length          6062 non-null   int64  
 3   length_ratio           6062 non-null   float64
 4   word_overlap_count     6062 non-null   int64  
 5   bigram_overlap_count   6062 non-null   int64  
 6   trigram_overlap_count  6062 non-null   int64  
 7   quotes_count           6062 non-null   int64  
 8   content_model250_avg   6062 non-null   float64
 9   wording_model250_avg   6062 non-null   float64
dtypes: float64(3), int64(7)
memory usage: 521.0 KB
None
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401
[LightGBM] [Info] Number of data points in the train set: 6062, number of used features: 10
[LightGBM] [Info] Start trai

[220]	train's rmse: 0.694928
Early stopping, best iteration is:
[192]	train's rmse: 0.694206


In [81]:
# cv
rmses = []

preds_target = {}

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    ids = []
    
    folds = [0,1,2,3]
    
    for fold, model in zip(folds, models):
        fold_ids = train[train["fold"] == fold].student_id.values
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        ids.extend(fold_ids)
        
    preds_target[target] = preds
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.43729095808875873
wording_rmse : 0.562393929440772
mcrmse : 0.4998424437647654


In [82]:
content_rmse : 0.4298522117128822
wording_rmse : 0.5883296879464759
mcrmse : 0.5090909498296791


# content_rmse : 0.4314174437930952
# wording_rmse : 0.5666001367824579
# mcrmse : 0.49900879028777656

In [51]:
len(preds_target['wording'])

7165

In [None]:
# Out-of-fold LightGBM predictions per student.  `ids` and both prediction
# lists were collected in fold order by the CV cell, so the three columns
# line up row by row.
ps_df = pd.DataFrame({
    'student_id': ids,
    'content_pred': preds_target['content'],
    'wording_pred': preds_target['wording']
})

In [None]:
# Persist the predictions.  With the pandas default (index=True) the row index
# is written as an extra, unnamed first column.
ps_df.to_csv("../data/raw/pseudo_lgbm1.csv")

In [None]:
def reduce_rmse_content(params):
    """
    Score the weighted average of the ``content_pred`` column across models.

    Parameters:
    - params: dict of weights; the k-th entry (in insertion order) weights
      ``oofs[k]``.

    Returns:
    - score: value returned by ``get_score(..., single=True)`` for the
      normalised blend against the global ``true_labels_content``.
    """
    for position, weight in enumerate(params.values()):
        weighted = weight * oofs[position][['content_pred']].values
        if position:
            preds += weighted
        else:
            # First model starts the accumulator.
            preds = weighted

    # Normalise so the weights effectively sum to one.
    preds = preds / sum(params.values())

    blend_score, _ = get_score(true_labels_content, preds, single=True)
    return blend_score


def reduce_rmse_wording(params):
    """
    Score the weighted average of the ``wording_pred`` column across models.

    Parameters:
    - params: dict of weights; the k-th entry (in insertion order) weights
      ``oofs[k]``.

    Returns:
    - score: value returned by ``get_score(..., single=True)`` for the
      normalised blend against the global ``true_labels_wording``.
    """
    for position, weight in enumerate(params.values()):
        weighted = weight * oofs[position][['wording_pred']].values
        if position:
            preds += weighted
        else:
            # First model starts the accumulator.
            preds = weighted

    # Normalise so the weights effectively sum to one.
    preds = preds / sum(params.values())

    blend_score, _ = get_score(true_labels_wording, preds, single=True)
    return blend_score
    


def objective_content(trial):
    """
    Optuna objective for the content target.

    Suggests one weight in [0, 1] per entry of ``oofs`` (named w1, w2, ...)
    and returns the blend score computed by ``reduce_rmse_content``.
    """
    # Values from the commented-out code of the original cell (apparently
    # weights pinned by hand for a six-model run), kept for reference:
    #   w1=0.0, w2=0.2751292481676879, w3=0.9334223120116154,
    #   w4=0.33590012439835765, w5=0.9167395356124376, w6=0.06456631090298333
    params = {
        f"w{i+1}": trial.suggest_float(f'w{i+1}', 0, 1)
        for i in range(len(oofs))
    }
    return reduce_rmse_content(params)


def objective_wording(trial):
    """Optuna objective for the wording target.

    Samples one weight in [0, 1] per entry of `oofs` (named w1, w2, ...) and returns the
    blended score computed by `reduce_rmse_wording`.
    """
    # An earlier experiment pinned the weights to constants instead of sampling them:
    #   w1=0.467454419562916, w2=0.0, w3=0.0,
    #   w4=0.0, w5=0.24793402911254775, w6=0.872806341843549
    params = {
        f"w{i+1}": trial.suggest_float(f'w{i+1}', 0, 1)
        for i in range(len(oofs))
    }
    return reduce_rmse_wording(params)
        
    

In [None]:
# Search for the content blend weights that minimise the objective.
# The sampler is seeded explicitly so that a fresh kernel reproduces the same trials and
# therefore the same best weights; without a seed every run explores different points.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_content, n_trials=500)
best_params_content = study.best_params

In [None]:
best_params_content # presumably the best content score of an earlier run, 0.4350322004265345 (note originally read "0.0.4350322004265345") — TODO confirm

In [None]:
# Search for the wording blend weights that minimise the objective.
# The sampler is seeded explicitly so that a fresh kernel reproduces the same trials and
# therefore the same best weights; without a seed every run explores different points.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_wording, n_trials=500)
best_params_wording = study.best_params

In [None]:
best_params_wording # presumably the best wording score of an earlier run, 0.5824519396100762 (note originally read "0.0.5824519396100762") — TODO confirm

In [None]:
# Hand-entered per-target values, averaged in the next cell.
# Presumably the best scores copied from the two studies above — TODO confirm.
content, wording = 0.43503616854966076, 0.5813420492103132

In [None]:
# Print the mean of the two per-target values followed by the individual values.
print((content + wording)/2, f"[content - {content}, wording - {wording}]" ) 

In [None]:
# oofs # 0.5323537777575318

In [None]:
# Blend weights per target, keyed by model name.
# The weights are not normalised; the blending cells divide by their sum.
# NOTE(review): the blending cells pair these entries with `oofs` by position
# (enumerate order -> oofs[idx]); the model names are only printed, so the key order
# here must match the order of `oofs`.
best_weights = {
    "content": {
        'model208': 0.014894157748461517,
        'model210': 0.024798218285980337,
        'model222': 0.3683565615916879,
        'model223': 0.07440796563094199,
        'model301': 0.26009101940467266,
        'model303': 0.035186623400637024,
        'model304': 0.03131238078272683,
        'model228': 0.27593695225248327,
        'model229': 0.9533872030382488,
        
    },
    
    "wording": {
        'model208': 0.0520672559265657,
        'model210': 0.019115888342385444,
        'model222': 0.0257670812178082,
        'model223': 0.06963802230475018,
        'model301': 0.004504811515739459,
        'model303': 0.42283927332769894,
        'model304': 0.050441060713579226,
        'model228': 0.9768371762718951,
        'model229': 0.9179471545026996,
    }    
}



In [None]:

# Weighted average of the per-model OOF content predictions.
# NOTE(review): weights are paired with `oofs` by position (enumerate order), not by the
# model name used as dict key — the names are only printed.
content_final = None

for col in ['content']:
    w_ = best_weights[col]
    weight_total = np.sum(list(w_.values()))
    for idx, (model_key, model_wt) in enumerate(w_.items()):
        print(model_key, model_wt)
        weighted = oofs[idx]['content_pred'].values * model_wt
        if idx:
            content_final += weighted
        else:
            content_final = weighted
    content_final = content_final / weight_total
    print(weight_total)

In [None]:
# Weighted average of the per-model OOF wording predictions.
# NOTE(review): weights are paired with `oofs` by position (enumerate order), not by the
# model name used as dict key — the names are only printed.
wording_final = None

for col in ['wording']:
    w_ = best_weights[col]
    weight_total = np.sum(list(w_.values()))
    for idx, (model_key, model_wt) in enumerate(w_.items()):
        print(model_key, model_wt)
        weighted = oofs[idx]['wording_pred'].values * model_wt
        if idx:
            wording_final += weighted
        else:
            wording_final = weighted
    wording_final = wording_final / weight_total
    print(weight_total)

In [None]:
# Score the blend: put the blended columns into a copy of the first OOF frame (which also
# supplies the ground-truth columns) and evaluate both targets with get_score.
final_oof_df = oofs[0].copy()
final_oof_df['content_pred'], final_oof_df['wording_pred'] = content_final, wording_final

true_labels_oof = final_oof_df[['content_gt', 'wording_gt']].values
predictions_oof = final_oof_df[['content_pred', 'wording_pred']].values

score, scores = get_score(true_labels_oof, predictions_oof)
print(score, scores)

In [None]:
# Write the blended OOF frame to disk without the row index.
final_oof_df.to_csv("../data/raw/pseudo2.csv", index=False)

In [None]:
### Prepare the v1 pseudo file (pseudo1.csv)

In [None]:
# Blend weights for the v1 file; this rebinds `best_weights`, replacing the dict defined
# earlier in the notebook.
# The weights are not normalised; the blending cells divide by their sum.
# NOTE(review): entries are paired with `oofs` by position (enumerate order -> oofs[idx]),
# so these six keys are applied to the first six frames in `oofs`, whatever they are.
best_weights = {
    "content": {
        'model208': 0.021420427641406517,
        'model210': 0.17377306975618464,
        'model222': 0.6944174081358903,
        'model223': 0.2540676804844625,
        'model301': 0.6028460120908795,
        'model303': 0.027908605704995954,
    },
    
    "wording": {
        'model208': 0.467454419562916,
        'model210': 0.0,
        'model222': 0.0,
        'model223': 0.0,
        'model301': 0.24793402911254775,
        'model303': 0.872806341843549,
    }    
}

In [None]:
# Weighted average of the per-model OOF content predictions.
# NOTE(review): weights are paired with `oofs` by position (enumerate order), not by the
# model name used as dict key — the names are only printed.
content_final = None

for col in ['content']:
    w_ = best_weights[col]
    weight_total = np.sum(list(w_.values()))
    for idx, (model_key, model_wt) in enumerate(w_.items()):
        print(model_key, model_wt)
        weighted = oofs[idx]['content_pred'].values * model_wt
        if idx:
            content_final += weighted
        else:
            content_final = weighted
    content_final = content_final / weight_total
    print(weight_total)
    

In [None]:
# Weighted average of the per-model OOF wording predictions.
# NOTE(review): weights are paired with `oofs` by position (enumerate order), not by the
# model name used as dict key — the names are only printed.
wording_final = None

for col in ['wording']:
    w_ = best_weights[col]
    weight_total = np.sum(list(w_.values()))
    for idx, (model_key, model_wt) in enumerate(w_.items()):
        print(model_key, model_wt)
        weighted = oofs[idx]['wording_pred'].values * model_wt
        if idx:
            wording_final += weighted
        else:
            wording_final = weighted
    wording_final = wording_final / weight_total
    print(weight_total)

In [None]:
# Score the blend: put the blended columns into a copy of the first OOF frame (which also
# supplies the ground-truth columns) and evaluate both targets with get_score.
final_oof_df = oofs[0].copy()
final_oof_df['content_pred'], final_oof_df['wording_pred'] = content_final, wording_final

true_labels_oof = final_oof_df[['content_gt', 'wording_gt']].values
predictions_oof = final_oof_df[['content_pred', 'wording_pred']].values

score, scores = get_score(true_labels_oof, predictions_oof)
print(score, scores)

In [None]:
# Write the blended OOF frame to disk without the row index.
# NOTE(review): this path is relative to the current working directory, whereas the other
# outputs in this notebook are written under ../data/raw/ — confirm the location is intended.
final_oof_df.to_csv("pseudo1.csv", index=False)

In [None]:
type(true_labels_oof)

In [None]:
# target_columns = ['content', 'wording']
# oof_final[target_columns] = 0

# for col in target_columns:
#     w_ = best_weights[col]
#     for fn, w in w_.items():
#         oof_final[col] += oof_final[col+'_'+fn] * w
#     oof_final[col] = oof_final[col]/np.sum(list(w_.values()))
    
    
# oof_final = oof_final[['student_id'] + target_columns]
# # submission[target_columns] = submission[target_columns] .clip(1, 5)
# submission.head()

In [380]:
# Load the fold-annotated training table.
# NOTE(review): absolute, machine-specific path; the rest of the notebook uses paths
# relative to the notebook ("../data/raw/...") — consider doing the same here.
train = pd.read_csv("/home/rohits/pv1/commonlit/data/raw/train_folds_processed.csv")


In [389]:
# Keep only the rows whose `fold` column equals 2.
fold = train.loc[train.fold == 2]
# Display the first distinct prompt text among those rows.
# NOTE(review): the meaning of the trailing numbers is not evident from the notebook — TODO clarify.
fold['prompt_text'].unique()[0] # 604, 550, 966, 596

'With one member trimming beef in a cannery, and another working in a sausage factory, the family had a first-hand knowledge of the great majority of Packingtown swindles. For it was the custom, as they found, whenever meat was so spoiled that it could not be used for anything else, either to can it or else to chop it up into sausage. With what had been told them by Jonas, who had worked in the pickle rooms, they could now study the whole of the spoiled-meat industry on the inside, and read a new and grim meaning into that old Packingtown jest—that they use everything of the pig except the squeal. \r\nJonas had told them how the meat that was taken out of pickle would often be found sour, and how they would rub it up with soda to take away the smell, and sell it to be eaten on free-lunch counters; also of all the miracles of chemistry which they performed, giving to any sort of meat, fresh or salted, whole or chopped, any color and any flavor and any odor they chose. In the pickling of

In [377]:
fold.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording', 'fold'], dtype='object')

In [None]:
train.head()

In [None]:
# Lookup table from lowercase English contractions to their expansions, consumed by
# clean_summary (which looks words up by their lowercased form).
# Ambiguous contractions list both readings separated by " / "; the whole value string,
# slash included, is what gets substituted into the text.
contractions = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
# Pick the `text` entry with index label 300 as a sample input and display it.
txt = train['text'][300]
txt

In [None]:
import re
from bs4 import BeautifulSoup


def clean_summary(summary):
    """Normalise a raw summary string into plain alphanumeric text.

    Steps, in order:
      1. Expand contractions listed in the module-level ``contractions`` table. Matching
         is case-insensitive and whole-word only; the table value is inserted verbatim
         (including both readings of entries written as "a / b").
      2. Replace newlines with the literal marker ``[BR]``.
      3. Strip HTML markup with BeautifulSoup.
      4. Replace every character that is not an ASCII letter, digit or whitespace
         with a space.
      5. Collapse whitespace runs into a single space and trim both ends.

    Parameters
    ----------
    summary : str
        Raw summary text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Expand contractions as whole words only. The previous implementation called
    # summary.replace(word, ...) for every matching token, which also rewrote the same
    # characters inside longer words (expanding "he's" turned "she's" into
    # "she has / he is") and handled punctuation-adjacent contractions inconsistently.
    # Longest keys first so that e.g. "can't've" is preferred over "can't".
    keys = sorted(contractions, key=len, reverse=True)
    if keys:
        # A contraction must not be glued to a word character or another apostrophe.
        pattern = r"(?<![\w'])(?:" + "|".join(re.escape(key) for key in keys) + r")(?![\w'])"
        summary = re.sub(
            pattern,
            # .get with a fallback: case-insensitive matching may accept a spelling whose
            # .lower() is not a table key; such a match is left untouched.
            lambda match: contractions.get(match.group(0).lower(), match.group(0)),
            summary,
            flags=re.IGNORECASE,
        )

    # Mark line breaks with a literal "[BR]" token.
    # NOTE(review): the brackets are removed again by the character filter below, so a
    # newline ends up as the bare word "BR" in the output — confirm this is intended.
    text = summary.replace("\n", "[BR]")
    # Remove HTML tags using BeautifulSoup
    text = BeautifulSoup(text, "html.parser").get_text()
    # Replace anything that is not an ASCII letter, digit or whitespace with a space
    text = re.sub(r'[^A-Za-z0-9\s]', ' ', text)
    # Collapse whitespace runs and trim
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
clean_summary(txt)

In [None]:
# contractions

In [None]:
oofs[0]

In [None]:
!pip freeze