In [108]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import lightgbm as lgb

In [109]:
data= pd.read_csv('./data/final_data_v2.csv')
print(data.columns)
print(data.isnull().sum())
print(data.head())

Index(['paragraph_txt', 'essay_level', 'student_grade_group', 'org_paragraph',
       'org', 'org_essay', 'org_coherence', 'org_quantity', 'con_novelty',
       'con_clearance', 'con', 'con_prompt', 'con_description', 'exp_style',
       'exp_grammar', 'exp', 'exp_vocab', 'essay_grade', 'essay_main_subject',
       'punctuation_marks', 'ending_of_a_word', 'word_order', 'diff',
       'Rouge_l_f1', 'paragraph_scoreT_avg'],
      dtype='object')
paragraph_txt           0
essay_level             0
student_grade_group     0
org_paragraph           0
org                     0
org_essay               0
org_coherence           0
org_quantity            0
con_novelty             0
con_clearance           0
con                     0
con_prompt              0
con_description         0
exp_style               0
exp_grammar             0
exp                     0
exp_vocab               0
essay_grade             0
essay_main_subject      0
punctuation_marks       0
ending_of_a_word        0
word_o

In [110]:
delete_columns= ['paragraph_txt', 'student_grade_group', 'essay_grade', 'essay_main_subject', \
    'ending_of_a_word', 'word_order', 'diff', 'punctuation_marks']
# delete_columns= ['paragraph_txt', 'student_grade_group', 'essay_grade', 'essay_main_subject']

data= data.drop(delete_columns, axis= 1)
data.columns

Index(['essay_level', 'org_paragraph', 'org', 'org_essay', 'org_coherence',
       'org_quantity', 'con_novelty', 'con_clearance', 'con', 'con_prompt',
       'con_description', 'exp_style', 'exp_grammar', 'exp', 'exp_vocab',
       'Rouge_l_f1', 'paragraph_scoreT_avg'],
      dtype='object')

In [111]:
seed= 42
train, test= train_test_split(data, test_size= 0.2, random_state= seed)
train, val= train_test_split(train,test_size= 0.2, random_state= seed)

# train.to_csv('./data/train.csv', index= False)
# test.to_csv('./data/test.csv', index= False)
# val.to_csv('./data/validation.csv', index= False)

In [112]:
train= pd.read_csv('./data/train.csv')
test= pd.read_csv('./data/test.csv')
valid= pd.read_csv('./data/validation.csv')

In [113]:
def get_params():
    params = {'learning_rate': 0.01, 
          'max_depth': 128, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 128, 
          'feature_fraction': 0.8, 
          'bagging_fraction': 0.8, 
          'bagging_freq': 5, 
          'seed':42}

    return params

In [114]:
train_x= train.drop(['paragraph_scoreT_avg'], axis= 1)
train_y= train['paragraph_scoreT_avg']

val_x= valid.drop(['paragraph_scoreT_avg'], axis= 1)
val_y= valid['paragraph_scoreT_avg']

test_x= test.drop(['paragraph_scoreT_avg'], axis= 1)
test_y= test['paragraph_scoreT_avg']

In [115]:
# scaler= RobustScaler()
# train_x= scaler.fit_transform(train_x)
# val_x= scaler.transform(val_x)
# test_x= scaler.transform(test_x)

trainset= lgb.Dataset(train_x, label= train_y)
valset= lgb.Dataset(val_x, label= val_y)
testset= lgb.Dataset(test_x, label= test_y)

In [116]:
params= get_params()
model= lgb.train(params, trainset, 1000, valset, verbose_eval=100, early_stopping_rounds= 100)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 414
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 18
[LightGBM] [Info] Start training from score 2.610969
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.126124
[200]	valid_0's l2: 0.123465
Early stopping, best iteration is:
[190]	valid_0's l2: 0.12334


In [117]:
pred_train = model.predict(train_x)
pred_val= model.predict(val_x)
# print(pred_val)
mse_train = mean_squared_error(train_y, pred_train)
mse = mean_squared_error(val_y, pred_val)
print(f'train mse : {mse_train}')
print(f'val mse: {mse}')



train mse : 0.09609505292693003
val mse: 0.12334021270598174


In [119]:
def get_label_and_accuracy(pred_score_list, df):
    pred_score_df= pd.DataFrame({'pred_score': pred_score_list})
    pred_df= pd.concat([df, pred_score_df], axis= 1)

    print(pred_df)
    pred_df= pred_df.sort_values(by=['paragraph_scoreT_avg'], ascending= False)
    length= len(pred_df)
    
    label= ['A']* int(0.25*length) + ['B']* int(0.25* length) + ['C'] * int(0.25* length) + ['D']* (length- int(0.25*length) - int(0.25* length)- int(0.25* length))
    pred_df['label']= label

    pred_df= pred_df.sort_values(by=['pred_score'], ascending= False)
    length= len(pred_df)
    label= ['A']* int(0.25*length) + ['B']* int(0.25* length) + ['C'] * int(0.25* length) + ['D']* (length- int(0.25*length) - int(0.25* length)- int(0.25* length))
    pred_df['pred_label']= label

    acc= 0
    for i in range(len(pred_df)):
        if pred_df.iloc[i]['label']==  pred_df.iloc[i]['paragraph_scoreT_avg']:
            acc+=1

    print(f'accuracy: {acc/len(pred_df)*100}%')

In [None]:
get_label_and_accuracy(pred_val, val_y)