In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error
import seaborn as sns
import lightgbm as lgb

In [61]:
data= pd.read_csv('./data/final_data_v2.csv')
print(data.columns)
print(data.isnull().sum())
print(data.head())

Index(['paragraph_txt', 'essay_level', 'student_grade_group', 'org_paragraph',
       'org', 'org_essay', 'org_coherence', 'org_quantity', 'con_novelty',
       'con_clearance', 'con', 'con_prompt', 'con_description', 'exp_style',
       'exp_grammar', 'exp', 'exp_vocab', 'essay_grade', 'essay_main_subject',
       'punctuation_marks', 'ending_of_a_word', 'word_order', 'diff',
       'Rouge_l_f1', 'paragraph_scoreT_avg'],
      dtype='object')
paragraph_txt           0
essay_level             0
student_grade_group     0
org_paragraph           0
org                     0
org_essay               0
org_coherence           0
org_quantity            0
con_novelty             0
con_clearance           0
con                     0
con_prompt              0
con_description         0
exp_style               0
exp_grammar             0
exp                     0
exp_vocab               0
essay_grade             0
essay_main_subject      0
punctuation_marks       0
ending_of_a_word        0
word_o

In [62]:
# delete_columns= ['student_grade_group', 'essay_main_subject', 'essay_grade']

# data= data.drop(delete_columns, axis= 1)
# data.columns

In [63]:
# seed= 42
# train, test= train_test_split(data, test_size= 0.2, random_state= seed)
# train, val= train_test_split(train,test_size= 0.2, random_state= seed)

# train.to_csv('./data/train_with_para.csv', index= False)
# test.to_csv('./data/test_with_para.csv', index= False)
# val.to_csv('./data/validation_with_para.csv', index= False)


In [64]:
train= pd.read_csv('./data/train.csv')
test= pd.read_csv('./data/test.csv')
valid= pd.read_csv('./data/validation.csv')

In [65]:
def get_params():
    params = {'learning_rate': 0.01, 
          'max_depth': 64, 
          'boosting': 'gbdt', 
          'objective': 'multiclass', 
          'metric': 'multi_logloss', 
          'num_leaves': 8, 
          'min_data': 30, 
          'num_classes': 4, 
          'seed':42}

    return params

In [66]:
def get_label(pred_df):
    # pred_df= pred_df.sort_values(by=['paragraph_scoreT_avg'], ascending= False)
    # length= len(pred_df)
    
    # label= [3]* int(0.25*length) + [2]* int(0.25* length) + [1] * int(0.25* length) + [0]* (length- int(0.25*length) - int(0.25* length)- int(0.25* length))
    label= []
    for i in range(len(pred_df)):
        # print(pred_df.iloc[i]['paragraph_scoreT_avg'])
        if pred_df.iloc[i]['paragraph_scoreT_avg'] == 3.0:
            label.append(3)
        elif pred_df.iloc[i]['paragraph_scoreT_avg'] >= 2.75:
            label.append(2)
        elif pred_df.iloc[i]['paragraph_scoreT_avg'] >= 2.333335:
            label.append(1)
        else:
            label.append(0)
    pred_df['label']= label
    return pred_df

In [67]:
train= get_label(train)
train_x= train.drop(['paragraph_scoreT_avg', 'label'], axis= 1)
train_y= train['label']

valid= get_label(valid)
val_x= valid.drop(['paragraph_scoreT_avg', 'label'], axis= 1)
val_y= valid['label']

test= get_label(test)
test_x= test.drop(['paragraph_scoreT_avg', 'label'], axis= 1)
test_y= test['label']
pd.DataFrame(test_y).to_csv('./final_test_y.csv', index= False)

In [68]:
# scaler= RobustScaler()
# train_x= scaler.fit_transform(train_x)
# val_x= scaler.transform(val_x)
# test_x= scaler.transform(test_x)

trainset= lgb.Dataset(train_x, label= train_y)
valset= lgb.Dataset(val_x, label= val_y)
testset= lgb.Dataset(test_x, label= test_y)

In [69]:
params= get_params()
model= lgb.train(params, trainset, 1000, testset, verbose_eval=100, early_stopping_rounds= 100)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 414
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 18
[LightGBM] [Info] Start training from score -1.298589
[LightGBM] [Info] Start training from score -1.376963
[LightGBM] [Info] Start training from score -1.416968
[LightGBM] [Info] Start training from score -1.459762
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.29821
[200]	valid_0's multi_logloss: 1.28178
[300]	valid_0's multi_logloss: 1.28003
Early stopping, best iteration is:
[277]	valid_0's multi_logloss: 1.27946


In [70]:
pred_train = model.predict(train_x)
pred_val= model.predict(test_x)
print(pred_val)
tmp_df= pd.DataFrame(pred_val)
tmp_df.columns= [0, 1, 2, 3]
tmp_df.to_csv('./lightgbm_logits.csv', index= False)

pred_val= np.argmax(pred_val, axis= 1)

print(classification_report(pred_val, test_y))

[[0.15828239 0.20975432 0.31336402 0.31859927]
 [0.52388047 0.16009029 0.14985894 0.16617031]
 [0.49183613 0.20228367 0.17592399 0.12995621]
 ...
 [0.09108296 0.30173522 0.18921106 0.41797077]
 [0.08330528 0.36385926 0.35786966 0.19496581]
 [0.50848987 0.16350644 0.13330422 0.19469947]]
              precision    recall  f1-score   support

           0       0.76      0.44      0.56       544
           1       0.26      0.31      0.28       241
           2       0.27      0.33      0.30       230
           3       0.22      0.36      0.27       185

    accuracy                           0.38      1200
   macro avg       0.37      0.36      0.35      1200
weighted avg       0.48      0.38      0.41      1200

