In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from lightgbm import LGBMRegressor
from sklearn import model_selection
from sklearn import metrics

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [5]:
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [6]:
target_col = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(max_features=5000,smooth_idf=True, sublinear_tf=True) 
vec.fit(raw_documents=train.full_text)

TfidfVectorizer(max_features=5000, sublinear_tf=True)

In [8]:
def extract_vectors(x):
    vecs = vec.transform(x)
    return vecs.toarray().flatten()

train['vecs'] = train.full_text.apply(lambda x: extract_vectors([x]))

In [9]:
train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,vecs
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
 8   vecs         3911 non-null   object 
dtypes: float64(6), object(3)
memory usage: 275.1+ KB


In [11]:
feature_set = []
for i, row in tqdm(train.iterrows(), total=len(train)):
    vecs = row['vecs']
    vals = row[target_col].astype(float)
    features = np.hstack([vecs, vals]).flatten()
    feature_set.append(features)
feature_set = np.array(feature_set)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3911/3911 [00:01<00:00, 1963.72it/s]


In [12]:
cohesion_train = feature_set[:, -6]
syntax_train = feature_set[:, -5]
vocabulary_train = feature_set[:, -4]
phraseology_train = feature_set[:, -3]
grammar_train = feature_set[:, -2]
conventions_train = feature_set[:, -1]

X = feature_set[:, :-6]

In [13]:
cohesion_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)
syntax_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)
vocabulary_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)
phraseology_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)
grammar_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)
conventions_model = LGBMRegressor(n_estimators=500, max_depth=8, learning_rate=0.1)

In [14]:
performances_container = []


cv_strategy = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

performances = {}

for var in [("cohesion", cohesion_train), ("syntax", syntax_train), ("vocabulary", vocabulary_train), 
            ("phraseology", phraseology_train), ("grammar", grammar_train), ("conventions", conventions_train)]:
    print(f'Validating on {var[0]}')
    fold = 0
    train_scores = []
    val_scores = []
    
    n_estimators = 500
    max_depth = 8
    learning_rate = 0.1
    
    for train_idx, val_idx in cv_strategy.split(X, var[1]):

        X_train = X[train_idx]
        y_train = var[1][train_idx]

        X_val = X[val_idx]
        y_val = var[1][val_idx]

        model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
        model.fit(X_train, y_train)

        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)

        train_score = np.sqrt(metrics.mean_squared_error(y_train, train_preds))
        val_score = np.sqrt(metrics.mean_squared_error(y_val, val_preds))

        train_scores.append(train_score)
        val_scores.append(val_score)

        print(f"Fold {fold} ==> Train accuracy: {train_score:0.4f} | Validation accuracy: {val_score:0.4f}")
        fold += 1

    training_performance = np.mean(train_scores)
    val_performance = np.mean(val_scores)
    performances = {"var": var[0], "training_performance": training_performance, "val_performance": val_performance, "n_estimators": n_estimators, "max_depth": max_depth, "learning_rate": learning_rate}
    performances_container.append(performances)
    print(f"END. Training performance: {training_performance:0.4f} | Validation performance: {val_performance:0.4f}\n")

Validating on cohesion
Fold 0 ==> Train accuracy: 0.0765 | Validation accuracy: 0.5649
Fold 1 ==> Train accuracy: 0.0781 | Validation accuracy: 0.5753
Fold 2 ==> Train accuracy: 0.0765 | Validation accuracy: 0.5595
Fold 3 ==> Train accuracy: 0.0755 | Validation accuracy: 0.5823
Fold 4 ==> Train accuracy: 0.0723 | Validation accuracy: 0.5732
END. Training performance: 0.0758 | Validation performance: 0.5710

Validating on syntax
Fold 0 ==> Train accuracy: 0.0650 | Validation accuracy: 0.5545
Fold 1 ==> Train accuracy: 0.0651 | Validation accuracy: 0.5591
Fold 2 ==> Train accuracy: 0.0643 | Validation accuracy: 0.5475
Fold 3 ==> Train accuracy: 0.0689 | Validation accuracy: 0.5428
Fold 4 ==> Train accuracy: 0.0698 | Validation accuracy: 0.5404
END. Training performance: 0.0666 | Validation performance: 0.5489

Validating on vocabulary
Fold 0 ==> Train accuracy: 0.0575 | Validation accuracy: 0.4921
Fold 1 ==> Train accuracy: 0.0648 | Validation accuracy: 0.5039
Fold 2 ==> Train accuracy: 

In [15]:
cohesion_model.fit(X, cohesion_train)
syntax_model.fit(X, syntax_train)
vocabulary_model.fit(X, vocabulary_train)
phraseology_model.fit(X, phraseology_train)
grammar_model.fit(X, grammar_train)
conventions_model.fit(X, conventions_train)

LGBMRegressor(max_depth=8, n_estimators=500)

In [16]:
test['vecs'] = test.full_text.apply(lambda x: extract_vectors([x]))

In [17]:
test_feature_set = []
for i, row in tqdm(test.iterrows(), total=len(test)):
    test_feature_set.append(row.vecs)
test_feature_set = np.array(test_feature_set)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1000.07it/s]


In [18]:
cohesion_predictions = cohesion_model.predict(test_feature_set)
syntax_predictions = syntax_model.predict(test_feature_set)
vocabulary_predictions = vocabulary_model.predict(test_feature_set)
phraseology_predictions = phraseology_model.predict(test_feature_set)
grammar_predictions = grammar_model.predict(test_feature_set)
conventions_predictions = conventions_model.predict(test_feature_set)

In [19]:
submission = sample.copy()
submission.cohesion = cohesion_predictions
submission.syntax = syntax_predictions
submission.vocabulary = vocabulary_predictions
submission.phraseology = phraseology_predictions
submission.grammar = grammar_predictions
submission.conventions = conventions_predictions

In [20]:
submission.to_csv("submission.csv", index=False)