In [1]:
!pip uninstall en-core-web-sm -y
!pip uninstall en-core-web-md -y
!pip uninstall en-core-web-lg -y
!pip uninstall tensorflow -y

Found existing installation: en-core-web-sm 3.5.0
Uninstalling en-core-web-sm-3.5.0:
  Successfully uninstalled en-core-web-sm-3.5.0
[0mFound existing installation: en-core-web-lg 3.5.0
Uninstalling en-core-web-lg-3.5.0:
  Successfully uninstalled en-core-web-lg-3.5.0
Found existing installation: tensorflow 2.12.0
Uninstalling tensorflow-2.12.0:
  Successfully uninstalled tensorflow-2.12.0


In [1]:
!pip install --no-index --find-links /kaggle/input/data9417/ /kaggle/input/data9417/tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index --find-links /kaggle/input/data9417/ /kaggle/input/data9417/contractions-0.1.73-py2.py3-none-any.whl
!pip install --no-index --find-links /kaggle/input/data9417/ /kaggle/input/data9417/spacy-3.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --no-index --find-links /kaggle/input/data9417/ /kaggle/input/data9417/spacy_cleaner-3.1.3-py3-none-any.whl

!pip install /kaggle/input/data9417/en_core_web_sm-3.4.1-py3-none-any.whl

### Results 
#### 0.5957 public score
#### 0.6042 private score

## Start here. 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import spacy
import spacy_cleaner
#import statsmodels.api as sm
#import pylab as py
from spacy_cleaner.processing import removers, replacers, mutators
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as backend
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, dot
import tensorflow as tf



In [4]:
train_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')

In [5]:
import random
from albumentations.core.transforms_interface import DualTransform, BasicTransform
from nltk import sent_tokenize
class NLPTransform(BasicTransform):
    """ Transform for nlp task."""
    LANGS = {
        'en': 'english',
        'it': 'italian', 
        'fr': 'french', 
        'es': 'spanish',
        'tr': 'turkish', 
        'ru': 'russian',
        'pt': 'portuguese'
    }

    @property
    def targets(self):
        return {"data": self.apply}
    
    def update_params(self, params, **kwargs):
        if hasattr(self, "interpolation"):
            params["interpolation"] = self.interpolation
        if hasattr(self, "fill_value"):
            params["fill_value"] = self.fill_value
        return params

    def get_sentences(self, text, lang='en'):
        return sent_tokenize(text, self.LANGS.get(lang, 'english'))
    


In [6]:
#trying basic data augmentation: 
class ShuffleSentencesTransform(NLPTransform):
    """ Do shuffle by sentence """
    def __init__(self, always_apply=False, p=0.5):
        super(ShuffleSentencesTransform, self).__init__(always_apply, p)

    def apply(self, data, **params):
        text, lang = data
        sentences = self.get_sentences(text, lang)
        random.shuffle(sentences)
        return ' '.join(sentences), lang
    
#https://www.kaggle.com/code/shonenkov/nlp-albumentations/notebook

In [7]:
transform = ShuffleSentencesTransform(p=1.0)
transformed_list = []

for iterator in train_df['full_text']:
    text = iterator
    lang = 'en'
    temp_text = transform(data=(text, lang))['data'][0]
    transformed_list.append(temp_text)
aug_df = train_df.copy()
aug_df['full_text'] = transformed_list
train_df = pd.concat([train_df, aug_df], keys = 'text_id', ignore_index = True)
train_df.tail()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
7817,FFD29828A873,"i've been through it all and seen it all, im a...",2.5,3.0,3.0,3.5,2.5,2.5
7818,FFD9A83B0849,Working with a group should be allowed for stu...,4.0,4.0,4.0,4.0,3.5,3.0
7819,FFDC4011AC9C,Then just die you get around the challlenge by...,2.5,3.0,3.0,3.0,3.5,3.0
7820,FFE16D704B16,"Look at Barack Obama for example, he's a black...",4.0,4.5,4.5,4.0,4.5,4.5
7821,FFED00D6E0BD,In that way they would be able to do everythin...,3.5,2.5,3.5,3.0,3.0,3.5


In [8]:
df = train_df.copy()
test_feat = test_df.copy()

In [9]:
#https://www.kaggle.com/code/tangelus/english-language-learning-vectorization-lgbm
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def flesch_kincaid_score(essay):
    #206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words).
    num_words = len(essay.split())
    num_sentences = len(essay.split('.'))
    syllables = sum([syllable_count(word) for word in essay.split()])
    score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (syllables / num_words)
    return score

In [10]:
# extract features from text
import string
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['char_count'] = df['full_text'].apply(len)
df['word_count'] = df['full_text'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count'] + 1)
df['punctuation_count'] = df['full_text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df['title_word_count'] = df['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
df['stopword_count'] = df['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))
df['flesch_kincaid_score'] = df['full_text'].apply(lambda x: flesch_kincaid_score(x))

In [11]:
eng_features_train = df.drop(['full_text', 'cohesion','syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions'], axis=1)
eng_features_train[:5]

Unnamed: 0,text_id,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count,flesch_kincaid_score
0,0016926B079C,1387,261,5.293893,21,3,1,129,83.98176
1,0022683E9EA5,2635,533,4.934457,21,12,2,311,63.312381
2,00299B378633,1663,320,5.180685,36,27,9,177,79.293125
3,003885A45F42,3973,728,5.449931,108,57,9,420,75.30375
4,0049B1DF5CCC,1326,234,5.642553,3,3,0,122,40.080577


In [12]:
test_feat['char_count'] = test_feat['full_text'].apply(len)
test_feat['word_count'] = test_feat['full_text'].apply(lambda x: len(x.split()))
test_feat['word_density'] = test_feat['char_count'] / (test_feat['word_count'] + 1)
test_feat['punctuation_count'] = test_feat['full_text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
test_feat['title_word_count'] = test_feat['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
test_feat['upper_case_word_count'] = test_feat['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
test_feat['stopword_count'] = test_feat['full_text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))
test_feat['flesch_kincaid_score'] = test_feat['full_text'].apply(lambda x: flesch_kincaid_score(x))

In [13]:
eng_features_test = test_feat.drop(['full_text'], axis=1)
eng_features_test[:5]

Unnamed: 0,text_id,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count,flesch_kincaid_score
0,0000C359D63E,4224,835,5.052632,37,25,1,454,67.34613
1,000BAD50D026,2167,386,5.599483,36,11,1,207,67.940871
2,00367BB2546B,2361,442,5.329571,33,11,1,244,62.349525


In [14]:
proc_text = [contractions.fix(text) for text in train_df['full_text']]
# Very fast! Measure time later
def more_processing(text):
    text = '<sostok> ' + text + ' <eostok>'
    text = text.lower()
    return text

proc_text_2 = [more_processing(text) for text in proc_text]

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
#print(nlp.pipe_names)

pipeline = spacy_cleaner.Pipeline(
    nlp,
    removers.remove_stopword_token,
    removers.remove_punctuation_token,
    replacers.replace_number_token,
    mutators.mutate_lemma_token,
)

# Remember to switch n_process to 4 when running on Kaggle.

proc_text_3 = pipeline.clean(proc_text_2, n_process = 4, batch_size = 12)

#proc_text_3 = pd.read_pickle("/kaggle/input/data9417/proc_text_3.pkl")

# Also added start of seq and end of seq, lowercase
train_df['cleaned_text'] = proc_text_3

train_df.head()

proc_test = [contractions.fix(text) for text in test_df['full_text']]
# Very fast! Measure time later
def more_processing(text):
    text = '<sostok> ' + text + ' <eostok>'
    text = text.lower()
    return text

proc_test_2 = [more_processing(text) for text in proc_test]

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
#print(nlp.pipe_names)

pipeline = spacy_cleaner.Pipeline(
    nlp,
    removers.remove_stopword_token,
    removers.remove_punctuation_token,
    replacers.replace_number_token,
    mutators.mutate_lemma_token,
)

# Remember to switch n_process to 4 when running on Kaggle.

proc_test_3 = pipeline.clean(proc_test_2, n_process = 4, batch_size = 12)

test_df['cleaned_text'] = proc_test_3

Cleaning Progress: 100%|██████████| 7822/7822 [04:32<00:00, 28.72it/s]
Cleaning Progress: 100%|██████████| 3/3 [00:00<00:00,  9.29it/s]


In [15]:
train_df = pd.concat([train_df, eng_features_train], keys = 'text_id', axis = 1)
test_df = pd.concat([test_df, eng_features_test], keys = 'text_id', axis = 1)

In [16]:
# Just fixing the column name problem.
df_1_cols = train_df['t'].columns.to_numpy()
df_2_cols = train_df['e'].columns.to_numpy()
#columns_1.extend(columns_2)
columns_1 = [x for x in df_1_cols]
columns_2 = [x for x in df_2_cols]
columns_total = columns_1 + columns_2

In [17]:
train_df.columns = columns_total

In [18]:
# Just fixing the column name problem.
df_1_cols = test_df['t'].columns.to_numpy()
df_2_cols = test_df['e'].columns.to_numpy()
#columns_1.extend(columns_2)
columns_1 = [x for x in df_1_cols]
columns_2 = [x for x in df_2_cols]
columns_total = columns_1 + columns_2
test_df.columns = columns_total

In [19]:
test_df.head()

Unnamed: 0,text_id,full_text,cleaned_text,text_id.1,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count,flesch_kincaid_score
0,0000C359D63E,when a person has no experience on a job their...,< sostok > person experience job go good peopl...,0000C359D63E,4224,835,5.052632,37,25,1,454,67.34613
1,000BAD50D026,Do you think students would benefit from being...,< sostok > think student benefit able attend c...,000BAD50D026,2167,386,5.599483,36,11,1,207,67.940871
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",< sostok > thomas jefferson state wonderful ag...,00367BB2546B,2361,442,5.329571,33,11,1,244,62.349525


In [20]:
# rare word analysis
def get_rare_word_percent(tokenizer, threshold):
    # threshold: if the word's occurrence is less than this then it's rare word

    count = 0
    total_count = 0
    frequency = 0
    total_frequency = 0
    rare_words = []

    for key, value in tokenizer.word_counts.items():
        total_count += 1
        total_frequency += value
        if value < threshold:
            rare_words.append({key, value})
            count += 1
            frequency += value
    print({
        'percent': round((count / total_count) * 100, 2),
        'total_coverage': round(frequency / total_frequency * 100, 2),
        'count': count,
        'total_count': total_count
        })

    return rare_words

# This is a different tokenizer to the one built into SpaCy
from tensorflow.keras.preprocessing.text import Tokenizer
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(train_df['cleaned_text'])

rare_words = get_rare_word_percent(x_tokenizer, 4)
#print(rare_words)

#rare_words[:100]
#mostly typos in otherwise normal

org_tokenizer = Tokenizer()
org_tokenizer.fit_on_texts(train_df['full_text'])

rare_words_2 = get_rare_word_percent(org_tokenizer, 4)
#print(rare_words)

print(rare_words_2[:100])
#typos exist in original sentences too.

{'percent': 52.24, 'total_coverage': 1.4, 'count': 9088, 'total_count': 17397}
{'percent': 49.91, 'total_coverage': 0.66, 'count': 11065, 'total_count': 22169}
[{2, 'showers'}, {2, 'mos'}, {2, 'homeles'}, {2, 'wount'}, {2, 'trowing'}, {2, 'estudnets'}, {'arctecture', 2}, {'estared', 2}, {'potition', 2}, {2, 'arquicteture'}, {2, 'thirt'}, {"cann't", 2}, {'beatifull', 2}, {2, 'selfcondifence'}, {'impotently', 2}, {2, 'selfcondience'}, {'battlefield', 2}, {2, 'selfcondicence'}, {2, 'selfcondidence'}, {'incuage', 2}, {2, 'circulating'}, {'corovirus', 2}, {'highschol', 2}, {2, 'grocerys'}, {2, 'thenthey'}, {2, 'aruge'}, {'techbology', 2}, {2, 'surgical'}, {2, "expensive's"}, {2, "sticker's"}, {'terror', 2}, {'tees', 2}, {'20019', 2}, {2, 'pedophiles'}, {2, 'nutrients'}, {2, 'interducaions'}, {2, 'warmest'}, {2, 'enjoble'}, {2, 'consecrate'}, {'paragargh', 2}, {2, 'bey'}, {2, 'wannna'}, {'remebers', 2}, {2, 'balmy'}, {2, 'reaserch'}, {2, "classmates'"}, {"class'", 2}, {'aggravated', 2}, {'ca

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
x_train_full, x_val_full, y_train, y_val = train_test_split(np.array(train_df[['cleaned_text', 'char_count', 'word_count', 'word_density',
                                                                     'punctuation_count', 'title_word_count', 'upper_case_word_count',
                                                                   'stopword_count', 'flesch_kincaid_score']]), np.array(train_df[['cohesion', 'syntax', 'vocabulary',
                                                                                  'phraseology', 'grammar', 'conventions']]), test_size = 0.05, random_state = 0, shuffle = True)

In [22]:
x_train = x_train_full[:, 0]
x_train_feats = x_train_full[:, 1:]
x_val = x_val_full[:, 0]
x_val_feats = x_val_full[:, 1:]

In [23]:
x_train_feats[:5]

array([[2326, 401, 5.786069651741293, 44, 23, 1, 208, 67.55522513161543],
       [4208, 746, 5.633199464524766, 86, 57, 12, 385, 65.17338914890718],
       [998, 189, 5.252631578947368, 16, 10, 2, 91, 72.00142857142859],
       [1148, 232, 4.927038626609442, 18, 11, 3, 143, 84.01425287356324],
       [3484, 626, 5.556618819776714, 66, 56, 9, 340, 70.51915257539159]],
      dtype=object)

In [24]:
# To check how many rows in a column has length (of the text) <= limit
def get_word_percent(column, limit):
    count = 0
    for sentence in column:
        if len(sentence.split()) <= limit:
            count += 1

    return round(count / len(column), 2)


# Check how many % of headlines have 0-430 words
print(get_word_percent(train_df.cleaned_text, 430))

1.0


In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_text_len = 450
x_train_sequence = x_tokenizer.texts_to_sequences(x_train)
x_val_sequence = x_tokenizer.texts_to_sequences(x_val)

# padding upto max_text_len
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_text_len, padding='pre')
x_val_padded = pad_sequences(x_val_sequence, maxlen=max_text_len, padding='pre')

x_vocab_size = len(x_tokenizer.word_index) + 1

print(x_vocab_size)

print(x_train_padded.shape)
print(y_train.shape)
print(x_val_padded.shape)
print(y_val.shape)

17398
(7430, 450)
(7430, 6)
(392, 450)
(392, 6)


In [26]:
def get_embedding_matrix(tokenizer, embedding_dim, vocab_size=None):
    word_index = tokenizer.word_index
    voc = list(word_index.keys())

    path_to_glove_file = '/kaggle/input/glove50/glove.6B.50d.txt'

    embeddings_index = {}
    with open(path_to_glove_file) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs

    print("Found %s word vectors." % len(embeddings_index))

    num_tokens = len(voc) + 2 if not vocab_size else vocab_size
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            #print("Word, i : ", word, i)
            hits += 1
        else:
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

    return embedding_matrix

embedding_dim = 50
x_embedding_matrix = get_embedding_matrix(x_tokenizer, embedding_dim, x_vocab_size)

Found 400000 word vectors.
Converted 10101 words (7296 misses)


In [27]:
#root mean squared error
from keras import backend as K 
def root_mean_squared_error(y):
    y_true = y[0]
    y_pred = y[1]
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

#tensor containing the RMSE for each column.
def mean_columnwise_root_mean_squared_error(y_true, y_pred):
    all_rmse = tf.map_fn(root_mean_squared_error, (y_true, y_pred), dtype=tf.float32)
    return K.mean(all_rmse)    

In [28]:
from tensorflow.keras import backend as backend
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MaxPooling1D, GlobalAveragePooling1D, Conv1D
from tensorflow.keras.layers import BatchNormalization, Concatenate, LSTM, Dense, Embedding, Input, dot, Dropout
import tensorflow as tf
backend.clear_session()
input_dim = len(x_train_padded[0])
essay_input= Input(shape = (input_dim, ))
features_input = Input(shape = (8,))
embedding_layer = Embedding(input_dim = x_vocab_size, 
                    output_dim = embedding_dim, 
                    embeddings_initializer = tf.keras.initializers.Constant(x_embedding_matrix),
                    input_length=max_text_len, trainable = False, mask_zero = True)(essay_input)

# Convolutional layers for the essays input
conv_1 = Conv1D(128, 2, activation = 'relu')(embedding_layer)
conv_2 = Conv1D(128, 2, activation = 'relu')(conv_1)

# Dense layers for the other features
dense_1 = Dense(32, activation = 'relu')(features_input)
# Softmax outputs probabilities of each of the 8 features falling into 4 different classes. 
dense_2 = Dense(32, activation = 'softmax')(dense_1)
normalization_1 = BatchNormalization()(dense_2)

# Dropout
dropout_1 = Dropout(0.2)(conv_2)

#conv_3 = Conv1D(128, 2, activation = 'relu')(dropout_1)
#conv_4 = Conv1D(128, 2, activation = 'relu')(conv_3)

# Dropout
#dropout_2 = Dropout(0.2)(conv_4)

# Pooling
pooling_layer = GlobalAveragePooling1D()(dropout_1)
normalization_2 = BatchNormalization()(pooling_layer)

# Concatenate
concat_layer = Concatenate()([normalization_1, normalization_2])

# Dense layer
dense_3 = Dense(64, activation = 'relu')(concat_layer)

# Output
output_layer = Dense(6)(dense_3)

model = Model(inputs=[essay_input, features_input], outputs=output_layer)
optimizer_obj = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer_obj, loss=mean_columnwise_root_mean_squared_error)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 450)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 450, 50)              869900    ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 449, 128)             12928     ['embedding[0][0]']           
                                                                                                  
 input_2 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                              

In [29]:
x_train_feats = np.asarray(x_train_feats).astype('float32')
x_val_feats = np.asarray(x_val_feats).astype('float32')
x_train_feats[:5]

array([[2.3260000e+03, 4.0100000e+02, 5.7860699e+00, 4.4000000e+01,
        2.3000000e+01, 1.0000000e+00, 2.0800000e+02, 6.7555222e+01],
       [4.2080000e+03, 7.4600000e+02, 5.6331997e+00, 8.6000000e+01,
        5.7000000e+01, 1.2000000e+01, 3.8500000e+02, 6.5173386e+01],
       [9.9800000e+02, 1.8900000e+02, 5.2526317e+00, 1.6000000e+01,
        1.0000000e+01, 2.0000000e+00, 9.1000000e+01, 7.2001427e+01],
       [1.1480000e+03, 2.3200000e+02, 4.9270387e+00, 1.8000000e+01,
        1.1000000e+01, 3.0000000e+00, 1.4300000e+02, 8.4014252e+01],
       [3.4840000e+03, 6.2600000e+02, 5.5566187e+00, 6.6000000e+01,
        5.6000000e+01, 9.0000000e+00, 3.4000000e+02, 7.0519150e+01]],
      dtype=float32)

In [30]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train_feats)
x_train_feats_scaled = scaler.transform(x_train_feats)
x_val_feats_scaled = scaler.transform(x_val_feats)
x_train_feats_scaled[:5]

array([[0.37638378, 0.3105939 , 0.044139  , 0.21568628, 0.11386138,
        0.00271003, 0.2689747 , 0.921026  ],
       [0.6920497 , 0.58747995, 0.03955369, 0.42156863, 0.28217822,
        0.03252032, 0.5046604 , 0.91484606],
       [0.15363972, 0.14044943, 0.02813861, 0.07843138, 0.04950495,
        0.00542005, 0.11318242, 0.9325621 ],
       [0.17879906, 0.17495987, 0.01837251, 0.0882353 , 0.05445544,
        0.00813008, 0.18242343, 0.9637305 ],
       [0.5706139 , 0.49117178, 0.03725666, 0.32352942, 0.27722773,
        0.02439024, 0.44474033, 0.9287162 ]], dtype=float32)

In [31]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
callbacks = [
    EarlyStopping(monitor='val_loss',
                  mode='min', verbose=1, patience=3, min_delta = 0.002),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, min_lr=0.000001, verbose=1),
]

In [32]:
history = model.fit(x = [x_train_padded, x_train_feats_scaled], y = y_train, epochs=20, validation_data=([x_val_padded, x_val_feats_scaled], y_val), batch_size = 64, callbacks = callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 13/20
Epoch 14/20
Epoch 14: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 14: early stopping


In [33]:
def clean_input(df):
    proc_text = [contractions.fix(text) for text in df['full_text']]
    # Very fast! Measure time later
    def more_processing(text):
        text = '<sostok> ' + text + ' <eostok>'
        text = text.lower()
        return text

    proc_text_2 = [more_processing(text) for text in proc_text]
    nlp = spacy.load("en_core_web_sm", disable=['senter', 'parser', 'ner'])
    print(nlp.pipe_names)

    pipeline = spacy_cleaner.Pipeline(
      nlp,
      removers.remove_stopword_token,
      removers.remove_punctuation_token,
      replacers.replace_number_token,
      mutators.mutate_lemma_token,
    )
    cleaned_text = pipeline.clean(proc_text_2, n_process = 8, batch_size = 12)
    return cleaned_text

In [34]:
def get_training_data(df, x_tokenizer, mode='fit'):
    X = []
    y = []

    x_sequence = x_tokenizer.texts_to_sequences(df['cleaned_text'])
    X = pad_sequences(x_sequence, maxlen=max_text_len, padding='pre')

    if mode == 'fit':
        for index, row in df.iterrows():
            labels = [row['cohesion'], row['syntax'], row['vocabulary'], row['phraseology'], row['grammar'], row['conventions']]
            y.append(labels)

    if mode == 'fit':
        return np.array(X), np.array(y)
    else:
        return np.array(X)


In [35]:
test_feat.head()

Unnamed: 0,text_id,full_text,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count,flesch_kincaid_score
0,0000C359D63E,when a person has no experience on a job their...,4224,835,5.052632,37,25,1,454,67.34613
1,000BAD50D026,Do you think students would benefit from being...,2167,386,5.599483,36,11,1,207,67.940871
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",2361,442,5.329571,33,11,1,244,62.349525


In [36]:
def get_features():
    eng_features_test_2 = test_feat.drop(['text_id', 'full_text'], axis = 1)
    scaled = scaler.transform(np.array(eng_features_test_2))
    return scaled
get_features()

array([[0.69473333, 0.65890851, 0.02213965, 0.18137256, 0.12376237,
        0.00271003, 0.59653793, 0.92048347],
       [0.34971487, 0.29855538, 0.03854237, 0.17647059, 0.05445544,
        0.00271003, 0.26764313, 0.92202658],
       [0.38225429, 0.3434992 , 0.0304464 , 0.16176471, 0.05445544,
        0.00271003, 0.31691077, 0.90751929]])

In [37]:
test_df.head()

Unnamed: 0,text_id,full_text,cleaned_text,text_id.1,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,stopword_count,flesch_kincaid_score
0,0000C359D63E,when a person has no experience on a job their...,< sostok > person experience job go good peopl...,0000C359D63E,4224,835,5.052632,37,25,1,454,67.34613
1,000BAD50D026,Do you think students would benefit from being...,< sostok > think student benefit able attend c...,000BAD50D026,2167,386,5.599483,36,11,1,207,67.940871
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",< sostok > thomas jefferson state wonderful ag...,00367BB2546B,2361,442,5.329571,33,11,1,244,62.349525


In [38]:
df_test = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')
sub = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv')
xtest_tokenizer = Tokenizer()
xtest = test_df['cleaned_text']
xtest_tokenizer.fit_on_texts(test_df['cleaned_text'])
max_text_len=450
X_test = get_training_data(test_df, xtest_tokenizer, None)
X_feats = get_features()

In [39]:
pred = model.predict([X_test, X_feats])

for index, row in df_test.iterrows():
    sub_index = sub[sub['text_id']==row['text_id']].index
    sub.iloc[sub_index, 1] = pred[sub_index,0]
    sub.iloc[sub_index, 2] = pred[sub_index,1]
    sub.iloc[sub_index, 3] = pred[sub_index,2]
    sub.iloc[sub_index, 4] = pred[sub_index,3]
    sub.iloc[sub_index, 5] = pred[sub_index,4]
    sub.iloc[sub_index, 6] = pred[sub_index,5]

sub['text_id'] = sub['text_id']
sub.to_csv('/kaggle/working/submission.csv', index=False)



In [40]:
df_sub = pd.read_csv('/kaggle/working/submission.csv')

In [41]:
df_sub.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.079875,2.776812,2.941681,2.962804,2.738084,2.888472
1,000BAD50D026,2.823738,2.534441,2.793427,2.68572,2.611121,2.716066
2,00367BB2546B,3.089237,2.888758,3.232103,3.082988,3.16405,3.179684
