In [None]:
!pip install "../input/textstat/Pyphen-0.10.0-py3-none-any.whl"
!pip install "../input/textstat/textstat-0.7.0-py3-none-any.whl"

In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import textstat
import matplotlib.pyplot as plt

from spacy.lang.en import English
from collections import defaultdict

In [None]:
# Shared spaCy language object; every feature extractor below tokenises through `nlp`.
nlp=English()
# Stop-word collection taken from the English language defaults.
# NOTE(review): `stop_words` is not referenced again in the visible cells (the extractors use `token.is_stop`).
stop_words=nlp.Defaults.stop_words

# Register a sentencizer component so that `doc.sents` can be used for sentence counting.
# NOTE(review): `add_pipe(create_pipe(...))` looks like the spaCy v2 calling convention — confirm
# against the spaCy version installed in this environment.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
print(nlp.pipe_names)

In [None]:
# Read the training split and keep only the identifier, the passage text and the label.
train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')[['id', 'excerpt', 'target']]
train.head()

In [None]:
class Vocab:
    """Token-level vocabulary over a collection of passages.

    After :meth:`build` has run:

    * ``vocab_freq``      maps token text -> total number of occurrences,
    * ``vocab_doc_freq``  maps token text -> number of passages containing it,
    * ``word2id`` / ``id2word`` map between token text and a dense integer id,
    * ``vocab_``          lists the token texts in id order.

    Tokenisation uses the module-level ``nlp`` object; token text is used
    verbatim (no lower-casing, no filtering).
    """
    def __init__(self, passages):
        self.passages=passages
        self.word2id={}
        self.id2word={}
        self.vocab_freq={}
        self.vocab_doc_freq={}
        self.vocab_=[]

    def build(self):
        """Populate all vocabulary tables from ``self.passages``.

        The tables are reset first, so calling ``build`` again (for example
        when a notebook cell is re-run) does not double the counts.
        """
        self.word2id={}
        self.id2word={}
        self.vocab_freq={}
        self.vocab_doc_freq={}
        self.vocab_=[]

        for passage in self.passages:
            seen_in_passage=set()
            for token in nlp(passage):
                word=token.text
                self.vocab_freq[word]=self.vocab_freq.get(word, 0)+1
                seen_in_passage.add(word)
            # Document frequency: each passage contributes at most once per word.
            for word in seen_in_passage:
                self.vocab_doc_freq[word]=self.vocab_doc_freq.get(word, 0)+1

        # Ids follow first-occurrence order (dicts preserve insertion order).
        for idx, word in enumerate(self.vocab_freq):
            self.word2id[word]=idx
            self.id2word[idx]=word
            self.vocab_.append(word)

    def filter_vocab_by_count(self, min_freq=5, max_freq=500000):
        """Return ``{word: freq}`` for words with ``min_freq <= freq <= max_freq``.

        Both bounds are inclusive, matching the parameter names: a word that
        occurs exactly ``min_freq`` (or ``max_freq``) times is kept. The
        vocabulary tables themselves are not modified.
        """
        return {
            word: freq
            for word, freq in self.vocab_freq.items()
            if min_freq <= freq <= max_freq
        }

Generate Vocabulary

In [None]:
%%time
# Build the corpus vocabulary from every training excerpt (each passage is tokenised inside `build`).
# NOTE(review): `vocab` is not referenced again in the visible cells.
vocab=Vocab(train.excerpt.values)
vocab.build()

In [None]:
# Exploratory probe: print spaCy's `is_punct` flag for each token of a made-up string.
for token in nlp('Hello 123dfasl _hel'):
    print(token.is_punct)

In [None]:
def get_sentence_count(excerpt):
    """Return the number of sentences `nlp` segments `excerpt` into."""
    doc = nlp(excerpt)
    return sum(1 for _sentence in doc.sents)

def get_word_count(excerpt):
    """Count the tokens of `excerpt` that are neither number-like nor punctuation."""
    return sum(
        1
        for token in nlp(excerpt)
        if not (token.like_num or token.is_punct)
    )
    
def get_unique_word_count(excerpt):
    """Count distinct lower-cased tokens, ignoring number-like and punctuation tokens."""
    distinct = {
        token.lower_
        for token in nlp(excerpt)
        if not (token.like_num or token.is_punct)
    }
    return len(distinct)

def get_word_count_without_stopword(excerpt):
    """Count tokens that are not number-like, not punctuation and not stop words."""
    content_tokens = [
        token
        for token in nlp(excerpt)
        if not token.like_num and not token.is_punct and not token.is_stop
    ]
    return len(content_tokens)

def get_distinct_word_count_without_stopword(excerpt):
    """Count distinct lower-cased tokens after dropping numbers, punctuation and stop words."""
    return len({
        token.lower_
        for token in nlp(excerpt)
        if not (token.like_num or token.is_punct or token.is_stop)
    })


def get_stopword_count(excerpt):
    """Count the tokens of `excerpt` that spaCy flags as stop words."""
    return sum(1 for token in nlp(excerpt) if token.is_stop)

def get_unique_stopword_count(excerpt):
    """Return the number of distinct stop words in `excerpt`.

    Stop words are compared by their lower-cased text, consistent with
    `get_unique_word_count`, so "The" and "the" count once.

    The set must hold strings rather than the spaCy tokens themselves:
    token objects are distinguished by their position in the document, so a
    set of tokens has one entry per occurrence and the result would simply
    equal `get_stopword_count`.
    """
    return len({token.lower_ for token in nlp(excerpt) if token.is_stop})
    
def get_punctuation_count(excerpt):
    """Count the punctuation tokens in `excerpt`."""
    punctuation = [token for token in nlp(excerpt) if token.is_punct]
    return len(punctuation)

def get_title_word_count(excerpt):
    """Count the tokens of `excerpt` that are in title case (`token.is_title`)."""
    return sum(1 for token in nlp(excerpt) if token.is_title)

def get_unique_title_word_count(excerpt):
    """Count the distinct title-case token texts in `excerpt`."""
    return len({token.text for token in nlp(excerpt) if token.is_title})

def get_capital_word_count(excerpt):
    """Count the tokens of `excerpt` that are entirely upper case (`token.is_upper`)."""
    upper_tokens = [token for token in nlp(excerpt) if token.is_upper]
    return len(upper_tokens)

def get_unique_capital_word_count(excerpt):
    """Return the number of distinct all-upper-case words in `excerpt`.

    Words are compared by their text, as in `get_unique_title_word_count`.
    Collecting the spaCy token objects instead would count every occurrence
    separately (tokens are distinguished by position, not by text), making
    the result identical to `get_capital_word_count`.
    """
    return len({token.text for token in nlp(excerpt) if token.is_upper})

def get_syllable_counts(excerpt):
    """Histogram of syllables per word for `excerpt`.

    Returns a `defaultdict(int)` mapping "syllables in a word" to "number of
    words", skipping number-like and punctuation tokens. Missing buckets read
    as 0, which the downstream column extraction relies on.
    """
    histogram = defaultdict(int)
    words = (
        token for token in nlp(excerpt)
        if not (token.like_num or token.is_punct)
    )
    for token in words:
        n_syllables = textstat.syllable_count(token.text)
        histogram[n_syllables] = histogram[n_syllables] + 1
    return histogram

def get_syllable_count_without_stop(excerpt):
    """Histogram of syllables per word, ignoring stop words.

    Same as `get_syllable_counts`, except that stop words are skipped in
    addition to number-like and punctuation tokens. Returns a
    `defaultdict(int)` so that absent buckets read as 0.
    """
    histogram = defaultdict(int)
    words = (
        token for token in nlp(excerpt)
        if not (token.like_num or token.is_punct or token.is_stop)
    )
    for token in words:
        n_syllables = textstat.syllable_count(token.text)
        histogram[n_syllables] = histogram[n_syllables] + 1
    return histogram

In [None]:
%%time

# Raw count features: every (column, extractor) pair below adds one column by
# applying the extractor to each excerpt. Order matches the column order of `train`.
_count_features = [
    ('sentence_count', get_sentence_count),
    ('word_count', get_word_count),
    ('distinct_word_count', get_unique_word_count),
    ('stopword_count', get_stopword_count),
    ('distinct_stopword_count', get_unique_stopword_count),
    ('word_count_without_stopword', get_word_count_without_stopword),
    ('distinct_word_count_without_stopword', get_distinct_word_count_without_stopword),
    ('punctuation_counts', get_punctuation_count),
    ('title_words_count', get_title_word_count),
    ('distinct_title_words_count', get_unique_title_word_count),
    ('capital_word_count', get_capital_word_count),
    ('distinct_capital_word_count', get_unique_capital_word_count),
]
for _column, _extractor in _count_features:
    train[_column] = train.excerpt.apply(_extractor)

In [None]:
%%time
# Ratio features derived from the raw counts.
# "Redundancy" = 1 - distinct / total: 0 when every word is different, close to 1 when words repeat a lot.
train['word_redundancy'] = 1 - train['distinct_word_count'] / train['word_count']
# NOTE(review): unlike the two neighbouring redundancy columns this is 1 - (stop words / words),
# i.e. one minus the stop-word share, not a distinct/total ratio — confirm this is intended.
train['stopword_redundancy'] = 1 - train['stopword_count'] / train['word_count']
train['word_redundancy_witout_stopwords'] = (
    1 - train['distinct_word_count_without_stopword'] / train['word_count_without_stopword']
)

# Title-case words relative to all words and to the number of sentences.
train['title_word_proportion'] = train['title_words_count'] / train['word_count']
train['title_word_per_sentence'] = train['title_words_count'] / train['sentence_count']

# Share of all-upper-case words.
train['capital_word_proportaion'] = train['capital_word_count'] / train['word_count']

# Density measures. A zero punctuation count yields inf in `words_per_punctuation`.
train['words_per_punctuation'] = train['word_count'] / train.punctuation_counts
train['words_per_sentence'] = train['word_count'] / train['sentence_count']

Syllables

In [None]:
%%time
# Per-excerpt histograms {syllables per word -> number of words}, with and without stop words.
train['syllable_freq'] = train.excerpt.apply(get_syllable_counts)
train['syllable_without_stop_freq'] = train.excerpt.apply(get_syllable_count_without_stop)

# (histogram column, infix for the derived column names, column used as denominator)
_syllable_sources = [
    ('syllable_freq', '', 'word_count'),
    ('syllable_without_stop_freq', '_no_stop', 'word_count_without_stopword'),
]
for _hist_col, _infix, _total_col in _syllable_sources:
    # Exact buckets for 0..4 syllables, each followed by its share of the word total.
    for _n in range(5):
        _name = '{}syllable{}'.format(_n, _infix)
        train[_name] = train[_hist_col].apply(lambda hist, n=_n: hist[n])
        train[_name + '_proportion'] = train[_name] / train[_total_col]

    # Open-ended bucket: all words that are not in one of the 0..4 buckets.
    _name = '>=5syllable{}'.format(_infix)
    train[_name] = train[_hist_col].apply(
        lambda hist: sum(hist.values()) - sum(hist[k] for k in range(5))
    )
    train[_name + '_proportion'] = train[_name] / train[_total_col]

train.head()

In [None]:
train.columns

In [None]:
train.word_redundancy.describe()

In [None]:
train.word_redundancy_witout_stopwords.describe()

In [None]:
plt.figure(figsize=(12, 5))
# Overlay both redundancy distributions on the same axes: (column, colour, legend label).
for _column, _colour, _label in [
    ('word_redundancy', 'red', "Word Redundancy"),
    ('word_redundancy_witout_stopwords', 'blue', "Word Redundance Without StopWords"),
]:
    sns.histplot(data=train, x=_column, stat='probability', color=_colour, label=_label)
plt.legend(loc='best')
plt.show()

In [None]:
# Stacked box plots on a shared x-axis so both redundancy measures use the same scale.
fig, ax=plt.subplots(2, 1, sharex=True)

sns.boxplot(data=train, x='word_redundancy', color='red', ax=ax[0])
sns.boxplot(data=train, x='word_redundancy_witout_stopwords', color='blue', ax=ax[1])

# plt.show() renders through whichever backend is active; Figure.show() needs a GUI
# backend and does not display anything under the inline notebook backend.
plt.show()

1. Stop words account for much of the redundancy in the excerpts.
2. A few excerpts have high redundancy (>0.6) even without stop words.
3. Excerpts that remain highly redundant after stop words are removed are, in most cases, probably easier to read, as they may be written to explain things to lower-grade students.

In [None]:
plt.figure(figsize=(12, 5))

# Both redundancy measures against the target on one set of axes: (column, colour, legend label).
for _column, _colour, _label in [
    ('word_redundancy', 'red', 'Word Redundancy'),
    ('word_redundancy_witout_stopwords', 'blue', 'Word Redundancy Without Stopwords'),
]:
    sns.scatterplot(data=train, x=_column, y='target', color=_colour, label=_label)

plt.legend(loc='best')
plt.show()

In [None]:
sns.scatterplot(data=train[train['word_redundancy_witout_stopwords']>0.45],
                x='word_redundancy_witout_stopwords', y='target', 
                color='blue', label='Word Redundancy Without Stopwords')


As can be seen, excerpts with a redundancy above 0.45 (stop words excluded) had a higher chance of being easy to read.

let us look at some of the samples

In [None]:
train[(train.word_redundancy_witout_stopwords>0.45) & (train.target>0)].excerpt.values[0]

In [None]:
train[(train.word_redundancy_witout_stopwords>0.45) & (train.target < -1.0)].excerpt.values[0]

> Redundant words appear in both the children's stories and the physics passages.

> Some words distinguish the passages by subject.

> The children's texts contain collective words (e.g. "their", "one of them", "our"), which may help in understanding the text.

In [None]:

plt.figure(figsize=(12, 5))
# Total vs. distinct word counts (stop words removed) on shared axes: (column, colour, legend label).
for _column, _colour, _label in [
    ('word_count_without_stopword', 'red', "Word Count Without StopWord"),
    ('distinct_word_count_without_stopword', 'blue', "Distinct Word Count Without StopWord"),
]:
    sns.histplot(data=train, x=_column, stat='probability', color=_colour, label=_label)
plt.legend(loc='best')
plt.show()


In [None]:
# Stacked box plots on a shared x-axis: total vs. distinct word counts without stop words.
fig, ax=plt.subplots(2, 1, sharex=True)

sns.boxplot(data=train, x='word_count_without_stopword', color='red', ax=ax[0])
sns.boxplot(data=train, x='distinct_word_count_without_stopword', color='blue', ax=ax[1])

# plt.show() renders through whichever backend is active; Figure.show() needs a GUI
# backend and does not display anything under the inline notebook backend.
plt.show()

# 

In [None]:
plt.figure(figsize=(12, 5))

# Every artist gets a label: without labelled artists plt.legend() has nothing to show
# and only emits a warning.
plt.hlines(y=-1, xmin=0, xmax=150, color='green', label='target = -1')
sns.scatterplot(data=train, x='word_count_without_stopword', y='target', color='red',
                label='Word Count Without StopWord')
sns.scatterplot(data=train, x='distinct_word_count_without_stopword', y='target', color='blue',
                label='Distinct Word Count Without StopWord')

# Two different columns share the x-axis, so use a neutral axis label.
plt.xlabel('count (stop words removed)')
plt.legend(loc='best')
plt.show()

In [None]:
sns.lineplot(data=train, x='distinct_word_count_without_stopword', y='target')

The distinct word count without stop words shows a downward trend against the target variable.

# lets check the capital words

In [None]:
# One figure per capital-word feature: (column, colour, legend label).
for _column, _colour, _label in [
    ('capital_word_count', 'red', "Capital Word Count"),
    ('capital_word_proportaion', 'blue', "Capital Word Proportions"),
]:
    plt.figure(figsize=(12, 5))
    sns.histplot(data=train, x=_column, stat='probability', color=_colour, label=_label)
    plt.legend(loc='best')
    plt.show()


In [None]:
train['capital_word_count'].describe()

In [None]:
train['capital_word_proportaion'].describe()

In [None]:
# One scatter figure per capital-word feature. Each plot carries a label so that
# plt.legend() has an entry to show (it only warns when no artist is labelled).
for _column, _colour, _label in [
    ('capital_word_count', 'red', 'Capital Word Count'),
    ('capital_word_proportaion', 'blue', 'Capital Word Proportion'),
]:
    plt.figure(figsize=(12, 5))
    sns.scatterplot(data=train, x=_column, y='target', color=_colour, label=_label)
    plt.legend(loc='best')
    plt.show()

In [None]:
sns.lineplot(data=train, x='capital_word_proportaion', y='target')

> Capital words in the passages do not seem to contribute much to the target variable.

> In the scatter and line plots, they look very random with respect to the target variable.

# lets check for the title words and sentence statistics

In [None]:
plt.figure(figsize=(12, 5))
sns.histplot(data=train, x='sentence_count', stat='probability', 
             color='red',  label="Sentence Count")
plt.legend(loc='best')
plt.show()

In [None]:
sns.scatterplot(data=train, x='sentence_count', y='target')

In [None]:
sns.lineplot(data=train, x='sentence_count', y='target')

In [None]:
# Correlation between sentence_count and target: overall, then within three sentence-count bands.
_, ax=plt.subplots(1, 4, figsize=(15, 5))

# (panel title, rows to correlate). Excerpts with <= 5 sentences only appear in the first panel.
_subsets = [
    ('all excerpts', train),
    ('5 < sentence_count <= 20', train[(train.sentence_count>5) & (train.sentence_count<=20)]),
    ('20 < sentence_count < 30', train[(train.sentence_count>20) & (train.sentence_count<30)]),
    ('sentence_count >= 30', train[train.sentence_count>=30]),
]
for _axis, (_title, _subset) in zip(ax, _subsets):
    sns.heatmap(_subset[['sentence_count', 'target']].corr(), annot=True, cbar=False, ax=_axis)
    # Titles identify the band; a legend call is pointless here because heatmaps
    # create no labelled artists.
    _axis.set_title(_title)

plt.show()

> Line plots are from mean estimators.

> Sentence counts have a positive trend with the target variable, and the confidence decreases after 15.

> Excerpts with lower sentence counts look harder to read in the line plot. I think that could be because of explainability.

> Sentence counts of >= 20 had a negative correlation with the target.

In [None]:
# One histogram figure per title-word feature: (column, colour, legend label).
for _column, _colour, _label in [
    ('title_words_count', 'red', "Title Word Count"),
    ('title_word_per_sentence', 'blue', "Title Word Per Sentence"),
]:
    plt.figure(figsize=(12, 5))
    sns.histplot(data=train, x=_column, stat='probability', color=_colour, label=_label)
    plt.legend(loc='best')
    plt.show()

In [None]:
sns.lineplot(data=train, x='title_words_count', y='target')

In [None]:
# Correlation between title_words_count and target: overall, then within three count bands.
_, ax=plt.subplots(1, 4, figsize=(15, 5))

# (panel title, rows to correlate). Excerpts with <= 5 title words only appear in the first panel.
_subsets = [
    ('all excerpts', train),
    ('5 < title_words_count <= 20', train[(train.title_words_count>5) & (train.title_words_count<=20)]),
    ('20 < title_words_count < 30', train[(train.title_words_count>20) & (train.title_words_count<30)]),
    ('title_words_count >= 30', train[train.title_words_count>=30]),
]
for _axis, (_title, _subset) in zip(ax, _subsets):
    sns.heatmap(_subset[['title_words_count', 'target']].corr(), annot=True, cbar=False, ax=_axis)
    # Titles identify the band; a legend call is pointless here because heatmaps
    # create no labelled artists.
    _axis.set_title(_title)

plt.show()

In [None]:
sns.lineplot(data=train, x='title_word_per_sentence', y='target')

# lets check for word counts

In [None]:
plt.figure(figsize=(12, 5))
sns.histplot(data=train, x='word_count', stat='probability', label="Word Count")
plt.legend(loc='best')
plt.show()

In [None]:
sns.scatterplot(data=train, x='word_count', y='target')

In [None]:
sns.lineplot(data=train, x='word_count', y='target')

In [None]:
train[['word_count', 'target']].corr()

1. Word counts do not appear to be directly correlated with the target.

# lets check the syllables

In [None]:
[colname for colname in train.columns if 'syllable' in colname]

In [None]:
train[['0syllable_no_stop','1syllable_no_stop','2syllable_no_stop',
       '3syllable_no_stop','4syllable_no_stop','>=5syllable_no_stop','target'
]].corr()

In [None]:
train[['0syllable_no_stop','1syllable_no_stop','2syllable_no_stop',
       '3syllable_no_stop','4syllable_no_stop','>=5syllable_no_stop','target'
]].describe()

In [None]:
plt.figure(figsize=(14, 5))
sns.boxplot(data=train[['0syllable_no_stop','1syllable_no_stop','2syllable_no_stop',
                        '3syllable_no_stop','4syllable_no_stop','>=5syllable_no_stop']])

plt.show()

In [None]:
sns.scatterplot(data=train[train['>=5syllable_no_stop'] >0 ], 
                x='>=5syllable_no_stop',
                y='target')


# Let's look at the syllable proportions

In [None]:
train[['0syllable_no_stop_proportion', '1syllable_no_stop_proportion',
       '2syllable_no_stop_proportion','3syllable_no_stop_proportion',
       '4syllable_no_stop_proportion','>=5syllable_no_stop_proportion',
       'target'
      ]].corr()


In [None]:
train[['0syllable_no_stop_proportion', '1syllable_no_stop_proportion',
       '2syllable_no_stop_proportion','3syllable_no_stop_proportion',
       '4syllable_no_stop_proportion','>=5syllable_no_stop_proportion']].describe()

In [None]:
print("Number Of Documents in which syllable count==0")
(train[['0syllable_no_stop_proportion', '1syllable_no_stop_proportion',
       '2syllable_no_stop_proportion','3syllable_no_stop_proportion',
       '4syllable_no_stop_proportion','>=5syllable_no_stop_proportion']]==0).sum()

In [None]:
plt.figure(figsize=(14, 5))
plt.xticks(rotation=30)
sns.boxplot(data=train[['0syllable_no_stop_proportion', '1syllable_no_stop_proportion',
       '2syllable_no_stop_proportion','3syllable_no_stop_proportion',
       '4syllable_no_stop_proportion','>=5syllable_no_stop_proportion']])

plt.show()

In [None]:
_, ax=plt.subplots(2, 3, figsize=(17, 5))

proportion_columns = [
    '0syllable_no_stop_proportion', '1syllable_no_stop_proportion',
    '2syllable_no_stop_proportion', '3syllable_no_stop_proportion',
    '4syllable_no_stop_proportion', '>=5syllable_no_stop_proportion',
]
# ax.ravel() walks the 2x3 grid row by row, so the k-th column lands on panel (k // 3, k % 3).
for panel, colname in zip(ax.ravel(), proportion_columns):
    # Only excerpts that contain at least one word of this syllable bucket.
    nonzero_rows = train[train[colname] > 0]
    sns.heatmap(nonzero_rows[[colname, 'target']].corr(), annot=True, ax=panel, cbar=False)
plt.show()

Words with 3, 4, and 5 or more syllables are negatively correlated with the target (ease of reading).

# Punctuations and words per sentence

In [None]:
sns.histplot(data=train, x='punctuation_counts', bins=100)

In [None]:
sns.scatterplot(data=train, x='punctuation_counts', y='target')

In [None]:
train[['punctuation_counts', 'target']].corr()

In [None]:
sns.histplot(train['words_per_sentence'], bins=100)

In [None]:
sns.scatterplot(data=train, x='words_per_sentence', y='target')


In [None]:
sns.heatmap(train[['words_per_sentence','target']].corr(), annot=True)

# Model Training

In [None]:
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn

In [None]:
# Feature columns fed to the model (29 names). The list contains the count and ratio
# features plus the stop-word-free syllable buckets; the capital-word columns and the
# syllable buckets that include stop words are computed above but not listed here.
train_columns=['sentence_count', 'word_count', 'distinct_word_count', 'words_per_sentence',
       'stopword_count', 'distinct_stopword_count', 'word_count_without_stopword', 
       'distinct_word_count_without_stopword', 
       'punctuation_counts', 'title_words_count', 'distinct_title_words_count',
       'word_redundancy', 'stopword_redundancy', 'word_redundancy_witout_stopwords',
       'title_word_proportion', 'title_word_per_sentence', 'words_per_punctuation',  
       '0syllable_no_stop', '0syllable_no_stop_proportion', 
       '1syllable_no_stop', '1syllable_no_stop_proportion', 
       '2syllable_no_stop', '2syllable_no_stop_proportion', 
       '3syllable_no_stop','3syllable_no_stop_proportion', 
       '4syllable_no_stop','4syllable_no_stop_proportion', 
       '>=5syllable_no_stop', '>=5syllable_no_stop_proportion'
]

print('Train Columns:')
print(train_columns)

In [None]:
# Modelling frame: identifier, the selected feature columns and the label.
# `.copy()` detaches it from `train`, so later changes to `train_df` leave `train` untouched.
train_df=train[['id']+train_columns+['target']].copy()
train_df.head()

In [None]:
def get_cv_ids(df=None, val_frac=0.1, seed=42):
    """Pick validation ids with a stratified random sample over target bins.

    The target axis is cut into six half-open unit-wide bins covering [-4, 2);
    within each bin the ids are shuffled and the first ``val_frac`` share
    (rounded down) is taken. Rows whose target falls outside [-4, 2) are never
    selected.

    Parameters
    ----------
    df : DataFrame with ``id`` and ``target`` columns, optional
        Defaults to the module-level ``train_df``. The frame is not modified.
    val_frac : float
        Fraction of each bin that goes to validation (default 0.1).
    seed : int
        Seed of the private random generator, so the split is reproducible
        and the global NumPy random state is left alone.

    Returns
    -------
    list
        The selected ids, bin by bin.
    """
    if df is None:
        df = train_df

    # The previous implementation called np.random.choice() and discarded the result,
    # so the "random" sample was always the first 10% of each bin in file order
    # (and an empty bin raised). Shuffle the ids explicitly instead.
    rng = np.random.RandomState(seed)

    cv_ids = []
    ranges = [(-4, -3.0), (-3.0, -2.0), (-2.0, -1.0), (-1.0, 0.0), (0.0, 1.0), (1.0, 2)]
    for low, high in ranges:
        bin_ids = df.loc[(df.target >= low) & (df.target < high), 'id'].tolist()
        rng.shuffle(bin_ids)
        cv_ids += bin_ids[:int(len(bin_ids) * val_frac)]
    return cv_ids
cv_ids = get_cv_ids()

# Boolean mask over the current `train_df`: True for rows selected for validation.
# Note that `train_df` is overwritten below, so this cell is meant to run once per kernel session.
_is_val = train_df.id.isin(cv_ids)
val_df = train_df[_is_val].copy()
train_df = train_df[~_is_val].copy()

print('Number Of Validation Records:', len(val_df))
print('Number Of Train Records:', len(train_df))

# Visual check that the validation targets cover the whole target range.
val_df.target.hist(bins=100)
plt.show()

In [None]:
%%time
# Build the test-set features with exactly the same recipe as the training frame.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

# --- raw counts: one column per (name, extractor) pair ---
for _column, _extractor in [
    ('sentence_count', get_sentence_count),
    ('word_count', get_word_count),
    ('distinct_word_count', get_unique_word_count),
    ('stopword_count', get_stopword_count),
    ('distinct_stopword_count', get_unique_stopword_count),
    ('word_count_without_stopword', get_word_count_without_stopword),
    ('distinct_word_count_without_stopword', get_distinct_word_count_without_stopword),
    ('punctuation_counts', get_punctuation_count),
    ('title_words_count', get_title_word_count),
    ('distinct_title_words_count', get_unique_title_word_count),
    ('capital_word_count', get_capital_word_count),
    ('distinct_capital_word_count', get_unique_capital_word_count),
]:
    test_df[_column] = test_df.excerpt.apply(_extractor)

# --- ratios derived from the counts ---
test_df['word_redundancy'] = 1 - test_df['distinct_word_count'] / test_df['word_count']
test_df['stopword_redundancy'] = 1 - test_df['stopword_count'] / test_df['word_count']
test_df['word_redundancy_witout_stopwords'] = (
    1 - test_df['distinct_word_count_without_stopword'] / test_df['word_count_without_stopword']
)

test_df['title_word_proportion'] = test_df['title_words_count'] / test_df['word_count']
test_df['title_word_per_sentence'] = test_df['title_words_count'] / test_df['sentence_count']

test_df['capital_word_proportaion'] = test_df['capital_word_count'] / test_df['word_count']

# A zero punctuation count yields inf in `words_per_punctuation`.
test_df['words_per_punctuation'] = test_df['word_count'] / test_df.punctuation_counts
test_df['words_per_sentence'] = test_df['word_count'] / test_df['sentence_count']

# --- syllable histograms and the bucket columns read from them ---
test_df['syllable_freq'] = test_df.excerpt.apply(get_syllable_counts)
test_df['syllable_without_stop_freq'] = test_df.excerpt.apply(get_syllable_count_without_stop)

# (histogram column, infix for the derived column names, column used as denominator)
for _hist_col, _infix, _total_col in [
    ('syllable_freq', '', 'word_count'),
    ('syllable_without_stop_freq', '_no_stop', 'word_count_without_stopword'),
]:
    # Exact buckets for 0..4 syllables, each followed by its share of the word total.
    for _n in range(5):
        _name = '{}syllable{}'.format(_n, _infix)
        test_df[_name] = test_df[_hist_col].apply(lambda hist, n=_n: hist[n])
        test_df[_name + '_proportion'] = test_df[_name] / test_df[_total_col]

    # Open-ended bucket: all words that are not in one of the 0..4 buckets.
    _name = '>=5syllable{}'.format(_infix)
    test_df[_name] = test_df[_hist_col].apply(
        lambda hist: sum(hist.values()) - sum(hist[k] for k in range(5))
    )
    test_df[_name + '_proportion'] = test_df[_name] / test_df[_total_col]

# Keep only the identifier and the model inputs, in training column order.
test_df = test_df[['id'] + train_columns].copy()
test_df.head()

In [None]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

In [None]:
# Fit the feature scaler on `train_df` only (the validation rows were split off above when
# cells run in order); the same fitted `scaler` is reused by the Dataset class for every split.
scaler=StandardScaler()
# NOTE(review): `X_train` is only used for the shape print in the visible cells.
X_train=scaler.fit_transform(train_df[train_columns])
print(X_train.shape)

In [None]:
# Statistics of the training targets. The Dataset class uses them to standardise the
# labels, and `infer` uses them to map predictions back to the original target scale.
target_mean=train_df.target.mean()
target_std=train_df.target.std()

print('Target Mean:', target_mean)
print("Target Std:", target_std)

# Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over the engineered feature frame.

    Features are the ``train_columns`` of ``df`` passed through the
    module-level, already fitted ``scaler``. For the ``'train'`` and ``'val'``
    phases each item is ``(X, y)`` where ``y`` is the target standardised with
    ``target_mean`` / ``target_std``; for any other phase each item is ``X`` alone.

    The scaling is done once for the whole frame at construction time instead
    of calling ``scaler.transform`` on a single row for every item, which
    repeated the same work on each access. The tensors are therefore a
    snapshot: build the dataset after ``scaler`` is fitted and after ``df``
    has its final feature values.
    """
    def __init__(self, df, phase):
        self.df=df
        self.phase=phase

        if len(df) == 0:
            # scaler.transform rejects empty input; keep an empty dataset constructible.
            self.X=torch.empty((0, len(train_columns)), dtype=torch.float32)
        else:
            # Pass the DataFrame slice (not a bare array) so the scaler sees the
            # same column names it was fitted with.
            self.X=torch.tensor(scaler.transform(df[train_columns]), dtype=torch.float32)

        self.y=None
        if (phase=='train') or (phase=='val'):
            standardised=((df.target - target_mean)/target_std).values
            self.y=torch.tensor(standardised, dtype=torch.float32)

    def __getitem__(self, idx):
        # X[idx] has shape (n_features,); y[idx] is a 0-dim tensor.
        if self.y is not None:
            return (self.X[idx], self.y[idx])
        return self.X[idx]

    def __len__(self):
        return len(self.df)

In [None]:
# One dataset per split; the phase string decides whether items carry a target.
train_dataset=Dataset(train_df, 'train')
val_dataset=Dataset(val_df, 'val')
test_dataset=Dataset(test_df, 'test')


# Only the training loader shuffles; validation and test keep row order so that
# predictions line up with the rows of their frames.
train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=512, shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_dataset, batch_size=512, shuffle=False)
test_dataloader=torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False)

# Model 

In [None]:
class Model(nn.Module):
    """Two-hidden-layer feed-forward regressor.

    Each hidden block is Linear -> BatchNorm1d -> Dropout -> ReLU
    (widths 128 and 64, dropout 0.4 and 0.5), followed by a linear
    output layer of size `out_feat`.
    """
    def __init__(self, in_feat, out_feat):
        super().__init__()
        # First hidden block: in_feat -> 128
        self.linear1 = nn.Linear(in_feat, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.4)
        self.relu1 = nn.ReLU()

        # Second hidden block: 128 -> 64
        self.linear2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.5)
        self.relu2 = nn.ReLU()

        # Output head: 64 -> out_feat
        self.out = nn.Linear(64, out_feat)

    def forward(self, x):
        """Map a (batch, in_feat) tensor to a (batch, out_feat) tensor."""
        hidden_blocks = (
            (self.linear1, self.bn1, self.dropout1, self.relu1),
            (self.linear2, self.bn2, self.dropout2, self.relu2),
        )
        for block in hidden_blocks:
            for layer in block:
                x = layer(x)
        return self.out(x)

In [None]:
def train_epoch(model, criterion, optimizer, train_dataloader):
    """Run one optimisation pass over `train_dataloader`.

    Parameters
    ----------
    model : nn.Module
        Put into training mode; its output is flattened to one value per row.
    criterion : callable
        Loss taking (predictions, targets).
    optimizer : torch.optim.Optimizer
        Stepped once per batch.
    train_dataloader : iterable of (X, y) batches supporting len().

    Returns
    -------
    float
        Mean of the per-batch losses.

    Raises
    ------
    ZeroDivisionError
        If `train_dataloader` yields no batches.
    """
    epoch_loss=0.0
    model.train()
    for (X, y) in train_dataloader:
        # Clear the gradients of the previous step: backward() accumulates into .grad,
        # so without this every update would use the sum of all gradients seen so far.
        optimizer.zero_grad()
        y_hat=model(X).view(-1)
        loss=criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        epoch_loss+=loss.item()
    epoch_loss/=len(train_dataloader)
    return epoch_loss

def infer(models, dataloader):
    """Average the predictions of `models` over every batch of `dataloader`.

    Each model is switched to eval mode and run without gradient tracking;
    its flattened output is mapped back to the original target scale with the
    module-level `target_std` / `target_mean` before averaging. Returns a flat
    list with one value per input row, in loader order.
    """
    predictions = []
    for batch in dataloader:
        ensemble_sum = torch.zeros(batch.shape[0])
        with torch.no_grad():
            for member in models:
                member.eval()
                ensemble_sum += ((member(batch).view(-1) * target_std) + target_mean)
        ensemble_sum /= len(models)
        predictions.extend(ensemble_sum.numpy())
    return predictions

def evaluate(model, criterion, dataloader):
    """Return the mean per-batch loss of `model` over `dataloader`.

    The model is put into eval mode and run without gradient tracking; its
    output is flattened to one value per row before being compared with the
    targets by `criterion`.
    """
    model.eval()
    batch_losses = []
    for features, targets in dataloader:
        predictions = torch.zeros(features.shape[0])
        with torch.no_grad():
            predictions += model(features).view(-1)
        batch_losses.append(criterion(predictions, targets).item())
    return sum(batch_losses, 0.0) / len(dataloader)

In [None]:
# Train a small ensemble: five independently initialised networks, each restored to the
# epoch with the lowest validation loss before it is added to `models`.
models=[]
epochs=20
for i in range(5):
    # Size the input layer from the feature list instead of a hard-coded 29, so the
    # network keeps matching `train_columns` when features are added or removed.
    model=Model(len(train_columns), 1)
    optimizer=torch.optim.AdamW(model.parameters(), weight_decay=1e-4)
    criterion=nn.MSELoss(reduction='mean')
    best_loss=None
    
    for j in range(epochs):
        epoch_loss=train_epoch(model, criterion, optimizer, train_dataloader)
        val_loss=evaluate(model,criterion, val_dataloader)
        
        # Checkpoint whenever the validation loss improves (always on the first epoch).
        if (best_loss is None) or (val_loss < best_loss):
            best_loss=val_loss
            torch.save(model, 'model_{}.pt'.format(i+1))
        if (j+1)%5==0:
            print('Epoch:{} | Train Loss:{:.4f} | Val Loss:{:.4f}'.format(j+1, epoch_loss, val_loss))
    
    print("Best Loss")
    print(best_loss)
    print("==="*10)
    # Reload the best checkpoint rather than keeping the last-epoch weights.
    # NOTE(review): this loads a fully pickled module; recent PyTorch releases may
    # require an explicit `weights_only=False` here — confirm against the installed version.
    model=torch.load( 'model_{}.pt'.format(i+1))
    models.append(model)

In [None]:
# Rebuild the test loader (unshuffled, so predictions stay aligned with the rows of
# `test_df`) and average the ensemble's predictions.
test_dataset=Dataset(test_df, 'test')
test_dataloader=torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=512)
test_preds=infer(models, test_dataloader)
# `infer` already maps predictions back to the original target scale.
test_df['target']=test_preds

In [None]:
# Submission file: one row per test excerpt with its id and predicted target, no index column.
submission_df=test_df[['id', 'target']].copy()
submission_df.to_csv('submission.csv', index=False)

In [None]:
submission_df