In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# Pick the compute device: use the GPU when CUDA is available, otherwise fall
# back to the CPU so the notebook still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input directory and print the full path of every
# file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        print(*(os.path.join(dirname, filename) for filename in filenames), sep='\n')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:#001f3f; font-family:'Brush Script MT',cursive;color:#FFD700;font-size:200%; text-align:center;border-radius: 50% 20% / 10% 40%">Robert Frost</h1>

"Robert Lee Frost (March 26, 1874 – January 29, 1963) was an American poet. His work was initially published in England before it was published in the United States. Known for his realistic depictions of rural life and his command of American colloquial speech, Frost frequently wrote about settings from rural life in New England in the early 20th century, using them to examine complex social and philosophical themes."

"Frost was honored frequently during his lifetime and is the only poet to receive four Pulitzer Prizes for Poetry. He became one of America's rare "public literary figures, almost an artistic institution." He was awarded the Congressional Gold Medal in 1960 for his poetic works. On July 22, 1961, Frost was named poet laureate of Vermont."

https://en.wikipedia.org/wiki/Robert_Frost

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ2HE-uDLvOoiRdNw810jMvODPrc24976bALg&usqp=CAU)slideshare.net

In [None]:

# Load the Robert Frost collection CSV, report its (rows, columns) shape and
# show the first rows with a gold background and purple text.
df = pd.read_csv(
    "/kaggle/input/robert-frost-collection/robert_frost_collection.csv",
    low_memory=False,
)
print(df.shape)

preview_style = {'background-color': 'gold', 'color': 'purple'}
df.head().style.set_properties(**preview_style)

In [None]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
missing_per_column

![](https://i.redd.it/m721qo3ywsk31.jpg)reddit.com

In [None]:
# Names of the object-dtype (categorical/text) columns that contain at least
# one missing value.
categorical_nan = [
    column
    for column in df.columns
    if df[column].dtypes == 'O' and df[column].isna().any()
]
print(categorical_nan)

In [None]:
# Replace the missing values of every categorical column found above with the
# placeholder string 'None'; other columns are left untouched.
df = df.fillna({column: 'None' for column in categorical_nan})

In [None]:
# Re-count missing values in the categorical columns to confirm the fill.
remaining_categorical_nan = df[categorical_nan].isna().sum()
remaining_categorical_nan

In [None]:
# Numerical (non-object dtype) features that contain at least one missing value.
# The threshold is `> 0` (not `> 1`) so a column with exactly one NaN is not
# silently skipped, matching the check used for the categorical features.
numerical_nan = [
    feature
    for feature in df.columns
    if df[feature].isna().sum() > 0 and df[feature].dtypes != 'O'
]
numerical_nan

In [None]:
# Count the missing values in each numerical column flagged above.
missing_per_numeric_column = df[numerical_nan].isna().sum()
missing_per_numeric_column

In [None]:
## Replacing the numerical missing values

for feature in numerical_nan:
    # Use the median rather than the mean because it is robust to outliers.
    median_value = df[feature].median()

    # Assign the filled column back to the frame. Calling
    # `df[feature].fillna(..., inplace=True)` works on a temporary selection
    # (chained assignment): it is deprecated and does not update `df` when
    # pandas Copy-on-Write is enabled.
    df[feature] = df[feature].fillna(median_value)

# Confirm that no missing values remain in the numerical columns.
df[numerical_nan].isnull().sum()

In [None]:
# Switch to short, lower-case column names for the rest of the notebook.
column_aliases = {
    'Name': 'name',
    'Content': 'content',
    'Collection': 'collection',
    'Year of Publication': 'year',
}
df = df.rename(columns=column_aliases)

#Code by Debanjan Sarkar  https://www.kaggle.com/deb009/commonlit-readability-prize-using-bert

In [None]:
# Split the poems (content) and their publication years into a training part
# and a 30% validation part; the fixed seed keeps the split reproducible.
df_content, valid_content, df_year, valid_year = train_test_split(
    df['content'],
    df['year'],
    test_size=0.3,
    random_state=2018,
)

## Import the BERT model and BERT tokenizer

In [None]:
# The pretrained encoder and its tokenizer are loaded from the same checkpoint.
bert_checkpoint = 'bert-base-uncased'

# BERT-base pretrained model
bert = AutoModel.from_pretrained(bert_checkpoint)

# Matching fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)

In [None]:
# Positional lookup is zero-based: row index 3 is the fourth row and column
# index 1 is the second column.

df.iat[3, 1]

To see how the BERT tokenizer works, we will try to encode a couple of sentences with it.

In [None]:
# Sample data: one poem supplied as a single-element batch.
text_list = [
    "Before man came to blow it right The wind once blew itself untaught, "
    "And did its loudest day and night In any rough place where it caught. "
    "Man came to tell it what was wrong: It hadn’t found the place to blow; "
    "It blew too hard the aim was song. And listen how it ought to go! "
    "He took a little in his mouth, And held it long enough for north "
    "To be converted into south, And then by measure blew it forth. "
    "By measure. It was word and note, The wind the wind had meant to be "
    "A little through the lips and throat. The aim was song the wind could see"
]

print(text_list)

# Encode the batch, padding to the longest sequence in it.
sent_id = tokenizer.batch_encode_plus(
    text_list,
    padding=True,
)

# Show the encoded output.
print(sent_id)

I was not sure whether the rename worked, since I was getting errors about the column names.

In [None]:
# Show the current column names as a plain list.
list(df.columns)

In [None]:
# Cast every entry of `content` to str so that later `.split()` calls cannot
# fail with "'float' object has no attribute 'split'". `astype` returns a new
# Series, so the result has to be assigned back to take effect.
df['content'] = df['content'].astype(str)
df['content']

## Tokenize the sentences

The original snippet was:

`seq_len = [len(i.split()) for i in train['excerpt']]`

which resulted in `AttributeError: 'float' object has no attribute 'split'`.

I then used `str(i).split()` instead of `i.split()`, following:

https://github.com/lingualytics/py-lingualytics/issues/1 By argoniteXD 

In [None]:
# str() workaround from https://github.com/lingualytics/py-lingualytics/issues/1 By argoniteXD

# Whitespace-separated word count of every entry in df['content']; each entry
# is passed through str() first so non-string values do not break .split().
seq_len = df['content'].map(lambda text: len(str(text).split())).tolist()

# Distribution of the word counts.
pd.Series(seq_len).hist(bins=30)

We will set the padding length to 2000 (see the chart above): very few texts reach 3000 words.

In [None]:
# The word-count histogram suggests padding to 2000, but BERT only has position
# embeddings for `tokenizer.model_max_length` tokens (512 for bert-base-uncased).
# Longer sequences cannot be fed to the model, so the requested length is
# capped at that limit.
max_seq_len = min(2000, tokenizer.model_max_length)

# Shared encoding options. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`; both pad every sequence to `max_length`.
encode_kwargs = dict(
    max_length=max_seq_len,
    padding='max_length',
    truncation=True,
)

# tokenize and encode sequences in the training set
tokens_df = tokenizer.batch_encode_plus(df_content.tolist(), **encode_kwargs)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(valid_content.tolist(), **encode_kwargs)

#Converting the integer sequences to tensors.

In [None]:
## Turn the encoded lists into tensors

tensor_keys = ('input_ids', 'attention_mask')

# Training split: token ids, attention mask and target years.
df_seq, df_mask = (torch.tensor(tokens_df[key]) for key in tensor_keys)
df_y = torch.tensor(df_year.tolist())

# Validation split: same three tensors.
val_seq, val_mask = (torch.tensor(tokens_val[key]) for key in tensor_keys)
val_y = torch.tensor(valid_year.tolist())

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of samples per batch for both loaders.
batch_size = 32

# --- Training split ---
# Bundle ids, masks and targets, and draw the samples in random order.
df_data = TensorDataset(df_seq, df_mask, df_y)
df_sampler = RandomSampler(df_data)
df_dataloader = DataLoader(df_data, batch_size=batch_size, sampler=df_sampler)

# --- Validation split ---
# Same bundling, but the samples are visited sequentially (no shuffling).
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

At this point the author (Debanjan Sarkar) wrote "WIP", and I don't know whether he will keep coding or not.

![](http://2.bp.blogspot.com/-iPpPd6ErybU/U1dKbD4LksI/AAAAAAAAAF4/8JKu956uzqE/s1600/gold.gif)outsiders2014.blogspot.com

In [None]:
# Sample data: "Nothing Gold Can Stay" supplied as a single-element batch.
text_gold = [
    "Nature’s first green is gold, Her hardest hue to hold. "
    "Her early leaf’s a flower; But only so an hour. "
    "Then leaf subsides to leaf. So Eden sank to grief, "
    "So dawn goes down to day. Nothing gold can stay."
]

print(text_gold)

# Encode the batch, padding to the longest sequence in it.
sent_gold = tokenizer.batch_encode_plus(
    text_gold,
    padding=True,
)

# Show the encoded output.
print(sent_gold)

![](https://www.biography.com/.image/c_fit%2Ccs_srgb%2Cfl_progressive%2Cq_auto:good%2Cw_620/MTY2Nzk4OTY5ODAyODU5NjQx/robertfrost_facts_desktop.jpg)biography.com

In [None]:
# Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display, HTML

# Two presets each for colour, font family and font size (px); the first of
# each pair is the default used by dhtml.
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display `string` as an <h1> heading styled with a Google web font.

    Args:
        string: text placed inside the heading.
        fontcolor: CSS colour of the heading text.
        font: Google Fonts family name; it is used both in the @import URL
            and as the heading's font-family.
        fontsize: font size in pixels.
    """
    # Stylesheet import that requests the font with the "3d-float" effect.
    font_import = (
        "<style>\n"
        "    @import 'https://fonts.googleapis.com/css?family="
        + font
        + "&effect=3d-float';</style>\n"
    )
    # Heading markup; `%` binds tighter than `+`, so only the last fragment
    # is formatted with `string`.
    heading = (
        "    <h1 class='font-effect-3d-float' style='font-family:"
        + font
        + "; color:" + fontcolor
        + "; font-size:" + str(fontsize)
        + "px;'>%s</h1>" % string
    )
    display(HTML(font_import + heading))


dhtml('In 3 words, I can sum up everything I ve learned about life: it goes on.')