In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, names in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 20px; background: #001f3f;"><i><b style="color:orange;">The World as Will and Representation</b></i></h1></center>


"The World as Will and Representation (German: Die Welt als Wille und Vorstellung) is the central work of the German philosopher Arthur Schopenhauer. The first edition was published in late 1818, with the date 1819 on the title-page. A second, two-volume edition appeared in 1844: volume one was an edited version of the 1818 edition, while volume two consisted of commentary on the ideas expounded in volume one. A third expanded edition was published in 1859, the year prior to Schopenhauer's death. In 1948, an abridged version was edited by Thomas Mann."

"In the summer of 1813, Schopenhauer submitted his doctoral dissertation—On the Fourfold Root of the Principle of Sufficient Reason—and was awarded a doctorate from the University of Jena. After spending the following winter in Weimar, he lived in Dresden and published his treatise On Vision and Colours in 1816. Schopenhauer spent the next several years working on his chief work, The World as Will and Representation. Schopenhauer asserted that the work is meant to convey a "single thought" from various perspectives. He develops his philosophy over four books covering epistemology, ontology, aesthetics, and ethics. Following these books is an appendix containing Schopenhauer’s detailed Criticism of the Kantian Philosophy."

https://en.wikipedia.org/wiki/The_World_as_Will_and_Representation

![](https://philosophymaps.files.wordpress.com/2014/10/schopenhauer.png)philosophymaps.wordpress.com

#Code by Shubham Kumar https://www.kaggle.com/eiann1509/commonlit-fine-tuning-with-roberta-base

In [None]:
import tensorflow as tf
import transformers
import missingno as msno
import re
import spacy
import nltk
from wordcloud import WordCloud

In [None]:
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import RootMeanSquaredError
from nltk.tokenize import sent_tokenize, word_tokenize 
from transformers import TFAutoModel,AutoTokenizer,TFAutoModelForSequenceClassification
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from collections import Counter

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Use seaborn's 'husl' palette as the default color cycle for later plots.
sns.set_palette('husl')

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 20px; background: #001f3f;"><i><b style="color:orange;">The world as Representation is The ‘objectification’ of the Will</b></i></h1></center>

"Schopenhauer identifies the thing-in-itself — the inner essence of everything — as will: a blind, unconscious, aimless striving devoid of knowledge, outside of space and time, and free of all multiplicity. The world as representation is, therefore, the ‘objectification’ of the will. Aesthetic experiences release a person briefly from his endless servitude to the will, which is the root of suffering. True redemption from life, Schopenhauer asserts, can only result from the total ascetic negation of the ‘will to life.’ Schopenhauer notes fundamental agreements between his philosophy, Platonism, and the philosophy of the ancient Indian Vedas."

https://en.wikipedia.org/wiki/The_World_as_Will_and_Representation

In [None]:
# Load the corpus CSV from the Kaggle input directory and preview its last rows.
df = pd.read_csv("/kaggle/input/schopenhauer-work-corpus/Schopenhauer_works_corpus.csv")
df.tail()

In [None]:
# Keep only the rows whose book_title is exactly 'The World As Will And Idea (Vol. 1 of 3)',
# renumber the index from 0, and preview the first rows.
# NOTE(review): `world` is not referenced by any later visible cell; the analysis below uses the full `df`.
world = df[(df['book_title']=='The World As Will And Idea (Vol. 1 of 3)')].reset_index(drop=True)
world.head()

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 20px; background: #001f3f;"><i><b style="color:orange;">The pinnacle of Schopenhauer's philosophical thought</b></i></h1></center>

"The World as Will and Representation marked the pinnacle of Schopenhauer's philosophical thought; he spent the rest of his life refining, clarifying, and deepening the ideas presented in this work without any fundamental changes. The first edition was met with near-universal silence. The second edition of 1844 similarly failed to attract any interest. At the time, post-Kantian German academic philosophy was dominated by the German Idealists—foremost among them G. W. F. Hegel, whom Schopenhauer bitterly denounced as a "charlatan". It was not until the publication of his Parerga and Paralipomena in 1851 that Schopenhauer began to see the start of the recognition that eluded him for so long."

https://en.wikipedia.org/wiki/The_World_as_Will_and_Representation

In [None]:
# Row at position 8 and column at position 4 (both zero-based, i.e. the 9th row and 5th column).
# The column is presumably text_clean — confirm against df.columns.

df.iloc[8,4]

In [None]:
# Report the number of rows in the full corpus.
print(f'length of df : {len(df)}')

In [None]:
#Code by Shubham Kumar https://www.kaggle.com/eiann1509/commonlit-fine-tuning-with-roberta-base 

def my_plot(df,row):
  """Draw a KDE plot and a violin plot for the first `row` hard-coded features.

  Every figure row shows one feature: the left panel is its kernel density
  estimate with a dotted vertical line at the mean, the right panel is its
  violin plot.

  Args:
    df: DataFrame containing the columns 'publishing_date' and 'Unnamed: 0'.
    row: Number of features (figure rows) to draw. At most 2, because only
      two feature names are defined below.

  Raises:
    IndexError: if `row` exceeds the number of defined features.
  """
  feat=['publishing_date','Unnamed: 0']
  plt.rcParams['figure.figsize'] = (15,5)
  # squeeze=False keeps `axes` two-dimensional even when row == 1, so the
  # axes[i, j] indexing below works for every valid row count.
  fig,axes=plt.subplots(row,2,squeeze=False)
  plt.subplots_adjust(top = 1.95)
  for i in range(row):
    name=feat[i]
    values=df[name]
    kde_ax,violin_ax=axes[i,0],axes[i,1]

    kde_ax.axvline(values.mean(), linestyle=':', linewidth=2)
    # Pass the series as `x=` explicitly: the meaning of seaborn's first
    # positional argument differs between releases (x vs. data).
    sns.kdeplot(x=values,color='red',ax=kde_ax)
    kde_ax.set_title(name)

    sns.violinplot(x=values,color='red',ax=violin_ax)
    violin_ax.set_title(name)

In [None]:
import warnings
# Additionally ignore FutureWarning; an earlier cell already calls warnings.filterwarnings("ignore").
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Draw the KDE/violin pair for both hard-coded features (two figure rows).
my_plot(df,2)

In [None]:
# Bar chart of how many rows carry each publishing date (missing dates are dropped).
plt.figure(figsize=(20,10))
plt.xticks(rotation=90)
# Pass the values as `x=` explicitly: the meaning of seaborn's first positional
# argument differs between releases (x vs. data).
sns.countplot(x=df['publishing_date'].dropna());

In [None]:
# Keep only the cleaned text and the publication date, then print the date range.
# NOTE(review): this rebinds `df`, so earlier cells that read other columns (e.g. 'Unnamed: 0') cannot be re-run after this one.
df=df[['text_clean','publishing_date']]                 
print(f'range of publishing_date values : ({df.publishing_date.min()},{df.publishing_date.max()})')

In [None]:
# Rename text_clean to text, then download NLTK's stop-word corpus and load the English list.
df=df.rename(columns={'text_clean':'text'})
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")

In [None]:
# Call the method: without the parentheses the cell only shows the bound method object,
# not the skewness of the publishing_date distribution.
df['publishing_date'].skew()

#Word frequency of text_clean, visualized as a word cloud

In [None]:
# Feed the word cloud the text of every row. str(df['text']) would only give the
# Series' truncated printout (a handful of rows plus index and dtype footer).
corpus_text=' '.join(df['text'].dropna().astype(str))
cloud=WordCloud(background_color = 'black',stopwords=stopwords,max_words=200,max_font_size = 40,scale=3).generate(corpus_text)

title='word count'
fig = plt.figure(figsize=(15,15))
fig.subplots_adjust(top = 2.25)
fig.suptitle(title, fontsize = 20)
plt.imshow(cloud);

In [None]:
#Be careful: the dataset's last column is also named text_clean, so this function is capitalized (Text_clean) to avoid ambiguity

def Text_clean(text):
  """Replace URLs and every non-letter character in `text` with a space.

  A URL (starting with http://, https:// or www.) is replaced as a whole by
  one space; every other character outside a-z/A-Z is replaced individually,
  so runs of digits or punctuation become runs of spaces.

  Args:
    text: The string to clean.

  Returns:
    The cleaned string; whitespace is not collapsed or stripped here.
  """
  # Raw string: "\S" and "\." are regex escapes, not Python string escapes, and
  # trigger an invalid-escape warning when written in a normal string literal.
  pattern=re.compile(r"[^a-zA-Z]|https?://\S+|www\.\S+")
  return pattern.sub(' ',text)

In [None]:
# Clean every text with Text_clean and trim leading/trailing whitespace.
x_data=df['text'].apply(lambda text:Text_clean(text).strip())

In [None]:
# Collapse every run of whitespace into a single space, then preview the first ten texts.
x_data=[" ".join(data.split()) for data in x_data]
x_data[:10]

In [None]:
#using spacy we do lemmatization and singularization
nlp=spacy.load('en_core_web_sm')
# nlp.pipe streams the texts through the pipeline in batches, which is faster than
# calling nlp() once per text and yields the Doc objects in input order.
train_data=[" ".join(str(token.lemma_) for token in doc) for doc in nlp.pipe(x_data)]

In [None]:
# Drop the '-PRON-' placeholder (spaCy v2's lemma for pronouns) and the English
# stop words in a single pass. A set gives constant-time membership tests instead
# of scanning the stop-word list once per token.
ignored_words=set(stopwords)|{'-PRON-'}
train_data=[' '.join(word for word in data.split() if word not in ignored_words) for data in train_data]
y_data=df['publishing_date'].values

In [None]:
# Preview the first two whitespace-normalized texts together with their targets.
x_data[:2],y_data[:2]

In [None]:
# Preview the first two lemmatized, stop-word-filtered texts together with their targets.
train_data[:2],y_data[:2]

In [None]:
#Dataset prep for linear reg and Roberta

class DATASET:
  """Holds the preprocessed texts and targets and builds model-specific inputs.

  Calling the instance returns either TF-IDF features split into train and
  test parts ('LR') or fixed-length token-id arrays for the whole dataset
  ('roberta').
  """

  def __init__(self,train_data,y_data):
    """Store the texts (`train_data`) and their targets (`y_data`)."""
    self.train_data=train_data
    self.y_data=y_data

  def __call__(self,pad_sequences,train_test_split,model_name,roberta_tokenizer,maxlen=200):
    """Build the inputs for the requested model.

    Args:
      pad_sequences: Keras-style padding function accepting the keyword
        arguments `maxlen`, `padding`, `truncating` and `value`.
      train_test_split: Splitting function, called as
        train_test_split(x, y, test_size=0.3). Only used for 'LR'.
      model_name: Either 'LR' or 'roberta'.
      roberta_tokenizer: Tokenizer exposing `encode` and `pad_token_id`.
        Only used for 'roberta'.
      maxlen: Length every token sequence is truncated/padded to.

    Returns:
      For 'LR': (train_features, test_features, y_train, y_test).
      For 'roberta': (token_ids, y_data) covering the whole dataset.

    Raises:
      ValueError: if `model_name` is neither 'LR' nor 'roberta'.
    """
    if model_name=='LR':
      x_train,x_test,y_train,y_test=train_test_split(self.train_data,self.y_data,test_size=0.3)
      tfidf=TfidfVectorizer(analyzer='word', ngram_range=(1,3))
      # Fit on the training split only, so the vocabulary and IDF weights do
      # not leak information from the held-out texts.
      train_table_data=tfidf.fit_transform(x_train)
      test_table_data=tfidf.transform(x_test)
      return train_table_data,test_table_data,y_train,y_test

    if model_name=='roberta':
      # Truncate during tokenization so the leading <s> and trailing </s>
      # special tokens are both kept for long texts.
      sequences=[
          roberta_tokenizer.encode(text,add_special_tokens=True,truncation=True,max_length=maxlen)
          for text in self.train_data
      ]
      # Pad (and, defensively, truncate) on the right: RoBERTa's
      # sequence-classification head pools position 0, which must hold <s>
      # rather than a padding token.
      roberta_data=pad_sequences(sequences,maxlen=maxlen,padding='post',truncating='post',value=roberta_tokenizer.pad_token_id)
      return roberta_data,self.y_data

    # Previously an unknown name fell through and returned None, which only
    # surfaced later as an unpacking error at the call site.
    raise ValueError(f"unknown model_name {model_name!r}; expected 'LR' or 'roberta'")

In [None]:
# Load the pretrained tokenizer of the 'roberta-base' checkpoint.
roberta_tokenizer=AutoTokenizer.from_pretrained('roberta-base')

In [None]:
# Build both input variants from the same preprocessed texts: TF-IDF train/test
# splits for linear regression and padded token ids for RoBERTa.
data=DATASET(train_data,y_data)
x_train_data,x_test_data,y_train_data,y_test_data=data(pad_sequences,train_test_split,'LR',roberta_tokenizer)
x_roberta_data,y_roberta_data=data(pad_sequences,train_test_split,'roberta',roberta_tokenizer)

In [None]:
# Report the shapes of the linear-regression training split and of the full RoBERTa dataset.
print(f'for linear reg training sample set: {x_train_data.shape,y_train_data.shape} and for roberta whole dataset : {x_roberta_data.shape,y_roberta_data.shape}')

#Linear Regression model

In [None]:
class Linear_Model(tf.keras.Model):
  """Thin wrapper that fits scikit-learn's LinearRegression on stored training data."""

  def __init__(self,x_train_data,y_train_data):
    """Store the training features/targets and create the regressor.

    Args:
      x_train_data: Training feature matrix passed to LinearRegression.fit.
      y_train_data: Training targets passed to LinearRegression.fit.
    """
    # A tf.keras.Model subclass has to initialise its base class before any
    # attribute is assigned; skipping it leaves the Keras object half-built.
    super(Linear_Model,self).__init__()
    self.x_train_data=x_train_data
    self.y_train_data=y_train_data
    self.lreg=LinearRegression()

  def linear_regression_result(self,x_test_data):
    """Fit the regressor on the stored training data and predict `x_test_data`.

    The model is re-fitted on every call.

    Returns:
      The predictions returned by LinearRegression.predict.
    """
    self.lreg.fit(self.x_train_data,self.y_train_data)     #train

    #predict
    return self.lreg.predict(x_test_data)

In [None]:
# Fit the linear regression on the TF-IDF training split and predict the held-out split.
linear_model=Linear_Model(x_train_data,y_train_data)
lr_y_pred=linear_model.linear_regression_result(x_test_data)

In [None]:
#performance metric evaluation on linear regression
from sklearn.metrics import mean_squared_error
# Take the square root explicitly: the `squared=False` shortcut is deprecated and
# removed in newer scikit-learn releases. Arguments follow the documented
# (y_true, y_pred) order.
print(f'RMSE Score {np.sqrt(mean_squared_error(y_test_data,lr_y_pred))}')

#RoBERTa Base

In [None]:
class Custom_roberta(tf.keras.Model):
  """Keras model wrapping the pretrained 'roberta-base' sequence classifier with a single label."""

  def __init__(self):
    super().__init__()
    # One output label; hidden states and attention maps are not returned.
    self.roberta_model = TFAutoModelForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=1,
        output_attentions=False,
        output_hidden_states=False,
    )

  def call(self,input_ids):
    """Run the wrapped model on `input_ids` and return its output unchanged."""
    return self.roberta_model(input_ids)

In [None]:
def loss_func(y_true,y_pred):  #root mean squared error (RMSE)
  """Return the root mean squared error between `y_true` and `y_pred`.

  Args:
    y_true: Ground-truth values.
    y_pred: Predicted values; their dtype determines the dtype of the result.

  Returns:
    A scalar tensor holding sqrt(mean((y_pred - y_true)^2)).
  """
  y_pred=tf.convert_to_tensor(y_pred)
  # TensorFlow does not promote dtypes on subtraction, so targets stored with a
  # different dtype than the predictions (e.g. integer years) must be cast first.
  y_true=tf.cast(y_true,y_pred.dtype)
  return tf.sqrt(tf.reduce_mean(tf.square(y_pred-y_true)))

In [None]:
#with fit function use simple scheduler
#constant lr for first 10 epochs and then lr is decreased exponentially 

def schedule(epochs,lr):
    """Learning-rate schedule: constant for the first 10 epochs, then exponential decay.

    Args:
        epochs: Zero-based index of the current epoch.
        lr: Current learning rate.

    Returns:
        `lr` unchanged while `epochs` < 10, otherwise `lr * exp(-0.01)` as a
        plain Python float.
    """
    if epochs<10:
        return lr
    # Return a Python float rather than a tf.Tensor: the Keras
    # LearningRateScheduler contract is float in, float out.
    return float(lr*np.exp(-0.01))

In [None]:
#train_size: number of samples in an 80% share of the RoBERTa data
# NOTE(review): train_size is not used by any later visible cell; the folds below are produced by KFold.

train_size=int(0.8*(len(x_roberta_data)))

In [None]:
# Shapes of the padded token-id array and of the target array.
x_roberta_data.shape,y_roberta_data.shape

In [None]:
#k folds
def train_in_folds(x_roberta_data,y_roberta_data,folds):
    """Fine-tune and validate a fresh Custom_roberta model on every K-fold split.

    For each fold the model is trained on the training indices and validated
    on the remaining ones, with early stopping on the validation RMSE.

    Args:
        x_roberta_data: Array of padded token ids, indexable by an index array.
        y_roberta_data: Array of targets aligned with `x_roberta_data`.
        folds: Number of K-fold splits.

    Returns:
        list of float: the best validation RMSE of each fold, in fold order.
    """
    # initiate the kfold class from model_selection module
    kf = KFold(n_splits=folds,shuffle=True)
    best_scores=[]

    for fold,(train_index,test_index) in enumerate(kf.split(x_roberta_data)):
        print(f'for fold : {fold+1}\n')

        # Drop the Keras state left over from the previous fold so models do
        # not pile up in memory across folds.
        tf.keras.backend.clear_session()

        x_train=tf.convert_to_tensor(x_roberta_data[train_index])
        y_train=tf.convert_to_tensor(y_roberta_data[train_index])
        x_test=tf.convert_to_tensor(x_roberta_data[test_index])
        y_test=tf.convert_to_tensor(y_roberta_data[test_index])

        model=Custom_roberta()
        # `learning_rate` is the supported keyword; the `lr` alias is deprecated.
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
        model.compile(optimizer=optimizer,loss='mse',metrics=[RootMeanSquaredError()])
        callback_1=tf.keras.callbacks.LearningRateScheduler(schedule)
        callback_2=tf.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error',mode='min',restore_best_weights=True,patience=2)

        r=model.fit(x_train,y_train,batch_size=10,validation_data=(x_test,y_test),epochs=6,callbacks=[callback_1,callback_2],verbose=1)
        best_rmse=float(np.min(r.history['val_root_mean_squared_error']))
        print(f"best rmse : {best_rmse:.4f}")
        print('\n')
        best_scores.append(best_rmse)

    return best_scores

In [None]:
# Run the cross-validated fine-tuning on the full RoBERTa dataset with 5 folds.
train_in_folds(x_roberta_data,y_roberta_data,folds=5)

#We can clearly see that the transformer model outperforms the simple linear model in terms of the RMSE metric.

#I cannot say for sure, since this is my first RoBERTa model.

In [None]:

#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Default colors, Google Fonts family names and font sizes for the styled headings.
c1,c2,f1,f2,fs1,fs2=\
'#eb3434','#eb3446','Akronim','Smokum',30,15
def dhtml(string,fontcolor=c1,font=f1,fontsize=fs1):
    """Display `string` as an <h1> heading styled with a Google Fonts 3d-float effect.

    Args:
        string: Text (or HTML fragment) placed inside the heading; it is
            inserted as-is, without HTML escaping.
        fontcolor: CSS color of the heading.
        font: Google Fonts family name, used both in the import URL and in
            the font-family style.
        fontsize: Font size in pixels.
    """
    # Build the markup with f-strings: the previous "%s" % string formatting
    # raised or mis-formatted when `string` was a tuple.
    markup=(
        "<style>\n"
        f"    @import 'https://fonts.googleapis.com/css?family={font}&effect=3d-float';</style>\n"
        f"    <h1 class='font-effect-3d-float' style='font-family:{font}; color:{fontcolor}; font-size:{fontsize}px;'>{string}</h1>"
    )
    display(HTML(markup))


dhtml('Thank you Shubham Kumar @eiann1509 for the script' )