In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 20px; background: #2B3A67;"><i><b style="color:orange;">Of The Injustice of Counterfeiting Books by Immanuel Kant</b></i></h1></center>

Immanuel Kant (1724 - 1804)

Translated by John Richardson ( - 19th Cent.)

"This essay of Kant’s on copyright argues that the unlicensed copying of books cannot possibly be permissible, due to the fact that it assumes a consent on the part of the author which it is logically impossible for the author to give. The argument is dependent upon an assumption that the writings be commodified, for the reason why the author is unable to possibly give consent to multiple publishers is due to the author’s will – to communicate with the public – necessitating the profitability of the publisher, for, it is assumed, there is no way to communicate with the public at large without a great expense which can only be borne by a publishing firm. This is, of course, no longer a necessary assumption." (Summary by D.E. Wittkower)

https://librivox.org/of-the-injustice-of-counterfeiting-books-by-immanuel-kant/

![](https://i.ytimg.com/vi/qzo9FZwIq1M/maxresdefault.jpg)ciouliralisy.gq

In [None]:
import gensim

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
# Load the Kant works corpus and preview its first six rows.
corpus_csv = "/kaggle/input/immanuel-kant-bibliography/Kant_works_corpus.csv"
df = pd.read_csv(corpus_csv)
df.head(6)

##Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

In [None]:
# Re-read the corpus CSV into the frame that is written out at the end of the
# notebook, and take the publishing dates from `df` as the regression target.
data_dir = '../input/immanuel-kant-bibliography/'
sample_submission = pd.read_csv(f"{data_dir}Kant_works_corpus.csv")
target = df.loc[:, 'publishing_date'].to_numpy()

# This notebook trains and runs inference with a LightGBM model on Word2vec features. The model is light enough to run on a CPU.

Word2vec represents each word as a 300-dimensional vector. Each text is then represented by the average of the vectors of its words, which is itself a 300-dimensional vector.

# Embedding by Word2vec

In [None]:
#Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

# Load the pretrained GoogleNews vectors from their binary word2vec file and
# report the shape of the embedding matrix.
googlenews_path = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    googlenews_path,
    binary=True,
)
print(word2vec_model.vectors.shape)

In [None]:
#Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

def avg_feature_vector(sentence, model, num_features):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Newlines, commas and periods are replaced by spaces and the text is split
    on whitespace. Words for which ``model[word]`` raises ``KeyError`` are
    skipped and do not count towards the average.

    Parameters
    ----------
    sentence : str
        Text to embed. A non-string value (e.g. ``None`` or a NaN from a
        missing cell) is treated like an empty sentence.
    model : mapping
        Object supporting ``model[word]`` that returns a vector of length
        ``num_features`` and raises ``KeyError`` for unknown words.
    num_features : int
        Length of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_features``. It is all zeros when no word of the
        sentence is found in ``model`` (previously this case divided by zero
        and returned NaNs).
    """
    feature_vec = np.zeros((num_features,), dtype="float32")  # initialise the feature vector
    if not isinstance(sentence, str):
        return feature_vec

    words = sentence.replace('\n', " ").replace(',', ' ').replace('.', " ").split()
    n_found = 0  # number of words that actually contributed a vector
    for word in words:
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary word: leave it out of both the sum and the count.
            continue
        feature_vec = np.add(feature_vec, word_vec)
        n_found += 1

    # Divide only when something was summed; otherwise keep the zero vector
    # instead of producing 0/0 = NaN.
    if n_found > 0:
        feature_vec = np.divide(feature_vec, n_found)
    return feature_vec

In [None]:
#Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

# Feature matrix: one 300-dimensional averaged Word2vec vector per row of df.
word2vec_df = np.zeros((len(df.index), 300), dtype="float32")  # initialise the feature matrix
#word2vec_test = np.zeros((len(test.index),300),dtype="float32")

# Iterate by position. The label lookup df["text_clean"][i] only works while the
# index is exactly 0..n-1, whereas enumerate() does not depend on the index labels.
for i, text in enumerate(df["text_clean"]):
    word2vec_df[i] = avg_feature_vector(text, word2vec_model, 300)

#for i in range(len(test.index)):
 #   word2vec_test[i] = avg_feature_vector(test["text_clean"][i],word2vec_model, 300)

In [None]:
# Sanity check: show the shape of the feature matrix, then of the target vector.
for arr in (word2vec_df, target):
    print(arr.shape)
#print(word2vec_test.shape)

# Training & Inference

LightGBM with 5-fold cross-validation (KFold, n_splits=5)

In [None]:
#Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

#parameter settings
# LightGBM settings: gradient-boosted trees fitted as a regression and scored by RMSE.
params = dict(
    boosting_type='gbdt',
    metric='rmse',
    objective='regression',
    seed=42,
    learning_rate=0.01,
    n_jobs=-1,
    verbose=-1,
)

# Zero-initialised prediction buffer with one entry per row of df.
pred = np.zeros(len(df))

In [None]:
#KFold 　n_splits=5
from sklearn.model_selection import KFold
# Shuffled 5-fold splitter with a fixed seed. The (train, validation) index pairs
# are materialised into a list so the training loop can iterate over them.
fold = KFold(shuffle=True, random_state=42, n_splits=5)
cv = [split for split in fold.split(word2vec_df, target)]

In [None]:
#Code by Syurenuko https://www.kaggle.com/syurenuko/clrp-word2vec-lightgbm-baseline/data

rmses = []  # validation RMSE of each fold
for tr_idx, val_idx in cv:
    # Slice the feature matrix and the target into this fold's train/validation parts.
    x_tr, x_va = word2vec_df[tr_idx], word2vec_df[val_idx]
    y_tr, y_va = target[tr_idx], target[val_idx]

    df_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=df_set)

    # Training
    # Early stopping and log silencing are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0 and raise TypeError there.
    # lgb.train is the module-level train function; its second argument is the
    # lgb.Dataset, not the DataFrame.
    model = lgb.train(
        params,
        df_set,
        num_boost_round=10000,
        valid_sets=[df_set, val_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=-1),  # non-positive period: no per-iteration log lines
        ],
    )

    y_pred = model.predict(x_va)
    rmse = np.sqrt(mean_squared_error(y_va, y_pred))
    rmses.append(rmse)

    #Inference
    # test_pred = model.predict(word2vec_test)
    #pred += test_pred / 5

print("\n", "Mean Fold RMSE:", np.mean(rmses))

In [None]:
#Saving for the next competition

# Bracket assignment is required here: `sample_submission.target = pred` only sets
# a Python attribute when no "target" column exists yet, so nothing would reach the CSV.
# NOTE(review): `pred` is still all zeros at this point because the inference lines
# in the training loop are commented out.
sample_submission["target"] = pred
sample_submission.to_csv('Kant_works_corpus.csv',index=False)

In [None]:
#sample_submission

# Of the Injustice of Counterfeiting Books

In [None]:
# Positional lookup with zero-based indices: row position 5 (the sixth row) and
# column position 4 (the fifth column, text_clean according to the original note).

df.iloc[5,4]

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Default colours, Google Fonts family names and font sizes (px) for dhtml.
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15


def dhtml(string, fontcolor=c1, font=f1, fontsize=fs1):
    """Display ``string`` as a styled <h1> heading in the notebook output.

    The emitted HTML imports ``font`` from Google Fonts with the ``3d-float``
    effect and renders the heading in ``fontcolor`` at ``fontsize`` pixels.
    ``string`` is substituted into the heading with ``%`` formatting.
    """
    font_import = (
        "<style>\n"
        "    @import 'https://fonts.googleapis.com/css?family="
        + font
        + "&effect=3d-float';</style>\n"
    )
    heading_open = (
        "    <h1 class='font-effect-3d-float' style='font-family:"
        + font
        + "; color:" + fontcolor
        + "; font-size:" + str(fontsize)
    )
    heading_tail = "px;'>%s</h1>" % string
    display(HTML(font_import + heading_open + heading_tail))


dhtml('Thank you Syurenuko @syurenuko for the script' )