# Trying a statistical approach to philosophy :-)

## Table of Contents
* [Categorical Features](#1)
* [Numerical Features](#2)
* [Evaluate by School and Author](#3)
* [Wordclouds by School](#4)
* [Word2Vec - Word Embeddings](#5)
* [Visualize Word Embeddings using UMAP](#6)
* [GBM model based on word embeddings](#7)

In [None]:
# PACKAGES

# standard
import numpy as np
import pandas as pd
import time
import random

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# NLP
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# H2O
import h2o
from h2o.estimators import H2OWord2vecEstimator
from h2o.estimators import H2OGradientBoostingEstimator

# UMAP
import umap

In [None]:
# load the philosophy corpus from CSV and report how long the read took
t1 = time.time()
df = pd.read_csv('../input/history-of-philosophy/phil_nlp.csv')
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time: ', elapsed)

In [None]:
# structure of data: print column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# preview: the notebook renders the data frame because it is the cell's last expression
df

In [None]:
# add derived features
# n_tokens = number of entries in each row's token list.
# NOTE(review): tokenized_txt presumably holds the string repr of a Python list
# of tokens - confirm against the CSV. It is parsed with ast.literal_eval, which
# only accepts literals, instead of eval, which would execute arbitrary code
# embedded in the data file.
import ast

df['n_tokens'] = [len(ast.literal_eval(tokens)) for tokens in df.tokenized_txt]

<a id='1'></a>
# Categorical Features

In [None]:
# categorical features
features_cat = ['title', 'author', 'school']

# one bar chart per categorical column: number of rows for each distinct value
for f in features_cat:
    counts = df[f].value_counts()
    fig, ax = plt.subplots(figsize=(14, 5))
    counts.plot.bar(ax=ax)
    ax.set_title(f)
    ax.grid()
    plt.show()

<a id='2'></a>
# Numerical Features

In [None]:
# sentence length: summary statistics plus histograms on linear and log10 scale
sentence_length = df['sentence_length']
print(sentence_length.describe())

# raw distribution
fig, ax = plt.subplots(figsize=(12, 5))
sentence_length.plot.hist(bins=200, ax=ax)
ax.set_title('Sentence Length')
ax.grid()
plt.show()

# log10-transformed distribution
fig, ax = plt.subplots(figsize=(12, 5))
np.log10(sentence_length).plot.hist(bins=50, ax=ax)
ax.set_title('log10(Sentence Length)')
ax.grid()
plt.show()

In [None]:
# number of tokens: summary statistics and histogram
token_counts = df['n_tokens']
print(token_counts.describe())

fig, ax = plt.subplots(figsize=(12, 5))
token_counts.plot.hist(bins=200, ax=ax)
ax.set_title('Number of Tokens')
ax.grid()
plt.show()

### By School:

In [None]:
# distinct schools, in order of first appearance in the data
schools = df['school'].drop_duplicates().tolist()
print(schools)

In [None]:
# distribution of sentence length for each school, drawn as violins
fig, ax = plt.subplots(figsize=(16, 5))
sns.violinplot(data=df, x='school', y='sentence_length', ax=ax)
ax.set_title('Sentence Length - By School')
ax.grid()

In [None]:
# distribution of the token count for each school, drawn as violins
fig, ax = plt.subplots(figsize=(16, 5))
sns.violinplot(data=df, x='school', y='n_tokens', ax=ax)
ax.set_title('Number of Tokens - By School')
ax.grid()

<a id='3'></a>
# Evaluate by School and Author

In [None]:
# mean of numerical features per (school, author)
# numeric_only=True restricts the aggregation to numeric columns; without it
# pandas >= 2.0 raises a TypeError when it tries to average the text columns.
df.groupby(by=['school','author']).mean(numeric_only=True).round(2)

In [None]:
# number of rows per (school, author); to_frame() turns the Series into a
# DataFrame so the notebook renders it as a table
df.groupby(['school', 'author'])['title'].count().to_frame()

In [None]:
# same count one level deeper: rows per (school, author, title), as a table
df.groupby(['school', 'author', 'title'])['title'].count().to_frame()

<a id='4'></a>
# Wordclouds by School

In [None]:
# stop words shipped with the wordcloud package, copied into a set;
# used for the word clouds and as the default stop word list of tokenize()
stopwords = set(STOPWORDS)

In [None]:
# render one word cloud per school from all of its lowercased sentences
t1 = time.time()
for sc in schools:
    print('School = ', sc.upper(), ':')

    # rows belonging to the current school, joined into one long string
    df_temp = df.loc[df.school == sc]
    text = " ".join(df_temp.sentence_lowered)

    # render wordcloud
    wordcloud = WordCloud(
        stopwords=stopwords,
        max_font_size=50,
        max_words=500,
        width=600,
        height=400,
        background_color="white",
    )
    wordcloud.generate(text)

    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time: ', elapsed)

<a id='5'></a>
# Word2Vec - Word Embeddings
#### Using code from: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/word2vec.html

In [None]:
# start H2O (must run before any frame is uploaded or model is trained below)
h2o.init()

In [None]:
# utility function for tokenization
def tokenize(sentences, stop_word = stopwords): # use stop words from wordcloud package
    """Split a text column into a cleaned, single-column frame of word tokens.

    Adapted from the H2O Word2Vec documentation example.

    Args:
        sentences: H2O string column (one sentence per row) to tokenize.
        stop_word: Iterable of words to drop. Defaults to the module-level
            ``stopwords`` collection.

    Returns:
        H2O frame of lowercased tokens from which tokens shorter than two
        characters, tokens containing a digit and stop words have been removed.
        NA rows are kept by the length and stop word filters; per the H2O
        Word2Vec documentation they act as sentence delimiters.
    """
    # The H2O reference example passes the stop words as a list; normalise any
    # iterable (the default is a set) to a sorted list so the value handed to
    # H2OFrame.isin has that documented form and a stable order.
    stop_word_list = sorted(stop_word)

    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # keep tokens with at least two characters, and keep NA rows
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    # drop every token that contains a digit
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    # drop stop words, again keeping NA rows
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(stop_word_list)),:]
    return tokenized_words

In [None]:
# upload data to H2O environment
# only the label ('school') and the lowercased sentence text are used downstream
text_h2o = h2o.H2OFrame(df[['school','sentence_lowered']])

In [None]:
# run the tokenizer over the sentence column and time it
t1 = time.time()
sentence_column = text_h2o['sentence_lowered']
words = tokenize(sentence_column)
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# train Word2Vec model on the token frame
# NOTE(review): this seeds Python's `random` module only; presumably it has no
# effect on training inside the H2O backend - confirm.
random.seed(1234)

# hyper-parameters of the embedding model
w2v_params = {
    'vec_size': 50,
    'window_size': 5,
    'sent_sample_rate': 0.001,
    'init_learning_rate': 0.025,
    'epochs': 10,
}

t1 = time.time()
w2v_model = H2OWord2vecEstimator(**w2v_params)
w2v_model.train(training_frame=words)
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# check model: sanity-check the embedding by listing 5 synonyms of 'knowledge'
w2v_model.find_synonyms('knowledge', count = 5)

In [None]:
# sentence embedding = average of the word vectors of the sentence
sentence_vectors = w2v_model.transform(words, aggregate_method='AVERAGE')
# append the target column 'school' next to the vector columns
school_column = text_h2o['school']
text_vec = sentence_vectors.cbind(school_column)
text_vec.head()

In [None]:
# model inputs: every column of text_vec except the label
label_column = 'school'
features = text_vec.columns
features.remove(label_column)

<a id='6'></a>
# Visualize Word Embeddings using UMAP

In [None]:
# pull the H2O frame into pandas and discard every row with a missing value
df_text_vec = text_vec.as_data_frame().dropna(axis=0)

In [None]:
# quick look at the embedding: one boxplot per vector component
fig, ax = plt.subplots(figsize=(18, 6))
df_text_vec[features].boxplot(ax=ax)
plt.show()

In [None]:
# use subset only (for performance and clarity of plot)
# - cap the sample size at the number of available rows, because sample()
#   raises when asked for more rows than exist (sampling without replacement)
# - fix random_state so the subset, and therefore the UMAP plot, is reproducible
n_sample = min(25000, len(df_text_vec))
df_text_vec = df_text_vec.sample(n=n_sample, random_state=111)

In [None]:
# project the sentence vectors to 2D with UMAP (seeded via random_state)
dim_reducer = umap.UMAP(random_state=111)

t1 = time.time()
text_vec_umap = dim_reducer.fit_transform(df_text_vec[features])
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

# assemble the two coordinates and the school label into one data frame
df_text_vec_umap = pd.DataFrame({
    'x': text_vec_umap[:, 0],
    'y': text_vec_umap[:, 1],
    'school': df_text_vec.school.tolist(),
})

In [None]:
# show result: first rows of the 2D coordinates together with the school label
df_text_vec_umap.head()

In [None]:
# scatter plot of the 2D embedding, coloured by school
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    data=df_text_vec_umap,
    x='x',
    y='y',
    hue='school',
    alpha=0.5,
    s=10,  # small markers => avoid overplotting
    ax=ax,
)
ax.legend(loc='upper right')
ax.grid()
plt.show()

<a id='7'></a>
# GBM model based on word embeddings

In [None]:
# train/test split
# random.seed() only seeds Python's own generator; the split itself is drawn by
# H2O, so the seed has to be passed to split_frame to make it reproducible.
split_seed = 1234
random.seed(split_seed)
perc_train = 0.7
# => data_split[0]:training, data_split[1]:validation
data_split = text_vec.split_frame(ratios=[perc_train], seed=split_seed)

In [None]:
# write both parts of the split to CSV - for potential external processing
export_targets = ('df_train.csv', 'df_test.csv')
for split_part, target_path in zip(data_split, export_targets):
    h2o.export_file(split_part, target_path)

In [None]:
# define gradient boosting model
n_CV = 5 # number of cross validations

# hyper-parameters, collected in one place
gbm_params = {
    'ntrees': 200,
    'max_depth': 4,
    'col_sample_rate': 0.5,
    'min_rows': 10,
    'nfolds': n_CV,
    'seed': 999,
}
fit_1 = H2OGradientBoostingEstimator(**gbm_params)

In [None]:
# fit the GBM: vector columns as inputs, 'school' as target, second split part as validation frame
train_frame, valid_frame = data_split[0], data_split[1]

t1 = time.time()
fit_1.train(x=features, y='school',
            training_frame=train_frame,
            validation_frame=valid_frame)
t2 = time.time()
elapsed = np.round(t2 - t1, 2)
print('Elapsed time:', elapsed, 'secs')

In [None]:
# show cross validation metrics (summary over the n_CV folds of fit_1)
fit_1.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validation
# Fetch the fold models once instead of once per loop iteration, and use
# scoring_history() in place of its deprecated alias score_history().
cv_models = fit_1.cross_validation_models()
for i, cv_model_temp in enumerate(cv_models):
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(1+i) + ' - Scoring History'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_classification_error,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_classification_error,
                c='darkorange', label='validation')
    plt.ylim(0,1)  # classification error is a fraction, so fix the axis to [0, 1]
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.ylabel('Classification Error')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# variable importance (argument 20 = number of features to plot)
fit_1.varimp_plot(20)

### Evaluate on validation set

In [None]:
# score the validation part of the split and bring the result back to pandas
valid_frame = data_split[1]
pred_valid = fit_1.predict(valid_frame).as_data_frame()

# show preview
pred_valid.head()

In [None]:
# true 'school' labels of the validation part, as a pandas Series
actuals = data_split[1]['school'].as_data_frame()['school']

In [None]:
# confusion matrix: predicted school in the rows, actual school in the columns
conf_valid = pd.crosstab(index=pred_valid['predict'], columns=actuals)

In [None]:
# heatmap of the validation confusion matrix, annotated with the counts
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_valid, annot=True, fmt='g', cmap='Blues', ax=ax)
plt.show()