In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

In [None]:
# Read the corpus CSV into a DataFrame and preview its first rows.
CORPUS_CSV = "../input/empiricisms-thinkers/Empiricism_works_corpus.csv"
dataset = pd.read_csv(CORPUS_CSV)
dataset.head()

In [None]:
# Work on an independent copy holding only the three columns used below,
# so later edits to `df` do not touch `dataset`.
KEPT_COLUMNS = ['authors', 'book_title', 'text_clean']
df = dataset.loc[:, KEPT_COLUMNS].copy()
df.head()

In [None]:
# Distinct author names, displayed as a single-column array.
unique_authors = dataset['authors'].unique()
pd.DataFrame(unique_authors).values

# Word clouds

In [None]:
from wordcloud import WordCloud

# Draw one word cloud per author, side by side, from each author's `text_clean`.
# A single loop replaces three copy-pasted stanzas so the WordCloud settings
# and panel styling cannot drift apart between authors.
cloud_authors = ['John Locke', 'David Hume', 'George Berkeley']

# squeeze=False always returns a 2-D axes array, so indexing row 0 is safe
# whatever the number of authors.
fig, axes = plt.subplots(1, len(cloud_authors), figsize=(40, 25), squeeze=False)

clouds = []
for ax, author in zip(axes[0], cloud_authors):
    subset = df[df['authors'] == author]
    text = subset.text_clean.values
    cloud = WordCloud(background_color='pink', colormap="Dark2", collocations=False,
                      width=2500, height=1800).generate(" ".join(text))
    clouds.append(cloud)

    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(author, fontsize=40)

# Keep the original per-author variable names bound for any cell that reads them.
cloud1, cloud2, cloud3 = clouds

plt.show()

In [None]:
from collections import Counter
# Tally every whitespace-separated token across all rows of `text_clean`.
cnt = Counter()
for text in df['text_clean']:
    cnt.update(text.split())

# Show the ten most frequent tokens with their counts.
cnt.most_common(10)

In [None]:
# The ten most common tokens in the corpus, as counted in `cnt`.
FREQWORDS = {token for token, _count in cnt.most_common(10)}


def remove_freqwords(text):
    """Return `text` without any token that appears in FREQWORDS.

    The input is coerced with ``str()``, split on whitespace, filtered, and
    re-joined with single spaces.
    """
    kept = (token for token in str(text).split() if token not in FREQWORDS)
    return " ".join(kept)


# Overwrites `text_clean` in place; later cells read the filtered text from this column.
df["text_clean"] = df["text_clean"].apply(remove_freqwords)

# Word clouds without frequent words

In [None]:
from wordcloud import WordCloud

# Redraw the per-author word clouds from the current contents of `text_clean`
# (the section heading indicates the frequent words have been removed by now).
# A single loop replaces three copy-pasted stanzas so the WordCloud settings
# and panel styling cannot drift apart between authors.
cloud_authors = ['John Locke', 'David Hume', 'George Berkeley']

# squeeze=False always returns a 2-D axes array, so indexing row 0 is safe
# whatever the number of authors.
fig, axes = plt.subplots(1, len(cloud_authors), figsize=(40, 25), squeeze=False)

clouds = []
for ax, author in zip(axes[0], cloud_authors):
    subset = df[df['authors'] == author]
    text = subset.text_clean.values
    cloud = WordCloud(background_color='pink', colormap="Dark2", collocations=False,
                      width=2500, height=1800).generate(" ".join(text))
    clouds.append(cloud)

    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(author, fontsize=40)

# Keep the original per-author variable names bound for any cell that reads them.
cloud1, cloud2, cloud3 = clouds

plt.show()

In [None]:
# Encode each distinct author as an integer id.
author_codes, _author_index = df['authors'].factorize()
df['authors_id'] = author_codes

# One row per (author, id) pair, used to build lookups in both directions.
authors_id_df = df.loc[:, ['authors', 'authors_id']].drop_duplicates()

authors_to_id = dict(authors_id_df.values)
id_to_authors = dict(authors_id_df.loc[:, ['authors_id', 'authors']].values)

df.head()

# Term frequency

In [None]:
# TF-IDF over unigrams and bigrams, with English stop words removed,
# sublinear term-frequency scaling, and a minimum document frequency of 5.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit on the cleaned text and densify the resulting matrix.
tfidf_matrix = tfidf.fit_transform(df['text_clean'])
features = tfidf_matrix.toarray()

# Integer author ids serve as the class labels.
labels = df['authors_id']

print("Each of the %d text is represented by %d features (TF-IDF score of unigrams and bigrams)" % features.shape)

In [None]:
# For every author, print the N unigrams and N bigrams whose TF-IDF features
# have the highest chi-squared statistic against "text is by this author".
N = 3

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back for old versions.
# The vocabulary does not depend on the author, so build it once outside the loop.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

for authors, authors_id in sorted(authors_to_id.items()):
    # One-vs-rest target: True for rows written by the current author.
    features_chi2 = chi2(features, labels == authors_id)
    # features_chi2[0] holds the chi2 statistics; argsort is ascending, so the
    # strongest terms end up last and are picked with the [-N:] slices below.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n==> %s:" %(authors))
    print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
    print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))