In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import re
import joblib

from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, names
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pandas as pd

In [None]:
# Both roots live under "<filesystem root>/kaggle": the dataset is read from
# "input", results are written below "working".
_kaggle_root = os.path.join(os.path.abspath(os.path.sep), "kaggle")
dataroot = os.path.join(_kaggle_root, "input", "librispeechtext")
outroot = os.path.join(_kaggle_root, "working", "librispeechtext")

In [None]:
# Input CSV files, all located in the "data" folder of the dataset
_csv_dir = os.path.join(dataroot, "data")
train_clean_100_path = os.path.join(_csv_dir, "train-clean-100.csv")
dev_clean_path = os.path.join(_csv_dir, "dev-clean.csv")
test_clean_path = os.path.join(_csv_dir, "test-clean.csv")

# Output locations: the serialized model and the pyLDAvis HTML page
_models_dir = os.path.join(outroot, "models")
_view_dir = os.path.join(outroot, "view")
model_outpath = os.path.join(_models_dir, "model.jl")
pyLDAvis_outpath = os.path.join(_view_dir, "data.html")

# Create the output folders up front; already existing folders are fine.
for _out_dir in (_models_dir, _view_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Read the three CSV splits (first column is the index) and reduce each one to
# two columns: "TEXT" (the transcript) and "BOOK" (the book title).
def _text_and_book(frame, text_column):
    """Return a two-column frame built from `text_column` and "BOOK TITLE"."""
    return pd.DataFrame({"TEXT": frame[text_column], "BOOK": frame["BOOK TITLE"]})

# NOTE(review): the training split reads its text from "REAL TEXT" while dev
# and test read "TEXT" - confirm this matches the CSV schemas.
train_df = _text_and_book(pd.read_csv(train_clean_100_path, index_col=0), "REAL TEXT")
dev_df = _text_and_book(pd.read_csv(dev_clean_path, index_col=0), "TEXT")
test_df = _text_and_book(pd.read_csv(test_clean_path, index_col=0), "TEXT")

In [None]:
# Settings shared by the text-normalisation helpers.

# Length threshold for "short" words: is_not_short keeps only words whose
# length is strictly greater than this value.
_short = 2

# Extra stop words added on top of NLTK's English stop-word list.
_more_stopwords = {
    # interjections
    "oh", "ah",
    # answers that carry no topical information
    "yes", "no",
    # archaic forms
    "thy", "thou", "thrin", "didst", "thee",
    # every entry of the NLTK names corpus, lower-cased
    *(name.lower() for name in names.words()),
}
_stopwords = _more_stopwords.union(stopwords.words('english'))

# Lemmatizer and the WordNet part-of-speech codes (adjective, noun, verb)
# that lemmatize() may act on; words with any other code are left unchanged.
_lemmatizer = WordNetLemmatizer()
_pos_tags = ["a", "n", "v"]

# Document-frequency bounds handed to CountVectorizer: words occurring in
# more than a _max_df share of the documents, or in fewer than _min_df
# documents, are dropped from the vocabulary.
_max_df = 0.4
_min_df = 3

# Adapted from
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# (pattern, replacement) pairs; they are applied strictly in this order, the
# two irregular forms first so the generic "n't" rule cannot mangle them.
_CONTRACTION_RULES = (
    # specific
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    # general
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontract(phrase: str):
    """Expand English contractions (anything containing "'") in `phrase`.

    Every rule is applied to the whole string with `re.sub`, one after the
    other, and the fully rewritten string is returned.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def is_not_short(word):
    """Return True when `word` has more than `_short` characters."""
    return _short < len(word)

def not_in_stopwords(word):
    """Return True when `word` is absent from the `_stopwords` set."""
    return not (word in _stopwords)

# First letter of a POS tag -> WordNet part-of-speech code.
# Penn Treebank tags (the default output of nltk.pos_tag) mark adjectives as
# JJ/JJR/JJS, so "j" has to be translated to WordNet's "a"; using the raw first
# letter meant adjectives were never lemmatized. "a" is kept so tags that
# already start with that letter behave as before.
_TAG_INITIAL_TO_WORDNET = {"j": "a", "a": "a", "n": "n", "v": "v", "r": "r"}


def lemmatize(pair):
    """Lemmatize one `(word, pos_tag)` pair.

    Args:
        pair: A `(word, tag)` tuple such as the items returned by `pos_tag`.

    Returns:
        The lemma of `word` when its tag maps to a WordNet part of speech
        listed in `_pos_tags`; otherwise `word` unchanged. An empty or
        unknown tag also returns `word` unchanged.
    """
    word, pos = pair
    # pos[:1] (instead of pos[0]) keeps an empty tag from raising IndexError.
    wordnet_pos = _TAG_INITIAL_TO_WORDNET.get(pos[:1].lower())
    if wordnet_pos is None or wordnet_pos not in _pos_tags:
        return word
    return _lemmatizer.lemmatize(word, pos=wordnet_pos)

def document_analyzer(book: str):
    """Turn one raw text into the list of normalized tokens used as features.

    Steps: lower-case, expand contractions, tokenize, drop short words and
    stop words, POS-tag what is left and lemmatize each tagged word.
    """
    normalized = decontract(str.lower(book))
    kept = [
        token
        for token in word_tokenize(normalized)
        if is_not_short(token) and not_in_stopwords(token)
    ]
    # NOTE(review): tagging happens after filtering, so the tagger only sees
    # the surviving words rather than the full sentence - confirm intended.
    return [lemmatize(tagged) for tagged in pos_tag(kept)]

In [None]:
# The vocabulary and the training matrix are built from train + dev text.
# NOTE(review): because dev text is part of X_train, anything later measured
# on X_dev is not a held-out figure - confirm this is intended.
_fit_texts = train_df["TEXT"].to_list() + dev_df["TEXT"].to_list()

vectorizer = CountVectorizer(
    analyzer=document_analyzer,
    min_df=_min_df,
    max_df=_max_df,
)
X_train = vectorizer.fit_transform(_fit_texts)

# Project dev and test text onto the fitted vocabulary.
X_dev, X_test = (
    vectorizer.transform(frame["TEXT"].to_list()) for frame in (dev_df, test_df)
)

In [None]:
# Online LDA with 20 topics. random_state pins the otherwise random
# initialisation so that a re-run of the notebook reproduces the same topics.
lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=50,
    evaluate_every=1,
    learning_method='online',
    verbose=1,
    n_jobs=-1,
    random_state=0,
)
# fit() instead of fit_transform(): the transformed training matrix was never
# used, so computing it was a wasted pass over the whole training set.
lda.fit(X_train)
y_hat = lda.transform(X_test)

# Approximate log-likelihood bound and perplexity on the dev matrix.
# NOTE(review): X_train is built from train + dev text, so these are not
# held-out scores - confirm this is intended.
print(lda.score(X_dev))
print(lda.perplexity(X_dev))

In [None]:
# Setup for GloVe vectors
#!wget https://nlp.stanford.edu/data/glove.42B.300d.zip
#!unzip glove.42B.300d.zip
#with open("./glove.42B.300d.txt", 'r') as glove:
#    lines = [next(glove) for x in xrange(100)]
#[(l.split(" ")[0], ", ".join(l.split(" ")[1:20]) + "...") for l in lines[30:33]]
#!head -n 50000 glove.42B.300d.txt > top_50000.txt
import numpy as np
import pandas as pd
from scipy.spatial import distance
# word -> float32 vector, read from the truncated GloVe text file.
embeddings = {}
# Lookup counter updated by distance1().
nothingz = 0
# Explicit encoding: GloVe text files are UTF-8, and relying on the platform
# default can fail or silently mis-decode non-ASCII tokens.
with open("../input/top50000/top_50000.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of raising
            # IndexError on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
# Placeholder used for words that have no entry in `embeddings`: the same
# all-ones, 300-dimensional vector this lookup has always fallen back to.
_OOV_VECTOR = np.ones(300, dtype=float)


def distance1(word, reference):
    """Return the cosine distance between the vectors of two words.

    A word missing from `embeddings` is represented by an all-ones vector.
    The placeholder is only used for this computation and is NOT stored in
    `embeddings`: storing it made every unknown query word part of the
    vocabulary that `closest_words` and `candidates` rank afterwards.

    Side effect: the module-level counter `nothingz` is incremented once for
    each of the two words that is found in `embeddings`.
    """
    global nothingz
    vectors = []
    for token in (word, reference):
        vec = embeddings.get(token)
        if vec is None:
            vec = _OOV_VECTOR
        else:
            nothingz = nothingz + 1
        vectors.append(vec)
    return distance.cosine(vectors[0], vectors[1])

def closest_words(reference):
    """Return every word in `embeddings`, nearest to `reference` first.

    Words are ordered by `distance1(word, reference)` in ascending order.
    """
    def distance_to_reference(candidate):
        return distance1(candidate, reference)

    return sorted(embeddings.keys(), key=distance_to_reference)

# Demo: the nine entries after the first in closest_words() for a few probe
# words. More statements follow in this cell, so this list is not the cell's
# last expression and is not displayed; each closest_words() call still sorts
# every key of `embeddings`.
[(w, ", ".join(closest_words(w)[1:10]) + "...") for w in ["magic", "sport", "scuba", "sock"]]
def goodness(word, answers, bad):
    """Score `word`: far from the `bad` words, close to the `answers`.

    Returns -999 when `word` is itself one of `answers` or `bad`; otherwise
    the summed distance to `bad` minus four times the summed distance to
    `answers`.
    """
    excluded = answers + bad
    if word in excluded:
        return -999
    distance_to_bad = sum(distance1(word, b) for b in bad)
    distance_to_answers = sum(distance1(word, a) for a in answers)
    return distance_to_bad - 4.0 * distance_to_answers

def minimax(word, answers, bad):
    """Worst-case score of `word` against `answers` and `bad`.

    Returns -999 when `word` is itself one of `answers` or `bad`; otherwise
    the smallest distance to any `bad` word minus the largest distance to
    any of the `answers`.
    """
    excluded = answers + bad
    if word in excluded:
        return -999
    nearest_bad = min(distance1(word, b) for b in bad)
    farthest_answer = max(distance1(word, a) for a in answers)
    return nearest_bad - farthest_answer

def candidates(answers, bad, size=100):
    """Rank vocabulary words as labels for `answers` while avoiding `bad`.

    All words in `embeddings` are ordered by descending `goodness`; the best
    250 are re-ordered by descending `minimax` and cut to `size` entries.

    Returns:
        A list of strings of the form "<rank>. <word> (<minimax score>)",
        with the score formatted to two decimals.
    """
    by_goodness = sorted(embeddings.keys(), key=lambda w: -1 * goodness(w, answers, bad))
    shortlist = sorted(by_goodness[:250], key=lambda w: -1 * minimax(w, answers, bad))[:size]

    labelled = []
    for rank, entry in enumerate(shortlist, start=1):
        score = "{0:.2f}".format(minimax(entry, answers, bad))
        labelled.append(str(rank) + ". " + entry + " (" + score + ")")
    return labelled
import pandas as pd
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    """Yield consecutive tuples of `n` items taken from `iterable`.

    The last tuple is padded with `fillvalue` when the items run out.
    """
    # The same iterator is handed to zip_longest n times, so each output
    # tuple consumes n consecutive items.
    shared = iter(iterable)
    return zip_longest(*(shared for _ in range(n)), fillvalue=fillvalue)

from IPython.display import HTML

def tabulate(data):
    """Render `data` as an HTML table with ten items per row.

    The items are grouped into rows of ten (the last row padded with None),
    and the table is emitted without index or header.
    """
    rows = list(grouper(10, data))
    table = pd.DataFrame(rows)
    return HTML(table.to_html(index=False, header=False))


In [None]:
# Functions for printing keywords for each topic
def print_topics(model, vectorizer, top_n=10, bad_words=None):
    """Print the top words of every topic, then candidate labels per topic.

    Args:
        model: Fitted topic model exposing `components_` (one row of word
            weights per topic).
        vectorizer: Fitted vectorizer whose feature names index the columns
            of `model.components_`.
        top_n: Number of highest-weighted words to keep for each topic.
        bad_words: Words handed to `candidates` as the ones to stay away
            from. Defaults to `["accenture"]`, the value this function has
            always used.

    For each topic the function prints "Topic <i>:" followed by the list of
    its top words. Afterwards, for each topic, it prints "Topic <i>: " and
    the result of `candidates(top_words, bad_words)`.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    if bad_words is None:
        bad_words = ["accenture"]

    # One list per topic, sized by the actual data. The earlier fixed 20 x 15
    # buffer raised IndexError for more topics or a larger top_n, and padded
    # shorter rows with None values that were then passed on to candidates().
    topic_words = []
    for idx, topic in enumerate(model.components_):
        top_words = [str(words[i]) for i in topic.argsort()[:-top_n - 1:-1]]
        print("Topic %d:" % (idx), end='')
        print(top_words)
        topic_words.append(top_words)

    for idx, good in enumerate(topic_words):
        print("Topic "+str(idx)+": ")
        # A fresh list per call so candidates() can never alter the default.
        print(candidates(good, list(bad_words)))

# Show the 15 highest-weighted words of every LDA topic, followed by the
# candidate labels print_topics derives for each topic.
print("Topics found via LDA:")
print_topics(lda, vectorizer, top_n=15)

In [None]:
# Most probable topic (column with the highest value in y_hat) for each row
# of the test matrix, paired with the book title of that row.
_predicted_topic = y_hat.argmax(axis=1)
topic_df = pd.DataFrame(
    {"BOOK": test_df["BOOK"], "TOPIC": _predicted_topic},
    copy=True,
)

topic_df

In [None]:
# Persist the fitted vectorizer and LDA model together in a single file.
# NOTE(review): the vectorizer was created with analyzer=document_analyzer;
# presumably that function must be defined again wherever this file is
# loaded - verify before relying on the dump outside this notebook.
model = dict(vectorizer=vectorizer, lda=lda)
joblib.dump(model, model_outpath)

In [None]:
import pyLDAvis
import pyLDAvis.sklearn

# Switch pyLDAvis to notebook rendering, build the visualisation data for the
# fitted model on the training matrix (t-SNE layout) and save it as HTML.
# NOTE(review): newer pyLDAvis releases reportedly expose this module under a
# different name (pyLDAvis.lda_model) - confirm against the installed version.
pyLDAvis.enable_notebook()
data = pyLDAvis.sklearn.prepare(
    lda,
    X_train,
    vectorizer,
    mds='tsne',
)
pyLDAvis.save_html(data, pyLDAvis_outpath)

In [None]:
# Last expression of the cell: displays the object returned by
# pyLDAvis.sklearn.prepare as the cell output.
data