In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import re

from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, names
from nltk.stem import WordNetLemmatizer

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from functools import partial

In [None]:
# Both roots hang off the filesystem root, built portably instead of hard-coding "/".
_fs_root = os.path.abspath(os.path.sep)
dataroot = os.path.join(_fs_root, "kaggle", "input", "librispeechtext", "data")
outroot = os.path.join(_fs_root, "kaggle", "working", "librispeechtext", "data")

In [None]:
# Make sure the output directory exists before anything is written into it.
os.makedirs(outroot, exist_ok=True)

# Input CSVs, one per split, read from the data root.
train_clean_100_path, dev_clean_path, test_clean_path = [
    os.path.join(dataroot, csv_name)
    for csv_name in ("train-clean-100.csv", "dev-clean.csv", "test-clean.csv")
]

# Matching output CSVs for the preprocessed text, written under the output root.
train_clean_100_outpath, dev_clean_outpath, test_clean_outpath = [
    os.path.join(outroot, csv_name)
    for csv_name in (
        "preprocessed-train-clean-100.csv",
        "preprocessed-dev-clean.csv",
        "preprocessed-test-clean.csv",
    )
]

In [None]:
# Load the train / dev / test CSVs; the first column of each file is used as the index.
train_df, dev_df, test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_clean_100_path, dev_clean_path, test_clean_path)
]

# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The patterns are lower-case only, so callers are expected to lower-case
    the text first. Every suffix pattern is anchored with a trailing word
    boundary so that only a real contraction *ending* is rewritten; without
    it, words that merely start with an apostrophe were corrupted
    (``'tis`` -> `` notis``, ``'twas`` -> `` notwas``, ``'death`` -> `` wouldeath``).

    Known limitation (unchanged): ``'s`` is always expanded to `` is``, so
    possessives such as ``john's`` also become ``john is``.

    Args:
        phrase: The text to expand.

    Returns:
        A new string with the contractions expanded.
    """
    # specific: irregular forms that the general "n't" rule would get wrong
    phrase = re.sub(r"\bwon't\b", "will not", phrase)
    phrase = re.sub(r"\bcan't\b", "can not", phrase)

    # general: each suffix must end a word (trailing \b)
    phrase = re.sub(r"n't\b", " not", phrase)
    phrase = re.sub(r"'re\b", " are", phrase)
    phrase = re.sub(r"'s\b", " is", phrase)
    phrase = re.sub(r"'d\b", " would", phrase)
    phrase = re.sub(r"'ll\b", " will", phrase)
    phrase = re.sub(r"'t\b", " not", phrase)
    phrase = re.sub(r"'ve\b", " have", phrase)
    phrase = re.sub(r"'m\b", " am", phrase)
    return phrase

def _clean_text(column):
    """Lower-case every entry of ``column`` and expand its contractions."""
    return column.apply(lambda raw: decontracted(str.lower(raw)))


# Keep only the cleaned utterance text and the book title of each split.
# The train split stores its text under "REAL TEXT"; dev and test use "TEXT".
train_df = pd.DataFrame({
    "TEXT": _clean_text(train_df["REAL TEXT"]),
    "BOOK": train_df["BOOK TITLE"],
})
dev_df = pd.DataFrame({
    "TEXT": _clean_text(dev_df["TEXT"]),
    "BOOK": dev_df["BOOK TITLE"],
})
test_df = pd.DataFrame({
    "TEXT": _clean_text(test_df["TEXT"]),
    "BOOK": test_df["BOOK TITLE"],
})

train_df

In [None]:
# Tokenize the books into lists of words
def _tokenized(frame):
    """Return a deep copy of ``frame`` whose TEXT column holds word tokens.

    Two columns are added: N_WORDS (number of tokens in the row) and
    U_WORDS (number of distinct tokens in the row).
    """
    out = pd.DataFrame(frame, copy=True)
    out["TEXT"] = out["TEXT"].apply(word_tokenize)
    out["N_WORDS"] = out["TEXT"].apply(len)
    out["U_WORDS"] = out["TEXT"].apply(lambda tokens: len(set(tokens)))
    return out


tok_train_df = _tokenized(train_df)
tok_dev_df = _tokenized(dev_df)
tok_test_df = _tokenized(test_df)

tok_train_df

In [None]:
# Remove stop words from books
_more_stopwords = {
    # interjections
    "oh", "ah",
    # bare affirmations / negations
    "yes", "no",
    # archaic pronouns and verb forms
    "thy", "thou", "thrin", "didst", "thee",
}.union(
    # first names from the NLTK names corpus, lower-cased to match the text
    # NOTE(review): this may also drop ordinary words that double as first
    # names — confirm that is intended.
    map(str.lower, names.words())
)
_stopwords = _more_stopwords.union(stopwords.words('english'))


def _is_kept(word):
    """True when ``word`` is not in the combined stop-word set."""
    return word not in _stopwords


# Lazily yields the tokens of an iterable that are not stop words.
stopwords_filter = partial(filter, _is_kept)


def _without_stopwords(frame):
    """Return a deep copy of ``frame`` with stop words removed from TEXT.

    N_WORDS and U_WORDS are recomputed from the filtered token lists.
    """
    out = pd.DataFrame(frame, copy=True)
    out["TEXT"] = out["TEXT"].apply(lambda tokens: list(stopwords_filter(tokens)))
    out["N_WORDS"] = out["TEXT"].apply(len)
    out["U_WORDS"] = out["TEXT"].apply(lambda tokens: len(set(tokens)))
    return out


stop_train_df = _without_stopwords(tok_train_df)
stop_dev_df = _without_stopwords(tok_dev_df)
stop_test_df = _without_stopwords(tok_test_df)

stop_train_df

In [None]:
# Stem the words in each book to a simpler form using a lemmatizer
_lemmatizer = WordNetLemmatizer()

# pos_tag emits Penn Treebank tags, whose adjective tags start with "J"
# (JJ, JJR, JJS), while WordNet's adjective code is "a". Comparing the tag's
# first letter directly against "a" therefore never matched, and adjectives
# were silently left un-lemmatized. Map the first letter explicitly instead.
_PENN_TO_WORDNET_POS = {"J": "a", "N": "n", "V": "v"}


def lemmatize(tokens):
    """Lemmatize ``tokens`` using their part-of-speech tags.

    Args:
        tokens: A list of word tokens (one utterance).

    Returns:
        A list of the same length. Tokens tagged as adjectives, nouns or
        verbs are replaced by their WordNet lemma; all other tokens are
        returned unchanged.
    """
    lemmas = []
    for token, tag in pos_tag(tokens):
        wordnet_pos = _PENN_TO_WORDNET_POS.get(tag[:1])
        if wordnet_pos is None:
            lemmas.append(token)
        else:
            lemmas.append(_lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas


def _lemmatized(frame):
    """Return a deep copy of ``frame`` with TEXT lemmatized and counts refreshed."""
    out = pd.DataFrame(frame, copy=True)
    out["TEXT"] = out["TEXT"].apply(lemmatize)
    out["N_WORDS"] = out["TEXT"].apply(len)
    out["U_WORDS"] = out["TEXT"].apply(lambda toks: len(set(toks)))
    return out


lemm_train_df = _lemmatized(stop_train_df)
lemm_dev_df = _lemmatized(stop_dev_df)
lemm_test_df = _lemmatized(stop_test_df)

lemm_train_df

In [None]:
# Snapshot the lemmatized frames so the exported data is decoupled from them.
out_train_df, out_dev_df, out_test_df = [
    pd.DataFrame(frame, copy=True)
    for frame in (lemm_train_df, lemm_dev_df, lemm_test_df)
]

# Write only the text and book columns (plus the index) of each split.
for out_frame, out_path in (
    (out_train_df, train_clean_100_outpath),
    (out_dev_df, dev_clean_outpath),
    (out_test_df, test_clean_outpath),
):
    out_frame.to_csv(out_path, columns=["TEXT", "BOOK"])

In [None]:
# Unique-word counts of the train split after each preprocessing stage:
# Y = tokenization, Z = stop-word removal, W = lemmatization; X = book title.
df = pd.DataFrame({
    'X': tok_train_df["BOOK"],
    'Y': tok_train_df["U_WORDS"],
    'Z': stop_train_df["U_WORDS"],
    'W': lemm_train_df["U_WORDS"]
})

# plt.subplots returns a (figure, axes) pair; unpack it and draw on the axes
# explicitly instead of relying on pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(15, 10))

# Overlay the three stages on the same axes; later bars are drawn on top.
sns.barplot(x=df["X"], y=df["Y"], color='r', label="tok", ax=ax)
sns.barplot(x=df["X"], y=df["Z"], color='g', label="stop", ax=ax)
sns.barplot(x=df["X"], y=df["W"], color='b', label="lemm", ax=ax)

# Label the axes; the per-book tick labels are hidden.
ax.set(xlabel="book", ylabel="unique words")
ax.set(xticklabels=[])
ax.set_title("Change in the number of unique words")
ax.legend()

# Render the figure
plt.show()

df

In [None]:
# Summary statistics of the per-row unique-word counts after each step:
# Y = tokenization, Z = stop-word removal, W = lemmatization
df.describe()

In [None]:
# Vocabulary size of each split after tokenization.
# explode() flattens the per-row token lists in linear time; summing the
# column instead concatenates the lists pairwise (quadratic) and fails on an
# empty split. Rows with an empty token list become NaN, which nunique() ignores.
x = tok_train_df["TEXT"].explode().nunique()
y = tok_dev_df["TEXT"].explode().nunique()
z = tok_test_df["TEXT"].explode().nunique()

print(f"Number of unique words in the train set {x}")
print(f"Number of unique words in the dev set {y}")
print(f"Number of unique words in the test set {z}")

In [None]:
# Vocabulary size of each split after the full preprocessing pipeline.
# explode() flattens the per-row token lists in linear time; summing the
# column instead concatenates the lists pairwise (quadratic) and fails on an
# empty split. Rows with an empty token list become NaN, which nunique() ignores.
x = lemm_train_df["TEXT"].explode().nunique()
y = lemm_dev_df["TEXT"].explode().nunique()
z = lemm_test_df["TEXT"].explode().nunique()

print(f"Number of unique words in the train set after preprocessing {x}")
print(f"Number of unique words in the dev set after preprocessing {y}")
print(f"Number of unique words in the test set after preprocessing {z}")