In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot look: plain white background with the larger "talk" font scaling.
sns.set_theme(style="white", context="talk")

In [None]:
# Folder holding the competition CSV files (trailing slash is required because
# file names are appended with plain string concatenation).
BASE_DIR = "../input/feedback-prize-effectiveness/"

# When True, each figure is also written to a PNG in the working directory.
SAVE_FIGS = True

In [None]:
# Read the three CSV tables that live under BASE_DIR into DataFrames.
df_train, df_test, df_sub = (
    pd.read_csv(BASE_DIR + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
def read_text(filepath, encoding="utf-8"):
    """Return the entire contents of a text file as one string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
# Compare the number of rows with the number of distinct essay ids.
unique_essay_count = len(pd.unique(df_train["essay_id"]))

print("Length of the input dataset:", len(df_train))
print("Number of unique essays:", unique_essay_count)

# Label Counts

In [None]:
# Bar chart of how many rows carry each effectiveness label.
label_counts = df_train["discourse_effectiveness"].value_counts()
label_counts.plot.bar(figsize=(12, 6))
if SAVE_FIGS:
    plt.savefig("label_counts.png")
plt.show()

# Label Counts per Discourse Type

In [None]:
# Count rows for every (discourse type, effectiveness label) pair, then pivot
# the labels into columns so each discourse type occupies a single row.
groupby_class_counts = (
    df_train
    .groupby(["discourse_type", "discourse_effectiveness"])
    .size()
    .unstack(fill_value=0)
    .reset_index(level=0)
)

# Back to long form (one row per type/label pair) so seaborn can group by hue.
class_counts_long = groupby_class_counts.melt(
    id_vars="discourse_type",
    value_vars=["Adequate", "Effective", "Ineffective"],
    var_name="target",
)

plt.figure(figsize=(12, 9))
sns.barplot(data=class_counts_long, x="discourse_type", y="value", hue="target")
plt.xticks(rotation=45)
if SAVE_FIGS:
    plt.savefig("label_counts_per_discourse.png")
plt.show()

# Character Counts

In [None]:
# Length of each discourse text in characters.
df_train["character_count"] = df_train["discourse_text"].map(len)

# Bin count from the ceil(1 + log2(N)) rule (Sturges' formula); reused by the
# histogram cells further down.
n_rows = len(df_train["character_count"])
num_bins = int(np.ceil(1 + np.log2(n_rows)))

fig = plt.figure(figsize=(12, 10))
sns.histplot(
    data=df_train["character_count"],
    bins=num_bins,
    stat="density",
    kde=True,
    log_scale=True,
)
if SAVE_FIGS:
    plt.savefig("character_count_raw_text.png")
plt.show()

In [None]:
# 2D histogram: discourse type on x, character count on a log-scaled y axis,
# with a colour bar showing how many rows fall into each cell.
heatmap_options = dict(
    bins=num_bins,
    log_scale=(False, True),
    cbar=True,
    cbar_kws={"shrink": .75},
)

fig = plt.figure(figsize=(12, 10))
sns.histplot(data=df_train, x="discourse_type", y="character_count", **heatmap_options)
plt.xticks(rotation=45)
if SAVE_FIGS:
    plt.savefig("character_count_per_discourse_raw_text.png")
plt.show()

# Word Counts

In [None]:
# Poor man's tokenizer: split on whitespace and count the resulting pieces.
tokens_per_row = df_train["discourse_text"].str.split()
df_train["word_count"] = tokens_per_row.map(len)

# Twice as many bins as the character-count histogram.
fig = plt.figure(figsize=(12, 10))
sns.histplot(
    data=df_train["word_count"],
    bins=num_bins * 2,
    kde=True,
    stat="density",
    log_scale=True,
)
if SAVE_FIGS:
    plt.savefig("word_counts_raw_text.png")
plt.show()

In [None]:
# Median and mean of both length measures for every discourse type.
df_train.groupby("discourse_type")[["character_count", "word_count"]].agg(["median", "mean"])

In [None]:
# Side-by-side histograms (characters left, words right), coloured by discourse type.
x_cols = ["character_count", "word_count"]
fig, axes = plt.subplots(ncols=len(x_cols), figsize=(24, 10))

for panel_idx, x_col in enumerate(x_cols):
    sns.histplot(
        data=df_train,
        x=x_col,
        hue="discourse_type",
        bins=num_bins,
        log_scale=True,
        ax=axes[panel_idx],
    )

fig.suptitle("Character-Word histogram per discourse type")
if SAVE_FIGS:
    fig.savefig("character_word_counts_per_discourse.png")
plt.show()

In [None]:
# One strip plot per length measure: every row is drawn as a point, grouped
# along the x axis by its discourse type.
y_cols = ["character_count", "word_count"]
fig, axes = plt.subplots(ncols=len(y_cols), figsize=(20, 8))

for panel_idx, y_col in enumerate(y_cols):
    panel = axes[panel_idx]
    sns.stripplot(data=df_train, x="discourse_type", y=y_col, ax=panel)
    panel.tick_params(axis="x", labelrotation=45)

if SAVE_FIGS:
    fig.savefig("character_count_raw_text_scatter.png")

plt.show()

# The most common n-grams

In [None]:
from collections import Counter
from itertools import chain

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Combined English stopword list: NLTK's words plus scikit-learn's built-in set.
en_stopwords = set(stopwords.words("english")) | set(ENGLISH_STOP_WORDS)

In [None]:
# Show the size of the combined stopword set, then the set itself on the next line.
print(len(en_stopwords), en_stopwords, sep="\n")

In [None]:
def remove_stopwords(words, stopwords_set=en_stopwords):
    """Return the items of ``words`` that are not in ``stopwords_set``.

    Matching is exact: no lowercasing or punctuation stripping is done here,
    so callers normalise the tokens first if they need to. Order and
    duplicates of the surviving items are preserved.

    Args:
        words: Iterable of tokens.
        stopwords_set: Container of tokens to drop. Defaults to the
            module-level ``en_stopwords`` as it was when this function was
            defined.

    Returns:
        A new list with the remaining tokens.
    """
    kept = []
    for token in words:
        if token in stopwords_set:
            continue
        kept.append(token)
    return kept


def remove_puncts():
    """Placeholder for punctuation removal; not implemented yet.

    Takes no arguments, does nothing, and returns None.
    """

In [None]:
def get_ngrams(text, n):
    """Build the consecutive n-grams of ``text`` after stopword removal.

    The text is split on whitespace, filtered through ``remove_stopwords``
    (with its default stopword set), and every window of ``n`` adjacent
    remaining tokens becomes one tuple. No lowercasing happens here.

    Args:
        text: Input string.
        n: Window size, i.e. number of tokens per n-gram.

    Returns:
        List of tuples in order of appearance; empty when fewer than ``n``
        tokens remain.
    """
    tokens = remove_stopwords(text.split())
    last_start = len(tokens) - n
    return [tuple(tokens[start:start + n]) for start in range(last_start + 1)]

In [None]:
# Three panels: the top_k most frequent 1-, 2- and 3-grams of the lowercased texts.
top_k = 20
fig, axes = plt.subplots(ncols=3, figsize=(18, 8))
# Wide horizontal gap so the n-gram tick labels of one panel do not run into the next.
fig.subplots_adjust(wspace=1.75)

for ax, n in zip(axes, (1, 2, 3)):
    # Tally the n-grams of every discourse text.
    counter = Counter()
    for text in df_train["discourse_text"]:
        counter.update(get_ngrams(text.lower(), n=n))

    ngram_col = f"ngram_{n}"
    df_ngrams = pd.DataFrame(counter.most_common(top_k), columns=[ngram_col, "count"])
    # The y values are the n-gram tuples themselves (not joined into strings).
    sns.barplot(x=df_ngrams["count"], y=df_ngrams[ngram_col], ax=ax)

fig.suptitle("The most common n-grams(1, 2, 3)")
if SAVE_FIGS:
    fig.savefig("most_common_ngrams.png")
plt.show()

# Word Cloud

- https://github.com/amueller/word_cloud/blob/master/examples/a_new_hope.py

In [None]:
import random
from wordcloud import WordCloud

def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random light-grey colour as an HSL string.

    The signature follows the colour callback of the wordcloud library (see
    the example linked in the markdown cell above this one). Only
    ``random_state`` influences the result; the other arguments are accepted
    and ignored.

    Args:
        word: Unused.
        font_size: Unused.
        position: Unused.
        orientation: Unused.
        random_state: Optional generator exposing ``randint(a, b)``, such as a
            ``random.Random`` instance. When given, the lightness is drawn
            from it so a seeded generator yields reproducible colours. When
            None, the global ``random`` module is used.
        **kwargs: Any further keyword arguments; ignored.

    Returns:
        A string of the form ``"hsl(0, 0%, L%)"`` with L between 60 and 100
        inclusive.
    """
    # Previously the supplied generator was ignored, so seeded runs still
    # produced different greys each time.
    rng = random if random_state is None else random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)

In [None]:
# Fixed seed so word placement and colours are the same on every run; the
# cloud was previously built without a seed and changed between runs.
WORD_CLOUD_SEED = 42

# Lowercase each text, drop stopwords, and glue everything into one long string.
cleaned_texts = df_train["discourse_text"].apply(
    lambda text: " ".join(remove_stopwords(text.lower().split()))
)
full_text = " ".join(cleaned_texts)

wc = WordCloud(
    max_words=500,
    width=800,
    height=600,
    margin=10,
    min_font_size=6,
    random_state=WORD_CLOUD_SEED,
).generate(full_text)

plt.figure(figsize=(12, 8))
plt.imshow(wc.to_array(), interpolation="bilinear")
plt.axis("off")
if SAVE_FIGS:
    plt.savefig("word_cloud.png")
plt.show()

### WIP...

- Keyword extraction
- Topic extraction