In [2]:
from collections import Counter

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec

from nltk import word_tokenize
from tqdm.auto import tqdm

In [3]:
import nltk

# Fetch the Punkt tokenizer data that word_tokenize needs.
# 'punkt' is the resource this notebook originally downloaded; 'punkt_tab' is
# the name newer NLTK releases look up instead, so both are requested to keep
# the notebook runnable after a fresh install regardless of NLTK version.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

In [13]:
# Load the raw Q&A export into a DataFrame.
df = pd.read_csv('../data/qa_finance.csv')

# Keep only rows that have both a question and an answer, and drop exact
# question/answer duplicates; the index is rebuilt so it is contiguous again.
text_columns = ['Question Text', 'Answer Text']
df = df.dropna(subset=text_columns)
df = df.drop_duplicates(subset=text_columns)
df = df.reset_index(drop=True)

# Collapse every kind of line break into a single space.
for column in ['Question Title', 'Question Text', 'Answer Text']:
    df[column] = df[column].str.replace(r'\r\n|\n|\r', ' ', regex=True)

# Trim surrounding whitespace (the title column is intentionally left as-is,
# matching the original cell).
for column in text_columns:
    df[column] = df[column].str.strip()

# Dates whose year is "-0001" cannot be parsed with the format below.
# The previous code called str.replace without regex=True (so on pandas >= 2.0
# the pattern was treated as a literal and never matched) and substituted the
# *string* 'NaN', which ffill() does not treat as missing. Mask the offending
# rows with a real missing value so the forward fill actually takes effect.
invalid_date = df['Question Date'].str.contains(
    r'\d+\.\d+\.-0001 \d+:\d+', regex=True, na=False
)
df['Question Date'] = df['Question Date'].mask(invalid_date).ffill()
df['Question Date'] = pd.to_datetime(df['Question Date'], format='%d.%m.%Y %H:%M')
df['Year'] = df['Question Date'].dt.year

# Persist the cleaned data.
df.to_csv('../data/qa_finance_cleaned.csv', index=False)

In [4]:
# Peek at the question text stored under index label 0.
df.loc[0, 'Question Text']

In [5]:
# Peek at the answer text stored under index label 0.
df.loc[0, 'Answer Text']

In [24]:
# Show the last five rows of the frame.
df.tail(5)

In [5]:
# Two stacked bar charts: all sections on top, the 20 most frequent themes below.
# Each bar is annotated with its share of the total number of rows that have a
# value in the respective column.
section_counts = df['Section'].value_counts()
theme_counts = df['Theme'].value_counts()
top_themes = theme_counts.head(20)

fig, (g0, g1) = plt.subplots(nrows=2, ncols=1, figsize=(16, 12))
fig.suptitle('Counts of Sections and Themes', size=22)

# (axes, counts to draw, denominator for the percentages, title, x-axis label)
panels = [
    (g0, section_counts, section_counts.sum(), "Counts of Sections", "Section Name"),
    (g1, top_themes, theme_counts.sum(), "Top 20 Themes with More Counts", "Theme Name"),
]
for axis, counts, total, title, x_label in panels:
    sns.barplot(x=counts.index, y=counts.values, color='blue', ax=axis)
    axis.set_title(title, fontsize=22)
    axis.set_xlabel(x_label, fontsize=19)
    axis.set_ylabel("Total Count", fontsize=19)
    # Pin the tick positions first, then label them rotated by 75 degrees.
    axis.set_xticks(range(len(counts)))
    axis.set_xticklabels(counts.index, rotation=75)
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.1f}%'.format(bar_height / total * 100),
                  ha="center", fontsize=11)

# Leave room for the rotated tick labels of the upper chart and for the suptitle.
fig.subplots_adjust(hspace=0.75, top=0.90)

plt.show()

In [14]:
# Count how often each theme occurs among the rows of the 'Daně' section.
df.loc[df['Section'] == 'Daně', 'Theme'].value_counts()

In [12]:
# Display every row whose theme is 'Daň z příjmu FO'.
df.loc[df['Theme'].eq('Daň z příjmu FO')]

In [6]:
# Bar chart of the number of questions per year, in chronological order,
# with the absolute count written above each bar.
question_count_by_year = df['Year'].value_counts().sort_index()

fig, g0 = plt.subplots(figsize=(12, 4))
fig.suptitle('Counts of Questions by Year', size=22)
sns.barplot(x=question_count_by_year.index, y=question_count_by_year.values,
            color='blue', ax=g0)
g0.set_xlabel("Year", fontsize=19)
g0.set_ylabel("Total Count", fontsize=19)
for bar in g0.patches:
    bar_height = bar.get_height()
    g0.text(bar.get_x() + bar.get_width() / 2.,
            bar_height + 3,
            '{:1.0f}'.format(bar_height),
            ha="center", fontsize=11)

plt.show()

In [7]:
# Bar chart of how many answers each responder wrote, most active first
# (value_counts sorts descending), annotated with the absolute counts.
responder_counts = df["Responder Name"].value_counts()

fig, g = plt.subplots(figsize=(10, 8))
g.set_title('Count of Responders', size=22)
sns.barplot(x=responder_counts.index, y=responder_counts.values, color='blue', ax=g)
g.set_xlabel("Responder", fontsize=16)
g.set_ylabel("Total Count", fontsize=16)

for bar in g.patches:
    bar_height = bar.get_height()
    g.text(bar.get_x() + bar.get_width() / 2.,
           bar_height + 3,
           '{:1.0f}'.format(bar_height),
           ha="center", fontsize=10)

plt.show()


In [8]:
# Tokenize every question and store its token count in 'Question Length'.
# NOTE(review): word_tokenize is called with its default language model while
# the section/theme names in this notebook look Czech - confirm this is intended.
question_tokens = []
for question in tqdm(df['Question Text']):
    question_tokens.append(word_tokenize(question))

len_question_tokens = list(map(len, question_tokens))
df['Question Length'] = len_question_tokens

In [9]:
# One histogram of question lengths (in tokens) per section, most frequent
# section first. Dashed red line = mean, dashed blue lines = 5th / 95th percentile.
top_cats = df['Section'].value_counts().index

# Size the grid from the number of sections instead of assuming at most 15.
# At least the original 5 rows are kept so the existing layout is unchanged;
# extra rows (and proportional figure height) are added only when needed.
n_cols = 3
n_rows = max(5, -(-len(top_cats) // n_cols))  # ceiling division
grid = gridspec.GridSpec(n_rows, n_cols)
plt.figure(figsize=(16, 6 * 4 * n_rows / 5))

plt.suptitle('Question Lengths by Different Categories', size=20)
for n, col in enumerate(top_cats):
    # Filter once per section rather than once per plotted statistic.
    section_lengths = df.loc[df['Section'] == col, 'Question Length']

    plt.subplot(grid[n])
    sns.histplot(section_lengths, bins=50, color='green', kde=True)
    plt.axvline(section_lengths.mean(), color='red', linestyle='--')
    plt.axvline(section_lengths.quantile(0.05), color='blue', linestyle='--')
    plt.axvline(section_lengths.quantile(0.95), color='blue', linestyle='--')
    plt.title(f'Question Lengths for {col}', size=12)
    plt.xlabel('Length', size=10)
    plt.ylabel('Count', size=10)

plt.subplots_adjust(top=0.95, hspace=.4, wspace=.15)
plt.show()

In [10]:
# Tokenize every answer and store its token count in 'Answer Length'.
# NOTE(review): word_tokenize is called with its default language model while
# the section/theme names in this notebook look Czech - confirm this is intended.
answer_tokens = []
for answer in tqdm(df['Answer Text']):
    answer_tokens.append(word_tokenize(answer))

len_answer_tokens = list(map(len, answer_tokens))
df['Answer Length'] = len_answer_tokens

In [11]:
# One histogram of answer lengths (in tokens) per section, most frequent
# section first. Dashed red line = mean, dashed blue lines = 5th / 95th percentile.
top_cats = df['Section'].value_counts().index

# Size the grid from the number of sections instead of assuming at most 15.
# At least the original 5 rows are kept so the existing layout is unchanged;
# extra rows (and proportional figure height) are added only when needed.
n_cols = 3
n_rows = max(5, -(-len(top_cats) // n_cols))  # ceiling division
grid = gridspec.GridSpec(n_rows, n_cols)
plt.figure(figsize=(16, 6 * 4 * n_rows / 5))

plt.suptitle('Answer Lengths by Different Categories', size=20)
for n, col in enumerate(top_cats):
    # Filter once per section rather than once per plotted statistic.
    section_lengths = df.loc[df['Section'] == col, 'Answer Length']

    plt.subplot(grid[n])
    sns.histplot(section_lengths, bins=50, color='green', kde=True)
    plt.axvline(section_lengths.mean(), color='red', linestyle='--')
    plt.axvline(section_lengths.quantile(0.05), color='blue', linestyle='--')
    plt.axvline(section_lengths.quantile(0.95), color='blue', linestyle='--')
    plt.title(f'Answer Lengths for {col}', size=10)
    plt.xlabel('Length', size=10)
    plt.ylabel('Count', size=10)

plt.subplots_adjust(top=0.95, hspace=.4, wspace=.15)
plt.show()

In [12]:
# Lower-case words that the top-word cells below filter out before counting.
# The list is kept in its original order and still contains a few repeated
# entries (e.g. 'bol', 'by', 'tak'); repeats do not affect membership tests.
# NOTE(review): the entries look like a mix of Slovak and Czech function words
# plus a few greeting/politeness words - confirm this matches the corpus.
stop_words = [
    'a', 'aby', 'aj', 'ak', 'ako', 'ale', 'alebo', 'and', 'ani', 'áno', 'asi', 'až',
    'bez', 'bol', 'bola', 'boli', 'bolo', 'by', 'bol', 'bola', 'boli', 'bolo', 'by', 'byť',
    'cez', 'čo', 'či',
    'ďalší', 'ďalšia', 'ďalšie', 'dnes', 'do',
    'ho', 'hoci',
    'i', 'iba', 'iné', 'iný',
    'ja', 'je', 'jeho', 'jsem', 'jej', 'ju',
    'k', 'kam', 'každý', 'každá', 'každé', 'každí', 'kde', 'keď', 'kto',
    'ktorá', 'ktoré', 'ktorou', 'ktorý', 'ktorí', 'ku',
    'lebo', 'len',
    'ma', 'mať', 'má', 'máte', 'medzi', 'mi', 'mna', 'mne', 'mnou',
    'musieť', 'môcť', 'môj', 'môže', 'my',
    'na', 'nad', 'nám', 'náš', 'naši', 'ne', 'neho', 'nej', 'nemu', 'nich', 'nie', 'nič',
    'niektorý', 'niektorá', 'niektoré', 'niektorí', 'nielen', 'nim', 'nimi', 'nič', 'no',
    'o', 'od', 'odo', 'on', 'ona', 'ono', 'oni', 'ono', 'ony',
    'po', 'pod', 'podľa', 'pokiaľ', 'potom', 'práve', 'pre', 'prečo', 'pred', 'predo',
    'preto', 'pretože', 'prvý', 'prvá', 'prvé', 'prví',
    's', 'sa', 'so', 'si', 'se', 'svoj', 'svoje', 'svojich', 'svojím', 'svojími',
    'ta', 'tak', 'takže', 'taký', 'taká', 'také', 'takí', 'tam',
    'te', 'teba', 'tebe', 'tebou', 'teda', 'ten', 'tento', 'tieto', 'tiež',
    'to', 'toto', 'toho', 'tohoto', 'tom', 'tomto', 'tomuto',
    'tu', 'tú', 'tvoj', 'tvojími', 'ty', 'tý', 'tým', 'tými',
    'už',
    'v', 'vám', 'váš', 'vaši', 'veľmi', 'viac', 'vo', 'voči', 'však', 'všetok', 'vy',
    'z', 'za', 'zo', 'že',
    'den', 'dobrý', 'děkuji', 'bych', 'nebo', 'jak', 'jako', 'u', 'pro', 'tak', 'mít',
    'ale', 'tak', 'jsem', 'když', 'zda',
]

In [13]:
# For every section (most frequent first) collect the 10 most common
# alphabetic, non-stop-word tokens across its questions.
# Stop-word filtering is case-insensitive, but counting keeps the original
# casing, so differently-cased forms of a word are counted separately.

# Membership is tested once per token, so use a set instead of scanning the
# stop-word list each time; the filtering result is identical.
stop_word_set = set(stop_words)

top_words_questions = {}
for section in tqdm(df['Section'].value_counts().index):
    section_questions = df.loc[df['Section'] == section, 'Question Text']
    word_counts = Counter(
        token
        for question in section_questions
        for token in word_tokenize(question)
        if token.isalpha() and token.lower() not in stop_word_set
    )
    top_words_questions[section] = word_counts.most_common(10)

In [14]:
# One bar chart per section showing its most common question words.
# The grid is built here rather than reused from an earlier cell: the shared
# `grid` variable is rebound to a different shape further down the notebook,
# so relying on it made this cell depend on execution order.
n_cols = 3
n_rows = max(5, -(-len(top_words_questions) // n_cols))  # ceiling division, min. 5 rows
grid = gridspec.GridSpec(n_rows, n_cols)

plt.figure(figsize=(16, 6 * 4 * n_rows / 5))
plt.suptitle('Top Words in Questions by Section', size=20)
for position, (section, words) in enumerate(top_words_questions.items()):
    plt.subplot(grid[position])
    # `words` is a list of (word, count) pairs as returned by Counter.most_common.
    sns.barplot(x=[word for word, _ in words], y=[freq for _, freq in words], color='green')
    plt.title(f'Top Words in {section}', size=12)
    plt.xlabel('Word', size=10)
    plt.ylabel('Count', size=10)
    plt.xticks(rotation=90)

plt.subplots_adjust(top=0.95, hspace=.4, wspace=.15)
plt.show()

In [15]:
# For every section (most frequent first) collect the 10 most common
# alphabetic, non-stop-word tokens across its answers.
# Stop-word filtering is case-insensitive, but counting keeps the original
# casing, so differently-cased forms of a word are counted separately.

# Membership is tested once per token, so use a set instead of scanning the
# stop-word list each time; the filtering result is identical.
stop_word_set = set(stop_words)

top_words_answers = {}
for section in tqdm(df['Section'].value_counts().index):
    section_answers = df.loc[df['Section'] == section, 'Answer Text']
    word_counts = Counter(
        token
        for answer in section_answers
        for token in word_tokenize(answer)
        if token.isalpha() and token.lower() not in stop_word_set
    )
    top_words_answers[section] = word_counts.most_common(10)

In [16]:
# One bar chart per section showing its most common answer words.
# The grid is built here rather than reused from an earlier cell: the shared
# `grid` variable is rebound to a different shape further down the notebook,
# so relying on it made this cell depend on execution order.
n_cols = 3
n_rows = max(5, -(-len(top_words_answers) // n_cols))  # ceiling division, min. 5 rows
grid = gridspec.GridSpec(n_rows, n_cols)

plt.figure(figsize=(16, 6 * 4 * n_rows / 5))
plt.suptitle('Top Words in Answers by Section', size=20)
for position, (section, words) in enumerate(top_words_answers.items()):
    plt.subplot(grid[position])
    # `words` is a list of (word, count) pairs as returned by Counter.most_common.
    sns.barplot(x=[word for word, _ in words], y=[freq for _, freq in words], color='green')
    plt.title(f'Top Words in {section}', size=12)
    plt.xlabel('Word', size=10)
    plt.ylabel('Count', size=10)
    plt.xticks(rotation=90)

plt.subplots_adjust(top=0.95, hspace=.4, wspace=.15)
plt.show()

In [4]:
# Load the second Q&A dataset from its CSV file.
df_penize = pd.read_csv(filepath_or_buffer='../data/qa_penize.csv')

In [15]:
# Show the first five rows of the frame.
df_penize.head(5)

In [22]:
# Clean the dataset: keep complete, unique question/answer pairs, flatten line
# breaks, trim whitespace, derive a year column and save the result.
qa_columns = ['question_text', 'answer_text']

df_penize = (
    df_penize
    .dropna(subset=qa_columns)
    .drop_duplicates(subset=qa_columns)
    .reset_index(drop=True)
)

for qa_column in qa_columns:
    flattened = df_penize[qa_column].str.replace(r'\r\n|\n|\r', ' ', regex=True)
    df_penize[qa_column] = flattened.str.strip()

# The first run of four digits in the date string is taken as the year
# (kept as text, not converted to a number).
df_penize['year'] = df_penize['question_date'].str.extract(r'(\d{4})', expand=False)

df_penize.to_csv('../data/qa_penize_cleaned.csv', index=False)

In [17]:
# Bar chart of the number of questions per year, ordered by year, with the
# absolute count written above each bar.
question_count_by_year = df_penize['year'].value_counts().sort_index()

fig, g0 = plt.subplots(figsize=(12, 4))
fig.suptitle('Counts of Questions by Year', size=22)
sns.barplot(x=question_count_by_year.index, y=question_count_by_year.values,
            color='blue', ax=g0)
g0.set_xlabel("Year", fontsize=19)
g0.set_ylabel("Total Count", fontsize=19)
for bar in g0.patches:
    bar_height = bar.get_height()
    g0.text(bar.get_x() + bar.get_width() / 2.,
            bar_height + 3,
            '{:1.0f}'.format(bar_height),
            ha="center", fontsize=11)

# Render explicitly, consistent with the other plotting cells in this notebook.
plt.show()

In [19]:
# Tokenize questions and answers and store the token counts as new columns.
question_tokens = []
for question in tqdm(df_penize['question_text']):
    question_tokens.append(word_tokenize(question))
len_question_tokens = list(map(len, question_tokens))
df_penize['question_length'] = len_question_tokens

answer_tokens = []
for answer in tqdm(df_penize['answer_text']):
    answer_tokens.append(word_tokenize(answer))
len_answer_tokens = list(map(len, answer_tokens))
df_penize['answer_length'] = len_answer_tokens

In [21]:
# Side-by-side distributions of question and answer lengths (in tokens).
# Dashed red line = mean, dashed blue lines = 5th / 95th percentile.
grid = gridspec.GridSpec(1, 2)
plt.figure(figsize=(16, 6))

# (column to plot, panel title) - one entry per panel, left to right.
length_panels = [
    ('question_length', 'Question Lengths'),
    ('answer_length', 'Answer Lengths'),
]
for position, (length_column, panel_title) in enumerate(length_panels):
    lengths = df_penize[length_column]

    plt.subplot(grid[position])
    sns.histplot(lengths, bins=50, color='green', kde=True)
    plt.axvline(lengths.mean(), color='red', linestyle='--')
    plt.axvline(lengths.quantile(0.05), color='blue', linestyle='--')
    plt.axvline(lengths.quantile(0.95), color='blue', linestyle='--')
    plt.title(panel_title, size=12)
    plt.xlabel('Length', size=10)
    plt.ylabel('Count', size=10)

# Render explicitly, consistent with the other plotting cells in this notebook.
plt.show()