In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import os

In [None]:
# Folder that holds the per-language reference CSV files; the relative path is
# resolved against the notebook's working directory.
data_path = "../../../multilingual-tst-datasets_private/"

# Language codes processed by the analysis cells. Each code is used as a
# file-name prefix when loading data and as a key in the result dictionaries.
languages = [
    'en', 'hi', 'mag',
    'mr', 'ml', 'or',
    'pa', 'te', 'ur',
]

In [None]:
# Result containers, each keyed by language code and populated by the
# data-loading loop.
avg_num_words_per_sentence = dict()      # lang -> mean token count per text
sentence_length_distributions = dict()   # lang -> list of token counts, one per text
num_unique_words = dict()                # lang -> number of distinct tokens
word_frequency_distributions = dict()    # lang -> Counter mapping token -> occurrences
word_length_distributions = dict()       # lang -> list of character lengths, one per token

In [None]:
# Build the per-language corpus statistics.
# For every language the two reference files are loaded, each row's POSITIVE
# and NEGATIVE texts are joined into one string, and that string is tokenized
# with NLTK's word_tokenize.
# NOTE(review): every "sentence" below is therefore a POSITIVE + NEGATIVE pair
# rather than a single sentence, so the sentence-length figures cover both
# texts of a row -- confirm this is the intended unit.
for lang in languages:
    df_neg_to_pos = pd.read_csv(os.path.join(data_path, f"{lang}_yelp_reference-0.csv"))
    df_pos_to_neg = pd.read_csv(os.path.join(data_path, f"{lang}_yelp_reference-1.csv"))

    df_merged = pd.concat([df_neg_to_pos, df_pos_to_neg], ignore_index=True)

    # A row with a missing POSITIVE or NEGATIVE value would concatenate to NaN
    # (a float), which word_tokenize cannot process, so such rows are dropped.
    df_complete = df_merged.dropna(subset=['POSITIVE', 'NEGATIVE'])
    merged_text_list = (df_complete['POSITIVE'] + ' ' + df_complete['NEGATIVE']).tolist()

    sentences = [word_tokenize(text) for text in merged_text_list]
    words = [word for sentence in sentences for word in sentence]

    # Token count of every text; computed once and reused for the mean.
    sentence_lengths = [len(sentence) for sentence in sentences]
    sentence_length_distributions[lang] = sentence_lengths
    # np.mean([]) emits a RuntimeWarning; store NaN directly when there is no text.
    avg_num_words_per_sentence[lang] = np.mean(sentence_lengths) if sentence_lengths else np.nan

    word_freq = Counter(words)
    word_frequency_distributions[lang] = word_freq
    # The Counter has one key per distinct token, so its size is the vocabulary size.
    num_unique_words[lang] = len(word_freq)

    word_length_distributions[lang] = [len(word) for word in words]

In [None]:
# Average number of words per sentence
# One bar per language; the chart is written to figs/avg_words.png and the
# figure is closed instead of being shown inline.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(avg_num_words_per_sentence.keys()), list(avg_num_words_per_sentence.values()))
ax.set_xlabel('Language')
ax.set_ylabel('Average Number of Words per Sentence')
ax.set_title('Average Number of Words per Sentence by Language')
fig.tight_layout()

save_path = os.path.join(fig_dir, 'avg_words.png')
fig.savefig(save_path)
plt.close(fig)

In [None]:
# Distribution of Sentence Lengths
# One histogram (with KDE overlay) per language, written to
# figs/sent_ln_dist_<lang>.png.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

for lang, lengths in sentence_length_distributions.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(lengths, kde=True, ax=ax)
    ax.set_xlabel('Sentence Length (in words)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Sentence Length Distribution for {lang}')
    fig.tight_layout()

    save_path = os.path.join(fig_dir, f'sent_ln_dist_{lang}.png')
    fig.savefig(save_path)
    # Close each figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)

In [None]:
# Number of Unique Words
# One bar per language showing the count of distinct tokens; the chart is
# written to figs/unique_words.png.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(num_unique_words.keys()), list(num_unique_words.values()))
ax.set_xlabel('Language')
ax.set_ylabel('Number of Unique Words')
ax.set_title('Number of Unique Words by Language')
fig.tight_layout()

save_path = os.path.join(fig_dir, 'unique_words.png')
fig.savefig(save_path)
plt.close(fig)

In [None]:
# Word Frequency Distribution
# For each language, a bar chart of its 20 most frequent tokens, written to
# figs/word_freq_dist_<lang>.png.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

for lang, freq_dist in word_frequency_distributions.items():
    top_words = freq_dist.most_common(20)
    if not top_words:
        # An empty counter leaves nothing for zip(*...) to unpack; skip the plot.
        continue
    # Dedicated names keep the notebook-level `words` token list from being overwritten.
    top_tokens, top_counts = zip(*top_words)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(top_tokens, top_counts)
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Top 20 Words Frequency Distribution for {lang}')
    # Tilt the token labels so neighbouring labels do not overlap.
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    save_path = os.path.join(fig_dir, f'word_freq_dist_{lang}.png')
    fig.savefig(save_path)
    plt.close(fig)

In [None]:
# Word Length Distribution
# One histogram (with KDE overlay) of token lengths in characters per language,
# written to figs/word_ln_dist_<lang>.png.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

for lang, lengths in word_length_distributions.items():
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(lengths, kde=True, ax=ax)
    ax.set_xlabel('Word Length (in characters)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Word Length Distribution for {lang}')
    fig.tight_layout()

    save_path = os.path.join(fig_dir, f'word_ln_dist_{lang}.png')
    fig.savefig(save_path)
    # Close each figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)

In [None]:
# Language Comparison Metrics
# One row per language, in the order of `languages`, pairing the mean token
# count per text with the vocabulary size.
comparison_rows = [
    (lang, avg_num_words_per_sentence[lang], num_unique_words[lang])
    for lang in languages
]
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=['Language', 'Avg_Words_per_Sentence', 'Num_Unique_Words'],
)
print(comparison_df)

In [None]:
# Correlation Analysis
# Per language: correlation between the token count of each text and the mean
# token length (in characters) of that same text.
correlation_results = []

for lang in languages:
    # Load this language's own texts. Reusing `merged_text_list` from the
    # loading cell would pair every language's sentence lengths with the texts
    # of whichever language that loop processed last.
    lang_frames = [
        pd.read_csv(os.path.join(data_path, f"{lang}_yelp_reference-{ref_id}.csv"))
        for ref_id in (0, 1)
    ]
    # Rows with a missing POSITIVE or NEGATIVE value concatenate to NaN, which
    # word_tokenize cannot process, so they are dropped.
    lang_rows = pd.concat(lang_frames, ignore_index=True).dropna(subset=['POSITIVE', 'NEGATIVE'])
    lang_texts = (lang_rows['POSITIVE'] + ' ' + lang_rows['NEGATIVE']).tolist()

    # Tokenize once and derive both series from the same tokens so they stay
    # aligned; texts without tokens have no mean word length and are skipped.
    tokenized_texts = [word_tokenize(text) for text in lang_texts]
    tokenized_texts = [tokens for tokens in tokenized_texts if tokens]

    length_stats = pd.DataFrame({
        'Sentence_Length': [len(tokens) for tokens in tokenized_texts],
        'Word_Length': [np.mean([len(token) for token in tokens]) for tokens in tokenized_texts]
    })
    # Series.corr defaults to Pearson, the same default DataFrame.corr() uses.
    correlation = length_stats['Sentence_Length'].corr(length_stats['Word_Length'])
    correlation_results.append({
        'Language': lang,
        'Sentence_Length_to_Word_Length_Correlation': correlation
    })

correlation_df = pd.DataFrame(correlation_results)
print(correlation_df)

In [None]:
# Plot Correlation Results
# One bar per language showing the sentence-length / word-length correlation;
# the chart is written to figs/corr_snln_wdln.png.
fig_dir = 'figs/'
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Language', y='Sentence_Length_to_Word_Length_Correlation', data=correlation_df, ax=ax)
ax.set_xlabel('Language')
ax.set_ylabel('Correlation')
ax.set_title('Correlation between Sentence Length and Word Length by Language')
ax.set_ylim(-1, 1)  # Correlation values range from -1 to 1
fig.tight_layout()

save_path = os.path.join(fig_dir, 'corr_snln_wdln.png')
fig.savefig(save_path)
plt.close(fig)