In [None]:
!pip -q install langdetect

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import scipy.stats as stats
import re
import nltk
from nltk.corpus import words
from collections import Counter
import math
import unicodedata
import time
from tqdm import tqdm
import pickle
from langdetect import detect
import warnings

In [None]:
# Load the course table and the review table from the Kaggle input directory.
courses_df = pd.read_csv('/kaggle/input/course-reviews-on-coursera/Coursera_courses.csv')
reviews_df = pd.read_csv('/kaggle/input/course-reviews-on-coursera/Coursera_reviews.csv')
# Inner join on course_id: only rows whose course_id appears in both tables are kept.
merged_df = pd.merge(courses_df, reviews_df, on='course_id', how='inner')

In [None]:
def calculate_entropy(text):
    """Calculate the Shannon entropy (bits per character) of ``text``.

    The text is lower-cased before counting, so the measure is
    case-insensitive. It is used to detect random-looking text.

    Args:
        text: The text to score. Any value that is not a non-empty ``str``
            (``None``, ``NaN``, ``pd.NA``, numbers, ...) is treated as
            having no content.

    Returns:
        The entropy of the character distribution, or ``0`` for missing,
        non-string, or empty input.
    """
    # Check the type *before* the truthiness: `not pd.NA` raises TypeError and
    # `not <array>` raises ValueError, so the order of these two tests matters.
    if not isinstance(text, str) or not text:
        return 0

    lowered = text.lower()
    # Measure the length after lower-casing, since that is the string being counted.
    length = len(lowered)
    char_counts = Counter(lowered)
    return -sum(
        (count / length) * math.log2(count / length)
        for count in char_counts.values()
    )

In [None]:
# Silence UserWarning messages for cleaner output. NOTE: the filter has no
# module argument, so it ignores every UserWarning in the process, not only
# those raised by langdetect.
warnings.filterwarnings("ignore", category=UserWarning)

def detect_language(text):
    """Return the language detected for ``text``, or ``'unknown'``.

    Args:
        text: The text to classify. Non-string values (``None``, ``NaN``,
            numbers, ...) and strings with fewer than 3 characters after
            stripping whitespace are not passed to the detector.

    Returns:
        Whatever langdetect's ``detect`` returns for the text (presumably an
        ISO 639-1 style code - confirm against the langdetect docs), or the
        string ``'unknown'`` when the input is unusable or detection fails.
    """
    # NOTE(review): langdetect documents its results as non-deterministic unless
    # DetectorFactory.seed is set; consider seeding once in the config cell.
    if not isinstance(text, str) or len(text.strip()) < 3:
        return 'unknown'  # Missing, non-string, or very short text
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback for detection errors. Deliberately not a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate and a long
        # `.apply` over many reviews can be interrupted.
        return 'unknown'

In [None]:
def get_df_with_entropy_and_language_tag(df, review_col='reviews',
                                         output_path='df_w_langs_entropies.pkl'):
    """Add ``entropy`` and ``language`` columns computed from the review text.

    Args:
        df: DataFrame holding the reviews. It is modified in place (the two
            columns are assigned on it), so pass a copy to keep the original.
        review_col: Name of the column containing the review text.
        output_path: Where to pickle the tagged frame. Defaults to
            ``'df_w_langs_entropies.pkl'`` in the working directory; pass
            ``None`` to skip writing a file.

    Returns:
        The same ``df`` object, now carrying the ``entropy`` and ``language``
        columns.
    """
    df['entropy'] = df[review_col].apply(calculate_entropy)
    df['language'] = df[review_col].apply(detect_language)
    # Persist the result so the per-row tagging does not have to be repeated.
    if output_path is not None:
        df.to_pickle(output_path)
    return df

In [None]:
# Get the distribution of languages detected by langdetect
def plot_language_distribution(df, review_col='reviews'):
    """Print and plot the number of reviews per detected language.

    Reads the ``language`` column of ``df``, prints the per-language counts
    as a markdown table, draws a bar chart with the count written above each
    bar, saves it as ``num_reviews_language.png`` and shows it.

    Args:
        df: DataFrame with a ``language`` column.
        review_col: Accepted but not used by this function.

    Returns:
        None.
    """
    # One row per language with its number of reviews.
    per_language = (
        df['language']
        .value_counts()
        .rename_axis('language')
        .reset_index(name='count')
    )

    print("Language Distribution:")
    print(per_language.to_markdown())

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=per_language, x='language', y='count')
    ax.set_title('Number of Reviews by Language')
    ax.set_xlabel('Language (ISO 639-1 Code)')
    ax.set_ylabel('Number of Reviews')
    plt.xticks(rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

    # Bars sit at x = 0, 1, 2, ... in row order; write each count above its bar.
    for position, n_reviews in enumerate(per_language['count']):
        ax.text(position, n_reviews, n_reviews, ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('num_reviews_language.png')
    plt.show()

In [None]:
# Actually run the pipeline. A copy of merged_df is passed because the tagging
# function assigns new columns on the frame it receives; the language
# distribution is then printed and plotted from the tagged copy.
df_entropy_lang = get_df_with_entropy_and_language_tag(merged_df.copy(), review_col='reviews')
plot_language_distribution(df_entropy_lang)