# In [None]:
# Group Assignment - UN General Debates Dataset Analysis
# Student Name: Raju Dubey
# Group Number: 26
# Dataset: UN General Debates Dataset
# Date: December 12, 2025

# ============================================================================
# PART I: SENTENCE COMPLETION USING N-GRAM MODEL
# ============================================================================

# Replace or run this cell to get a robust version of the bigram completion code

import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')


def load_dataset(path):
    """Read the CSV file at *path* into a pandas DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.

    Raises:
        FileNotFoundError: If ``pd.read_csv`` reports that the file is missing.
        RuntimeError: For any other failure while reading the CSV.

    Both re-raised exceptions are chained to the original error (``from e``)
    so the underlying cause stays available on ``__cause__`` and is shown as
    the direct cause in the traceback.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"CSV not found at: {path}") from e
    except Exception as e:
        # Boundary wrapper: surface every other read/parse failure uniformly.
        raise RuntimeError(f"Failed to read CSV: {e}") from e


def tokenize_text(text):
    """Split *text* into lowercase word tokens.

    The text is lowercased, every character that is neither a word character
    (``\\w``) nor whitespace is deleted, and the remainder is split on
    whitespace. Deleted characters are removed without a replacement, so
    words joined only by punctuation are merged ("well-known" becomes
    "wellknown").

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when *text* holds no word characters.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    words = without_punctuation.split()
    return words


def build_bigram_model(tokens):
    """Count how often each token is immediately followed by each other token.

    Args:
        tokens: Any iterable of hashable tokens (a list, tuple or generator).
            It is consumed in a single pass, so it does not need to support
            ``len()`` or indexing.

    Returns:
        A ``defaultdict(Counter)`` where ``result[w1][w2]`` is the number of
        times ``w2`` directly follows ``w1``. The result is empty when fewer
        than two tokens are supplied.
    """
    bigram_counts = defaultdict(Counter)
    token_iter = iter(tokens)
    try:
        previous = next(token_iter)
    except StopIteration:
        # No tokens at all: there are no pairs to count.
        return bigram_counts
    # Walk the stream once, pairing each token with the one before it.
    for current in token_iter:
        bigram_counts[previous][current] += 1
        previous = current
    return bigram_counts


def recommend_next_words(bigram_counts, last_word, top_n=3):
    """Return the most frequent followers of *last_word*.

    Args:
        bigram_counts: Mapping from a word to a ``Counter`` of the words that
            follow it.
        last_word: The word whose followers are requested.
        top_n: Maximum number of suggestions to return.

    Returns:
        A list of ``(word, count)`` pairs as given by
        ``Counter.most_common(top_n)``, or an empty list when *last_word* is
        not a key of *bigram_counts*.
    """
    # Membership test first so a defaultdict is never grown by the lookup.
    if last_word not in bigram_counts:
        return []
    followers = bigram_counts[last_word]
    return followers.most_common(top_n)


def main(csv_path, test_sentence="it is a pleasure", top_n=3):
    """Build a bigram model from a CSV's 'text' column and print completions.

    Loads the dataset, joins every non-null 'text' value into one corpus,
    tokenizes it, builds the bigram counts and prints the most frequent
    continuations of the last word of *test_sentence*.

    Args:
        csv_path: Path of the CSV file to load.
        test_sentence: Sentence whose last word is used for prediction.
        top_n: Number of next-word recommendations to print.

    Raises:
        KeyError: If the dataset has no 'text' column.
        ValueError: If *test_sentence* contains no word tokens.
        FileNotFoundError, RuntimeError: Propagated from ``load_dataset``.
    """
    df = load_dataset(csv_path)
    print("Dataset shape:", df.shape)
    print("Columns:", df.columns.tolist())

    if 'text' not in df.columns:
        # Column labels are not guaranteed to be strings, so convert them
        # before joining; otherwise building this message raises TypeError.
        raise KeyError(
            "Column 'text' not found in dataset. Available columns: "
            + ", ".join(map(str, df.columns))
        )

    # Build the corpus from every non-null text entry.
    # NOTE: all rows are joined into one string, so the last word of one row
    # and the first word of the next are also counted as a bigram.
    # For very large datasets, sample `texts` here before joining.
    texts = df['text'].dropna().astype(str)
    corpus = " ".join(texts.tolist())

    tokens = tokenize_text(corpus)
    print(f"Total tokens: {len(tokens):,}")

    bigram_counts = build_bigram_model(tokens)
    print(f"Unique words in model: {len(bigram_counts):,}")

    # The prediction is conditioned only on the final word of the sentence.
    sentence_tokens = tokenize_text(test_sentence)
    if not sentence_tokens:
        raise ValueError("test_sentence must contain at least one word")
    last_word = sentence_tokens[-1]
    print(f"\nTest sentence: {test_sentence}")
    print(f"Last word for prediction: '{last_word}'")

    recs = recommend_next_words(bigram_counts, last_word, top_n=top_n)
    if recs:
        print("\nTop recommendations:")
        for i, (w, cnt) in enumerate(recs, 1):
            print(f"{i}. {w} (count={cnt})  -> '{test_sentence} {w}'")
    else:
        print(f"No suggestions found for '{last_word}'")

    # Show the five most frequent bigrams starting with the last word, if any.
    if last_word in bigram_counts:
        print(f"\nTop bigrams starting with '{last_word}':")
        for w, cnt in bigram_counts[last_word].most_common(5):
            print(f"  '{last_word} {w}': {cnt}")


# Adjust this path to point at your local copy of the dataset.
csv_path = r"C:\Users\rajud\Downloads\archive\un-general-debates.csv"

# Run the analysis only when executed as a script or notebook cell, so that
# importing this module does not trigger a full dataset load.
if __name__ == "__main__":
    main(csv_path)