In [None]:
!apt-get install -y build-essential git wget
!git clone https://github.com/moses-smt/giza-pp.git
%cd giza-pp
!make -j4
%cd ..


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
git is already the newest version (1:2.34.1-1ubuntu1.15).
wget is already the newest version (1.21.2-2ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
fatal: destination path 'giza-pp' already exists and is not an empty directory.
/content/giza-pp
make -C GIZA++-v2
make -C mkcls-v2
make[1]: Entering directory '/content/giza-pp/mkcls-v2'
make[1]: 'mkcls' is up to date.
make[1]: Leaving directory '/content/giza-pp/mkcls-v2'
make[1]: Entering directory '/content/giza-pp/GIZA++-v2'
make[1]: Nothing to be done for 'opt'.
make[1]: Leaving directory '/content/giza-pp/GIZA++-v2'
/content


In [None]:
def tokenize_file(inp, out):
    with open(inp, "r", encoding="utf-8") as fi, open(out, "w", encoding="utf-8") as fo:
        for line in fi:
            fo.write(" ".join(line.strip().split()) + "\n")

tokenize_file("/content/src.txt", "src.tok")
tokenize_file("/content/tgt.txt", "tgt.tok")

print("Tokenized → src.tok, tgt.tok")


Tokenized → src.tok, tgt.tok


In [None]:
%%bash
# Input tokenized files
SRC=src.tok
TGT=tgt.tok

# Working directory
WORKDIR=giza_work
PREFIX=corpus

# Make work directory and enter it
mkdir -p $WORKDIR
cd $WORKDIR

# Copy tokenized files
cp ../$SRC .
cp ../$TGT .

# Step 1: Convert to SNT format
echo "Running plain2snt..."
../giza-pp/GIZA++-v2/plain2snt.out $SRC $TGT

echo "Generated SNT files:"
ls -lh *.snt

# Auto-detect correct .snt file generated by plain2snt
SNT_FILE=$(ls | grep -E "${SRC%.*}_${TGT%.*}.snt")
echo "Using SNT file: $SNT_FILE"

# Step 2: Create co-occurrence file
echo "Building co-occurrence file..."
../giza-pp/GIZA++-v2/snt2cooc.out ${SRC}.vcb ${TGT}.vcb $SNT_FILE > ${PREFIX}.cooc

echo "Cooc file size:"
ls -lh ${PREFIX}.cooc

# Step 3: Train GIZA++
echo "Running GIZA++..."
../giza-pp/GIZA++-v2/GIZA++ \
  -S ${SRC}.vcb \
  -T ${TGT}.vcb \
  -C $SNT_FILE \
  -CoocurrenceFile ${PREFIX}.cooc \
  -o $PREFIX

echo "========================================="
echo " DONE! Alignment file created:"
echo "   giza_work/corpus.A3.final"
echo "========================================="


Running plain2snt...
src -> src
tgt -> tgt
Generated SNT files:
-rw-r--r-- 1 root root 7.9K Nov 27 21:26 src_tgt.snt
-rw-r--r-- 1 root root 7.9K Nov 27 21:26 tgt_src.snt
Using SNT file: src_tgt.snt
Building co-occurrence file...
Cooc file size:
-rw-r--r-- 1 root root 131K Nov 27 21:26 corpus.cooc
Running GIZA++...
 DONE! Alignment file created:
   giza_work/corpus.A3.final


w1:src w2:tgt
Vocabulary does not exist.
Vocabulary does not exist.
END.
bash: line 42: 23749 Segmentation fault      (core dumped) ../giza-pp/GIZA++-v2/GIZA++ -S ${SRC}.vcb -T ${TGT}.vcb -C $SNT_FILE -CoocurrenceFile ${PREFIX}.cooc -o $PREFIX




---



---



In [None]:
GITHUB_TOKEN = ""


In [2]:
from getpass import getpass

token = getpass('Enter GitHub Token: ')
repo_url = "https://github.com/rashiranjan22/IndicUD-Data.git"

clone_url = f"https://{token}:x-oauth-basic@github.com/rashiranjan22/IndicUD-Data.git"

!git clone $clone_url


Enter GitHub Token: ··········
Cloning into 'IndicUD-Data'...
remote: Enumerating objects: 1474, done.[K
remote: Counting objects: 100% (1474/1474), done.[K
remote: Compressing objects: 100% (1471/1471), done.[K
remote: Total 1474 (delta 3), reused 1470 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (1474/1474), 497.60 KiB | 5.08 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Filtering content: 100% (1449/1449), 326.79 MiB | 13.51 MiB/s, done.


In [4]:
# ==========================================
# STEP 2 — MERGING SCRIPT
# ==========================================
import os
from pathlib import Path

ROOT = Path("/content/IndicUD-Data/Hindi/by_file")
OUTPUT = "/content/hindi_final_merged.conllu"

DOMAIN_ORDER = [
    "AGRICULTURE",
    "BOX-OFFICE",
    "CONVERSATIONAL",
    "CRICKET",
    "DISEASE",
    "ENTERTAINMENT",
    "GADGET",
    "JUDICIARY",
    "NEWS-ARTICLES",
    "RECIPE",
    "TOURISM"
]


def split_sentences(filepath):
    """Return list of full raw sentence blocks (including sent_id, text, tokens)."""
    blocks = []
    current = []

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip() == "":
                if current:
                    blocks.append(current)
                    current = []
            else:
                current.append(line.rstrip("\n"))

    if current:
        blocks.append(current)

    return blocks


global_index = 0
output_lines = []


for domain in DOMAIN_ORDER:
    domain_path = ROOT / domain
    conllu_files = sorted(domain_path.glob("*.conllu"))

    for file in conllu_files:

        # SKIP 1 INDEX WHEN NEW FILE STARTS
        global_index += 1

        sentences = split_sentences(file)

        for sentence_block in sentences:
            # Add metadata line
            output_lines.append(f"# Sentence pair {global_index}")
            global_index += 1

            # Add original content
            for line in sentence_block:
                output_lines.append(line)

            output_lines.append("")   # blank line between sentences


# Write final output
with open(OUTPUT, "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("MERGING COMPLETE!")
print("Output saved at:", OUTPUT)

MERGING COMPLETE!
Output saved at: /content/hindi_final_merged.conllu
