In [None]:
# Step 1: Import required libraries
from datasets import load_dataset
from collections import Counter
from nltk import ngrams
import nltk

In [None]:
# Make NLTK's word tokenizer importable and fetch the 'punkt' resource.
# NOTE(review): word_tokenize is imported here but the tokenization cells
# visible in this notebook use str.split() instead.
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Step 2: Load the "en-si" configuration of the OPUS-100 dataset.
# Downloaded files are cached under ./hf_cache.
ds = load_dataset(
    "Helsinki-NLP/opus-100",
    "en-si",
    cache_dir="./hf_cache",
)

README.md: 0.00B [00:00, ?B/s]

en-si/test-00000-of-00001.parquet:   0%|          | 0.00/155k [00:00<?, ?B/s]

en-si/train-00000-of-00001.parquet:   0%|          | 0.00/65.8M [00:00<?, ?B/s]

en-si/validation-00000-of-00001.parquet:   0%|          | 0.00/153k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/979109 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Step 3: Extract Sinhala training texts
# (the "si" entry of every translation pair in the train split)
train_texts_si = [
    pair["si"]
    for pair in ds["train"]["translation"]
]

In [None]:
# Step 4: Tokenize words
# Only the first 300,000 Sinhala sentences are kept. Each sentence is split on
# runs of whitespace, so punctuation stays attached to the neighbouring word.
import re

# NOTE(review): this pattern is defined but not applied by the split below.
whitespace_pattern = r" ?[\u0D80-\u0DFF]+| ?\d+| ?[^\s\u0D80-\u0DFF\d]+"

train_texts_si_head = train_texts_si[:300_000]
tokenized_texts = [sentence.split() for sentence in train_texts_si_head]

In [None]:
# Step 4 (cont.): Flatten the per-sentence token lists into one flat list,
# preserving sentence order, so unigrams can be counted over the whole sample.
all_tokens = []
for sentence_tokens in tokenized_texts:
    all_tokens.extend(sentence_tokens)


In [None]:
# Step 5: Count unigrams
# Counts every whitespace-delimited Sinhala token and tabulates the most
# frequent ones (at most 50 rows).
import pandas as pd

unigram_counts = Counter(all_tokens)
df_unigrams = pd.DataFrame(unigram_counts.most_common(50), columns=['Unigram', 'Count'])

# The header is derived from the table itself so it always matches the number
# of rows shown (the old header said "Top 10" above a 50-row table).
print(f"Top {len(df_unigrams)} Unigrams:")
print(df_unigrams)

Top 10 Unigrams:
   Unigram  Count
0       මම  33751
1        -  28066
2      ඔයා  27453
3       මට  21642
4       ඒක  16536
5       මේ  13658
6      අපි  13256
7     ඔයාට  12321
8        ඒ  10591
9    කරන්න  10044
10     මගේ   9484
11     එයා   8439
12      එක   8097
13       ?   6648
14     නෑ.   6444
15      මං   6244
16     මේක   6100
17     ගැන   5890
18       .   5673
19      නෑ   5510
20    අපිට   5476
21    දැන්   5427
22   කියලා   5360
23   ඔයාගේ   4903
24    යන්න   4895
25     හරි   4862
26   වෙන්න   4609
27     ඒත්   4342
28  කියලා.   4319
29    එක්ක   4034
30    ඉන්න   3935
31     මාව   3750
32    තමයි   3747
33    මොකද   3554
34     ඔහු   3504
35       ,   3477
36   දෙයක්   3405
37     වගේ   3357
38     ඇයි   3350
39    එහෙම   3310
40     කතා   3298
41     නම්   3293
42       !   3280
43   නැහැ.   3278
44    ඔයාව   3056
45     අපේ   3045
46  කියන්න   2997
47    ගන්න   2982
48    වෙලා   2938
49    එයාට   2882


In [None]:
# Step 6: Count bigrams
# Bigrams are generated per sentence, so no bigram spans a sentence boundary.
all_bigrams = [bigram for sublist in tokenized_texts for bigram in ngrams(sublist, 2)]
bigram_counts = Counter(all_bigrams)

# Stored in its own frame with a 'Bigram' column; the table was previously
# written to df_unigrams under a 'Unigram' column, which clobbered the unigram
# table and mislabeled the output.
df_bigrams = pd.DataFrame(bigram_counts.most_common(50), columns=['Bigram', 'Count'])

# Header reflects the real number of rows (it used to say "Top 10" for 50 rows).
print(f"\nTop {len(df_bigrams)} Bigrams:")
print(df_bigrams)


Top 10 Bigrams:
             Unigram  Count
0            (-, මම)   1629
1           (-, ඔයා)   1262
2         (මම, ඔයාට)   1221
3          (ඒ, වගේම)   1164
4         (City, in)    970
5            (-, මට)    927
6          (ඔයා, මට)    907
7         (ඇයි, ඔයා)    794
8       (මම, දන්නවා)    779
9       (කතා, කරන්න)    770
10          (මම, ඒක)    752
11          (ඒ, ගැන)    750
12     (මම, හිතන්නේ)    734
13          (මට, ඒක)    618
14           (-, ඒක)    611
15         (ඔයා, ඒක)    610
16          (මම, මේ)    607
17        (මම, ඔයාව)    602
18     (මම, හිතන්නෙ)    512
19          (-, අපි)    496
20         (-, ඔයාට)    482
21        (ඔයා, මාව)    464
22       (ඒක, කරන්න)    453
23        (ඒක, තමයි)    450
24         (ඒත්, මම)    433
25      (මාත්, එක්ක)    422
26      (මොකක්, හරි)    421
27        (ඔයා, ගැන)    418
28  (කරන්න, පුළුවන්)    406
29         (ඔයා, මේ)    404
30       (මම, දන්නේ)    401
31      (මට, කියන්න)    391
32        (ඔයාට, ඒක)    389
33     (ඔයා, දන්නවද)    385
34 

In [None]:
# Step 7: Count trigrams
# Trigrams are generated per sentence, so no trigram spans a sentence boundary.
all_trigrams = [trigram for sublist in tokenized_texts for trigram in ngrams(sublist, 3)]
trigram_counts = Counter(all_trigrams)
df_trigrams = pd.DataFrame(trigram_counts.most_common(50), columns=['Trigram', 'Count'])

# Header reflects the real number of rows (it used to say "Top 10" for 50 rows).
print(f"\nTop {len(df_trigrams)} Trigrams:")
print(df_trigrams)


Top 10 Trigrams:
                          Trigram  Count
0               (මම, දන්නවා, ඔයා)    146
1                   (ඒ, වගේම, මම)    135
2                 (මම, දන්නෙ, නෑ)    129
3                    (මම, ඒ, ගැන)    105
4                  (මම, කියන, දේ)    105
5                 (මම, දන්නේ, නෑ)    101
6                (ඒකට, කමක්, නෑ.)     93
7              (එක්ක, කතා, කරන්න)     89
8               (මම, දන්නේ, නැහැ)     89
9                  (මම, ඔයා, ගැන)     85
10                (මට, ඒක, කරන්න)     82
11                   (මට, ඒ, ගැන)     80
12              (මට, උදව්, කරන්න)     78
13                 (ඇයි, ඔයා, මට)     77
14          (මාව, විශ්වාස, කරන්න)     77
15                  (ඒ, වගේම, මට)     73
16              (මම, හිතන්නේ, මම)     69
17               (වෙන, එකක්, නෑ.)     69
18                 (මං, කියන, දේ)     67
19             (මම, හිතන්නේ, ඔයා)     67
20                  (-, ඇයි, ඔයා)     66
21         (මාව, විශ්වාස, කරන්න.)     63
22              (ගැන, කතා, කරන්න)     6

In [None]:
# Step 3 (English): Extract English training texts
# (the "en" entry of every translation pair in the train split)
train_texts_en = [pair["en"] for pair in ds["train"]["translation"]]

In [None]:
# Step 4 (English): Tokenize words
# Only the first 300,000 English sentences are kept. Each sentence is split on
# runs of whitespace, so punctuation stays attached to the neighbouring word.
import re

train_texts_en_head = train_texts_en[:300_000]
tokenized_texts_en = [sentence.split() for sentence in train_texts_en_head]

In [None]:
# Step 4 (English, cont.): Flatten the per-sentence token lists into one flat
# list, preserving sentence order, for unigram counting.
all_tokens_en = []
for sentence_tokens in tokenized_texts_en:
    all_tokens_en.extend(sentence_tokens)


In [None]:
# Step 5 (English): Count unigrams
# NOTE: this reassigns unigram_counts and df_unigrams, replacing the Sinhala
# results computed earlier in the notebook.
unigram_counts = Counter(all_tokens_en)
df_unigrams = pd.DataFrame(unigram_counts.most_common(50), columns=['Unigram', 'Count'])

# Header reflects the real number of rows (it used to say "Top 10" for 50 rows).
print(f"Top {len(df_unigrams)} Unigrams:")
print(df_unigrams)

Top 10 Unigrams:
   Unigram  Count
0      the  50326
1        I  46269
2       to  42586
3      you  42120
4        a  32175
5        -  31294
6       of  20097
7       is  17874
8       in  16154
9      and  16037
10     You  14731
11    that  13131
12    your  12370
13     for  11569
14    have  10866
15      it  10782
16     I'm  10686
17      my  10669
18     are  10026
19    this   9593
20      be   9525
21     was   9065
22      me   9048
23     not   8821
24    with   8557
25      on   8360
26      we   8287
27      do   7719
28   don't   7325
29    What   7017
30     get   6662
31    know   6573
32      he   6393
33     The   6289
34    It's   6173
35    what   6166
36    just   6138
37     And   6021
38     all   5772
39     can   5678
40    you.   5578
41    like   5516
42    will   5496
43      We   5461
44   about   5226
45      at   4937
46     it.   4666
47     got   4652
48     out   4584
49     me.   4440


In [None]:
# Step 6 (English): Count bigrams
# Bigrams are generated per sentence, so no bigram spans a sentence boundary.
# NOTE: all_bigrams and bigram_counts are reassigned here, replacing the
# Sinhala results computed earlier in the notebook.
all_bigrams = [bigram for sublist in tokenized_texts_en for bigram in ngrams(sublist, 2)]
bigram_counts = Counter(all_bigrams)

# Stored in its own frame with a 'Bigram' column; the table was previously
# written to df_unigrams under a 'Unigram' column, which clobbered the unigram
# table and mislabeled the output.
df_bigrams = pd.DataFrame(bigram_counts.most_common(50), columns=['Bigram', 'Count'])

# Header reflects the real number of rows (it used to say "Top 10" for 50 rows).
print(f"\nTop {len(df_bigrams)} Bigrams:")
print(df_bigrams)


Top 10 Bigrams:
          Unigram  Count
0       (in, the)   4149
1      (I, don't)   3258
2       (of, the)   3033
3       (to, the)   2538
4      (are, you)   2503
5        (to, be)   2243
6      (want, to)   2226
7       (on, the)   2204
8       (do, you)   2116
9      (have, to)   2112
10       (I, was)   2048
11    (going, to)   1974
12         (-, I)   1969
13     (This, is)   1915
14      (I, have)   1873
15      (out, of)   1776
16      (I, know)   1542
17     (need, to)   1511
18      (have, a)   1462
19        (I, am)   1461
20      (to, get)   1432
21      (Do, you)   1395
22      (you, to)   1390
23     (I, think)   1385
24     (Are, you)   1384
25       (-, You)   1320
26        (is, a)   1309
27     (I'm, not)   1291
28     (for, the)   1289
29       (I, can)   1260
30      (I, want)   1251
31        (in, a)   1183
32    (you, know)   1167
33      (at, the)   1165
34      (if, you)   1126
35     (I, can't)   1103
36      (is, the)   1100
37     (this, is)   1094
38    (y

In [None]:
# Step 7 (English): Count trigrams
# Trigrams are generated per sentence, so no trigram spans a sentence boundary.
# NOTE: all_trigrams, trigram_counts and df_trigrams are reassigned here,
# replacing the Sinhala results computed earlier in the notebook.
all_trigrams = [trigram for sublist in tokenized_texts_en for trigram in ngrams(sublist, 3)]
trigram_counts = Counter(all_trigrams)
df_trigrams = pd.DataFrame(trigram_counts.most_common(50), columns=['Trigram', 'Count'])

# Header reflects the real number of rows (it used to say "Top 10" for 50 rows).
print(f"\nTop {len(df_trigrams)} Trigrams:")
print(df_trigrams)


Top 10 Trigrams:
                Trigram  Count
0      (What, are, you)    842
1       (What, do, you)    785
2      (I, don't, know)    672
3       (you, want, to)    547
4         (I, want, to)    530
5      (I, don't, want)    429
6       (Why, are, you)    413
7         (I, have, to)    382
8          (a, lot, of)    380
9         (I, need, to)    377
10    (don't, want, to)    356
11     (I'm, going, to)    345
12      (want, you, to)    338
13    (are, you, doing)    334
14      (Do, you, know)    308
15         (I, have, a)    306
16    (I, don't, think)    301
17     (do, you, think)    296
18    (What, the, hell)    293
19          (to, be, a)    282
20        (-, I, don't)    280
21       (I, want, you)    267
22       (out, of, the)    266
23       (get, out, of)    260
24       (We, need, to)    258
25      (going, to, be)    256
26      (This, is, the)    254
27    (Why, don't, you)    241
28  (don't, know, what)    240
29       (How, do, you)    233
30     (out, of, here