In [1]:
# Task 1. Tokenize your dataset
import os
from collections import Counter
import re

directory = "/Users/sokratbashirov/Desktop/NLP/Proj1"


# Function to tokenize a document
def tokenize_document(document):
    words = [word.strip('“”“!"#&\'()*+,-./:;<=>?@[\\]^_`{|}~') for word in document.split()]
    words = [word for word in words if word]

    return words


# Function to tokenize all documents in the specified directory
def tokenize_dataset(directory):
    all_tokens = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = tokenize_document(text)
                all_tokens.extend(tokens)
    return all_tokens

# Function to calculate token and type frequency
def calculate_frequency(tokens):
    token_freq = Counter(tokens)
    types = len(token_freq)
    tokens_count = sum(token_freq.values())
    return types, tokens_count, token_freq

# Main function
def main():
    
    # Tokenize the dataset
    tokens = tokenize_dataset(directory)
    
    # Calculate token and type frequency
    types, tokens_count, token_freq = calculate_frequency(tokens)
    
    # Print results
    print("Number of types:", types)
    print("Number of tokens:", tokens_count)
    print("Token frequency:")
    for token, freq in token_freq.most_common(10): # Print top 10 most frequent tokens
        print(f"{token}: {freq}")

if __name__ == "__main__":
    main()


Number of types: 7746
Number of tokens: 55208
Token frequency:
və: 1696
üçün: 524
–: 484
•: 446
2: 366
1: 314
ilə: 306
istifadə: 306
bir: 284
3: 282


In [4]:
# Task 2. Test Heaps law
import numpy as np
from scipy.optimize import curve_fit

# Function to fit Heap's Law equation
def heaps_law(T, k, beta):
    return k * np.power(T, beta)

types = 7746
tokens = 27604

# Generate array of tokens
max_tokens = tokens * 2  
token_values = np.arange(1, max_tokens, max_tokens // 100) 

# Fit Heap's Law to the data
popt, pcov = curve_fit(heaps_law, token_values, types)

# Extract fitted parameters
k_fit, beta_fit = popt

print("Fitted k:", k_fit)
print("Fitted beta:", beta_fit)


Fitted k: 7745.999980684839
Fitted beta: 2.529396789542398e-10


In [16]:
# Task3 - BPE tokenizer
import os
from collections import Counter

# Function to read all text files in a directory
def read_corpus(directory):
    corpus = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                corpus.append(file.read())
    return corpus

# Function to compute byte pair frequencies in the corpus
def get_pair_freqs(corpus):
    pair_freqs = Counter()
    for text in corpus:
        tokens = text.split()
        for token in tokens:
            for i in range(len(token) - 1):
                pair_freqs[token[i], token[i + 1]] += 1
    return pair_freqs

# Function to merge most frequent byte pair
def merge_pairs(corpus, pair):
    new_corpus = []
    for text in corpus:
        new_text = text.replace(''.join(pair), ' '.join(pair))
        new_corpus.append(new_text)
    return new_corpus

# Function to tokenize using BPE
def tokenize_bpe(corpus, num_merges):
    for _ in range(num_merges):
        pair_freqs = get_pair_freqs(corpus)
        most_common_pair = pair_freqs.most_common(1)[0][0]
        corpus = merge_pairs(corpus, most_common_pair)
    return corpus

# Main function
def main():
    num_merges = 100 # Number of BPE merges
    corpus = read_corpus(directory)
    tokenized_corpus = tokenize_bpe(corpus, num_merges)
    
    # Print tokenized corpus
    for i, text in enumerate(tokenized_corpus):
        print(f"Tokenized text {i}:")
        print(text)

if __name__ == "__main__":
    main()


Tokenized text 0:
Azə rb a yca n  Re spubl i ka s ı  Təhs i l  Nazi r l i yi

Təhs i l   İns t i tutu
Peşə ixt i s a s t əhs i l i şöb ə s i




         İlk peşə-ixt i s a s t əhs i l i m üə s s i s ə l ə r i üçü n
“ Kompüt e r üzr ə ope r a to r, d iza yne r” ixt i s a s ı üzr ə
“Si s t e m p r oqr a m t ə m i n a tı” fə nn i n i n
NÜMUNƏVİ TƏDRİS PROQRAMI
          










Bakı – 2017




İzaha t v ə r əqi
Kompüt e r i n p r oqr a m t ə m i n a tı (PT) – t ə l i m a t l a r yığımı n d a n ib a r ə t o l ub, kompüt e r i id a r ə e d i r, l azım o l a n m ə s ə l ə l ə r i h ə l l e d i r v ə ye r i n ə ye t i r d i kl ə r i fu nks i y a l a r a gör ə s i s t e m p r oqr a m t ə m i n a tı n a v ə t ə tb iqi p r oqr a m t ə m i n a tı n a bölü nür .
Tə tb iqi p r oqr a m t ə m i n a tı i s t i fa d əçi n i n h ə r ha ns ı m ə s ə l ə n i h ə l l e tm ə k m əqs ə d i l ə y a r a d ı l ı r v ə s i s t e m p r oqr a ml a r ı n ı n id a r ə s i a ltı n d a i şl əyi r . 
Si s t e m p 

In [17]:
# Task4 - Sentence segmentation

import re
import os

def sentence_segmentation(text):
    # Define regular expression pattern to match sentence-ending punctuation
    punctuation_pattern = r'(?<=[.!?]) +'

    # Split text into sentences based on punctuation
    sentences = re.split(punctuation_pattern, text)

    # Clean up sentences (remove leading/trailing whitespace)
    sentences = [sentence.strip() for sentence in sentences]

    # Filter out sentences that consist only of digits
    sentences = [sentence for sentence in sentences if not re.match(r'^\d+$', sentence)]

    return sentences

def read_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()

def process_corpus(directory):
    segmented_sentences = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            text = read_file(filepath)

            # Split text into sentences based on line breaks
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            for line in lines:
                # Perform sentence segmentation on each line
                sentences = sentence_segmentation(line)
                segmented_sentences.extend(sentences)
    return segmented_sentences

# Process the corpus and perform sentence segmentation
segmented_sentences = process_corpus(directory)

# Print segmented sentences
for i, sentence in enumerate(segmented_sentences, start=1):
    print(f"Sentence {i}: {sentence}")
    
#Starts from Document 6179

Sentence 1: Azərbaycan  Respublikası  Təhsil  Nazirliyi
Sentence 2: Təhsil   İnstitutu
Sentence 3: Peşə ixtisas təhsili şöbəsi
Sentence 4: İlk peşə-ixtisas təhsili müəssisələri üçün
Sentence 5: “ Kompüter üzrə operator, dizayner” ixtisası üzrə
Sentence 6: “Sistem proqram təminatı” fənninin
Sentence 7: NÜMUNƏVİ TƏDRİS PROQRAMI
Sentence 8: Bakı – 2017
Sentence 9: İzahat vərəqi
Sentence 10: Kompüterin proqram təminatı (PT) – təlimatlar yığımından ibarət olub, kompüteri idarə edir, lazım olan məsələləri həll edir və yerinə yetirdikləri funksiyalara görə sistem proqram təminatına və tətbiqi proqram təminatına bölünür.
Sentence 11: Tətbiqi proqram təminatı istifadəçinin hər hansı məsələni həll etmək məqsədilə yaradılır və sistem proqramlarının idarəsi altında işləyir.
Sentence 12: Sistem proqram təminatı (SPT) kompüterdə informasiyanın emalı prosesinin təşkili ilə yanaşı tətbiqi proqramlar üçün normal mühiti təmin edir.
Sentence 13: Sistem proqram təminatı kompüterin aparat vasitələri ilə sı

In [18]:
# Task5 - Spelling checker with Levenshtein distance.

import os
import numpy as np

# Function to calculate Levenshtein distance between two strings
def levenshtein_distance(s1, s2):
    m, n = len(s1), len(s2)
    dp = np.zeros((m+1, n+1))
    for i in range(m+1):
        dp[i, 0] = i
    for j in range(n+1):
        dp[0, j] = j
    for i in range(1, m+1):
        for j in range(1, n+1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            dp[i, j] = min(dp[i-1, j] + 1, dp[i, j-1] + 1, dp[i-1, j-1] + cost)
    return dp[m, n]

# Function to check spelling and suggest corrections
def spelling_checker(word, dictionary, max_distance=2, max_suggestions=5):
    suggestions = []
    for entry in dictionary:
        distance = levenshtein_distance(word, entry)
        if distance <= max_distance:
            suggestions.append((entry, distance))
    suggestions.sort(key=lambda x: x[1])
    return suggestions[:max_suggestions]

# Function to tokenize all documents in the specified directory and build the dictionary
def build_dictionary(directory):
    dictionary = set()
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = text.split()  # Split text into tokens (words)
                dictionary.update(tokens)  # Add unique tokens to the dictionary set
    return dictionary


# Build the dictionary from the corpus
dictionary = build_dictionary(directory)


# Example misspelled word
misspelled_words = ["edmək","tapib","şilkəti","maraklı"]

for misspelled_word in misspelled_words:
    corrections = spelling_checker(misspelled_word, dictionary)
    print(f"Misspelled word: {misspelled_word}")
    if corrections:
        print("Suggestions:")
        for suggestion, distance in corrections:
            print(f"- {suggestion} (Levenshtein distance: {distance})")
    else:
        print("No suggestions found.")


Misspelled word: edmək
Suggestions:
- edək (Levenshtein distance: 1.0)
- etmək (Levenshtein distance: 1.0)
- elmə (Levenshtein distance: 2.0)
- etmə” (Levenshtein distance: 2.0)
- etmk (Levenshtein distance: 2.0)
Misspelled word: tapib
Suggestions:
- tapıb (Levenshtein distance: 1.0)
- tarix (Levenshtein distance: 2.0)
- tapa (Levenshtein distance: 2.0)
- vacib (Levenshtein distance: 2.0)
- tipik (Levenshtein distance: 2.0)
Misspelled word: şilkəti
Suggestions:
- şirkəti (Levenshtein distance: 1.0)
- şirkət (Levenshtein distance: 2.0)
Misspelled word: maraklı
Suggestions:
- maraqlı (Levenshtein distance: 1.0)
- yararlı (Levenshtein distance: 2.0)
