# Semantic Relatedness

In [1]:
import re
import pandas as pd
from collections import defaultdict
import string

remove stopwords, lowercase and special characters

In [2]:
def load_stopwords(stopwords_file):
    stopwords_df = pd.read_csv(stopwords_file)
    stopwords = set(stopwords_df['stopwords'].values)
    return stopwords

def clean_sentence(sentence):
    # Lowercase the sentence
    sentence = sentence.lower()
    # Remove any special characters
    sentence = ''.join(c for c in sentence if ord(c) < 128)
    return sentence

# Find lexical overlap

In [3]:
def find_lexical_overlap(text_file, stopwords_file):
    # Read the text file
    with open(text_file, 'r') as file:
        text = file.read()

    # Split the text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    # Load stopwords from CSV file
    stopwords = load_stopwords(stopwords_file)

    # List to store sentence pairs with lexical overlap
    sentence_pairs = []

    # Dictionary to store sentence occurrence count
    sentence_counts = defaultdict(int)

    # Iterate over sentences to find pairs with lexical overlap
    for i in range(len(sentences)):
        sentences[i] = clean_sentence(sentences[i])
        # Check if sentence length is between 5 and 25 words
        if 5 <= len(sentences[i].split()) <= 25:
            for j in range(i+1, len(sentences)):
                sentences[j] = clean_sentence(sentences[j])
                # Check if sentence length is between 5 and 25 words
                if 5 <= len(sentences[j].split()) <= 25:
                    words1 = set(sentences[i].split())
                    words2 = set(sentences[j].split())
                    # Remove stopwords from words
                    words1 = words1.difference(stopwords)
                    words2 = words2.difference(stopwords)
                    overlap = words1.intersection(words2)
                    if len(overlap) >= 5:  # Choose the lexical overlap
                        # Only add pair if neither sentence has appeared more than twice
                        if sentence_counts[sentences[i]] < 2 and sentence_counts[sentences[j]] < 2:
                            sentence_pairs.append((sentences[i], sentences[j]))
                            # Increase count for each sentence
                            sentence_counts[sentences[i]] += 1
                            sentence_counts[sentences[j]] += 1

    df = pd.DataFrame(sentence_pairs, columns=["Sentence 1", "Sentence 2"])

    return df

# Example 

In [4]:
text_file = "./sentence.txt"  # 
stopwords_file = './stopwords_hausa.csv'  # 
result_df_hausa = find_lexical_overlap(text_file, stopwords_file)

result_df_hausa.head(5)

Unnamed: 0,Sentence 1,Sentence 2
0,mun auna cewa za a samu kyakkyawan shugabanci ...,don haka masu oarin ata mana suna har su na ce...
1,"makonni biyu da suka gabata, wannan jarida ta ...",makonni biyu ne wannan jarida ta buga labarin ...
2,"jamiyyun sun haa da pdp, lp, nnpp, apga, sdp, ...",jamiyyun da su ka karya wannan doka sun haa ha...
3,tuni dai su ka kafa kwamitin mambobi 13 arashi...,gwamnan jihar delta kuma an takarar mataimakin...
4,tuni dai su ka kafa kwamitin mambobi 13 arashi...,wike da wasu gwamnonin pdp huu dai su na so sh...


# Pretty Print

In [None]:
for index, row in result_df_hausa.iterrows():
    print("S1:", row["Sentence 1"])
    print("S2:", row["Sentence 2"])
    print("----------------------------------------------------------------------")

In [None]:
!python3 scripts/semantic_relatedness.py \
  -i data/sentence.txt \
  -s data/stopwords_hausa.csv \
  -o results \
  --clean_sentences \
  --remove_stopwords \