In [20]:
# Function to read sentences from file
def read_sentences_from_file(filename):
    """Return the lines of ``filename`` with surrounding whitespace removed.

    Each line of the file produces exactly one list entry, so blank lines
    come back as empty strings.  The file is opened in text mode with the
    default encoding.
    """
    with open(filename, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

# Load both documents (document 1 first), one stripped line per list entry.
sentences1, sentences2 = (
    read_sentences_from_file(path) for path in ('input1.txt', 'input2.txt')
)

# Split sentences into words and count total letters
def sentence_to_words(sentences):
    """Tokenise every sentence and count the letters it contains.

    Returns a pair of parallel lists ``(words_list, letter_count_list)``:
    entry ``k`` of the first is the whitespace-separated words of
    ``sentences[k]`` and entry ``k`` of the second is the summed length of
    those words (separating whitespace is not counted).
    """
    words_list = [sentence.split() for sentence in sentences]
    letter_count_list = [sum(map(len, tokens)) for tokens in words_list]
    return words_list, letter_count_list

# Tokenise both documents; lettersN[k] is the letter count of sentencesN[k].
(words1, letters1), (words2, letters2) = (
    sentence_to_words(document) for document in (sentences1, sentences2)
)

# Edit distance function to compute difference between two words
def compute_distance(word1, word2):
    """Return the edit distance between ``word1`` and ``word2``.

    Inserting, deleting or substituting one character each costs 1; matching
    characters cost nothing.
    """
    rows = len(word1) + 1
    cols = len(word2) + 1
    # table[r][c] holds the distance between word1[:r] and word2[:c].
    table = [[0] * cols for _ in range(rows)]
    # Turning an empty prefix into word2[:c] takes c insertions.
    table[0] = list(range(cols))
    for r, left in enumerate(word1, start=1):
        # Turning word1[:r] into an empty prefix takes r deletions.
        table[r][0] = r
        for c, right in enumerate(word2, start=1):
            if left == right:
                table[r][c] = table[r - 1][c - 1]
            else:
                table[r][c] = 1 + min(
                    table[r - 1][c],
                    table[r][c - 1],
                    table[r - 1][c - 1],
                )
    return table[-1][-1]

# Heuristic function for sentence comparison
def heuristic(sent1, sent2):
    """Return the cost of aligning the words of ``sent1`` with ``sent2``.

    Both sentences are split on single spaces and aligned word by word:
    pairing two words costs their ``compute_distance`` value, while leaving a
    word of either sentence unpaired costs that word's length.  The cheapest
    total over all alignments is returned.
    """
    tokens1 = sent1.split(' ')
    tokens2 = sent2.split(' ')

    # cost[r][c] is the cheapest alignment of tokens2[:r] with tokens1[:c].
    cost = [[0] * (len(tokens1) + 1) for _ in range(len(tokens2) + 1)]

    # First row: nothing of sent2 consumed, so every sent1 word is unpaired.
    for c, token1 in enumerate(tokens1, start=1):
        cost[0][c] = cost[0][c - 1] + len(token1)

    for r, token2 in enumerate(tokens2, start=1):
        # First column: nothing of sent1 consumed, so token2 is unpaired.
        cost[r][0] = cost[r - 1][0] + len(token2)
        for c, token1 in enumerate(tokens1, start=1):
            cost[r][c] = min(
                cost[r - 1][c - 1] + compute_distance(token2, token1),
                cost[r - 1][c] + len(token2),
                cost[r][c - 1] + len(token1),
            )

    return cost[-1][-1]

# Precompute heuristic values for sentence pairs.
# heuristic_[i][j] is the word-level cost of pairing sentences1[i] with
# sentences2[j] plus the cheapest continuation towards the last sentence
# pair, so the table is filled from the bottom-right corner backwards.
#
# The table has one row per sentence of document 1 and one column per
# sentence of document 2, matching the heuristic_[i][j] indexing used here
# and in search_similarity (allocating it the other way round raises
# IndexError whenever the documents differ in sentence count).
heuristic_ = [[0] * len(sentences2) for _ in sentences1]
for i in range(len(sentences1) - 1, -1, -1):
    for j in range(len(sentences2) - 1, -1, -1):
        pair_cost = heuristic(sentences1[i], sentences2[j])
        if i == len(sentences1) - 1 and j == len(sentences2) - 1:
            # Last pair: nothing follows it.
            heuristic_[i][j] = pair_cost
            continue
        cheapest = float('inf')
        if i + 1 < len(sentences1):
            # Moving down skips sentences1[i]; charge its letter count.
            # (sentences1[i] is a string, so iterating it would count
            # characters including spaces, not word lengths.)
            cheapest = min(cheapest, heuristic_[i + 1][j] + letters1[i])
        if j + 1 < len(sentences2):
            # Moving right skips sentences2[j]; charge its letter count.
            cheapest = min(cheapest, heuristic_[i][j + 1] + letters2[j])
        if i + 1 < len(sentences1) and j + 1 < len(sentences2):
            # Diagonal move: both sentences are consumed as a pair.
            cheapest = min(cheapest, heuristic_[i + 1][j + 1])
        heuristic_[i][j] = cheapest + pair_cost

# Search function to compare two documents
def search_similarity():
    """Greedily align the two documents and return the accumulated cost.

    Reads the module-level ``sentences1``, ``sentences2``, ``letters1``,
    ``letters2`` and ``heuristic_``.  Starting at the first sentence of each
    document, every step scores the available moves (skip a sentence of
    document 1, skip a sentence of document 2, or advance in both) as
    "cost so far + step cost + heuristic_ of the destination" and takes the
    lowest-scoring one.  When no move is left, the ``heuristic_`` value of
    the current cell is added and the total is returned.

    Returns 0 if either document has no sentences.
    """
    state = [0, 0, 0]  # [index into sentences1, index into sentences2, cost]

    while state[0] < len(sentences1) and state[1] < len(sentences2):
        row, col, cost = state
        moves = []
        if row + 1 < len(sentences1):
            # NOTE(review): letters1[row] is added to h and again as the step
            # cost, so a skip is charged twice once the destination's
            # heuristic_ is subtracted below -- confirm this is intended.
            h = heuristic_[row + 1][col] + letters1[row]
            moves.append([row + 1, col, h + cost + letters1[row]])
        if col + 1 < len(sentences2):
            h = heuristic_[row][col + 1] + letters2[col]
            moves.append([row, col + 1, h + cost + letters2[col]])
        if row + 1 < len(sentences1) and col + 1 < len(sentences2):
            h = heuristic_[row + 1][col + 1]
            # Pair the sentences at the *current* position.  The loop
            # variables i/j left over from the table precomputation must not
            # be used here: they always point at the first sentence pair.
            c1 = heuristic(sentences1[row], sentences2[col])
            moves.append([row + 1, col + 1, h + cost + c1])
            # Alternative diagonal move: drop both sentences entirely.
            h += letters1[row] + letters2[col]
            moves.append([row + 1, col + 1, h + cost])
        if not moves:
            # Last sentence pair reached: add its remaining cost and stop.
            state[2] += heuristic_[row][col]
            break
        # min() returns the first of equally scored moves, like a stable sort.
        best = min(moves, key=lambda move: move[2])
        # Keep only the real cost in the state; the estimate is re-added
        # when the next step is scored.
        best[2] -= heuristic_[best[0]][best[1]]
        state = best

    return state[2]

# Calculate similarity between two documents
non_similar_words = search_similarity()
# Letter count of the longer document; used as the normalisation base.
total_letters = max(sum(letters1), sum(letters2))
# The search cost can exceed the base, so clamp the difference at zero.
similar_words = max(total_letters - non_similar_words, 0)
# Output result based on similarity ratio.
# Two documents without any letters have nothing that differs; treat them as
# identical instead of dividing by zero.
ratio = similar_words / total_letters if total_letters else 1.0
print("Similarity Ratio: {:.2f}".format(ratio))
if ratio >= 0.3:
    print("Documents are significantly similar")
else:
    print("Documents are distinct")


Similarity Ratio: 1.00
Documents are significantly similar
