In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math
import re
from difflib import SequenceMatcher

In [2]:
# df = pd.read_excel(r'C:\\Users\\Admin\\Desktop\\Intemo DA Project\\IDP Analytics Sheet.xlsx')
# df.head(10)

In [3]:
def levenshtein_distance(str1, str2):
    char_to_char_weight = 1
    char_to_symbol_weight = 1
    symbol_to_symbol_weight = 0.2
    missing_char_weight = 1
    missing_symbol_weight = 1
    case_dissimilarity_weight = 0.1

    m, n = len(str1), len(str2)

    dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]

    # for i in range(m + 1):
    #     dp[i][0] = i * (missing_symbol_weight if is_symbol(str1[i - 1]) else missing_char_weight)
    # for j in range(n + 1):
    #     dp[0][j] = j * (missing_symbol_weight if is_symbol(str2[j - 1]) else missing_char_weight)

    for i in range(1,m + 1):
        dp[i][0] = i * (missing_symbol_weight if is_symbol(str1[i - 1]) else missing_char_weight)
    for j in range(1,n + 1):
        dp[0][j] = j * (missing_symbol_weight if is_symbol(str2[j - 1]) else missing_char_weight)

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            char1 = str1[i - 1]
            char2 = str2[j - 1]

            if char1 == char2:
                substitution_cost = dp[i - 1][j - 1]
            elif char1.lower() == char2.lower():
                substitution_cost = dp[i - 1][j - 1] + case_dissimilarity_weight
            else:
                is_char1_symbol = is_symbol(char1)
                is_char2_symbol = is_symbol(char2)

                if is_char1_symbol and is_char2_symbol:
                    weight = symbol_to_symbol_weight
                elif (is_char1_symbol and char2.isalpha()) or (char1.isalpha() and is_char2_symbol):
                    weight = char_to_symbol_weight
                else:
                    weight = char_to_char_weight

                substitution_cost = dp[i - 1][j - 1] + weight

            insertion_cost = dp[i][j - 1] + (missing_symbol_weight if is_symbol(char2) else missing_char_weight)
            deletion_cost = dp[i - 1][j] + (missing_symbol_weight if is_symbol(char1) else missing_char_weight)

            dp[i][j] = min(substitution_cost, insertion_cost, deletion_cost)

    return dp[m][n]

def is_symbol(c):
    symbols = set("/.*%@#$!&")
    return c in symbols

In [4]:
def word_similarity(word1, word2):
    return SequenceMatcher(None, word1, word2).ratio()

In [5]:
def tokenize_sentence(sentence):
    words = re.split(r'\s+', sentence)
    return words

In [6]:
def process_sentences(sentence1, sentence2, threshold=0.8):
    words1 = tokenize_sentence(sentence1)
    words2 = tokenize_sentence(sentence2)

    tot_lev = 0

    if(len(words1) == len(words2)):
        for i in range(len(words1)):
            tot_lev += levenshtein_distance(words1[i],words2[i])
        return tot_lev
    
    for word1 in words1:
        for word2 in words2:
            similarity = word_similarity(word1.lower(), word2.lower())
            if similarity >= threshold:
                tot_lev +=  levenshtein_distance(word1, word2)
    
    return tot_lev

In [7]:
def unmatched_sentences(sentence1, sentence2, threshold=0.8):
    words1 = tokenize_sentence(sentence1)
    words2 = tokenize_sentence(sentence2)

    words_not_matched = []
    
    for word1 in words1:
        found_match = False
        for word2 in words2:
            similarity = word_similarity(word1.lower(), word2.lower())
            if similarity >= threshold:
                found_match = True
                break

        if not found_match:
            words_not_matched.append(word1)
    
    return words_not_matched

In [8]:
def num_extra_space(str1, str2):
    extra_space = 0
    extra_len = 0
    strng = str1
    if(len(str2) > len(str1)):
        extra_len = len(str2)-len(str1)
    for i in range(extra_len):
        strng += " "
    
    for i in range(len(str2)):
        if(str2[i]==' ' and strng[i]!=' '):
            extra_space += 1
    return extra_space

In [9]:
def similarity_score(str1, str2):
    if str1 == "Not Provided" or str2 == "Not Provided":
        return 1.0
    
    lev = process_sentences(str1, str2, 0.75)
    unmatched = unmatched_sentences(str1, str2, 0.75)
    temp = 0
    for word in unmatched:
        temp += len(word)
    lev += (1.5*temp)
    lev += (0.05*num_extra_space(str1,str2))
    max_length = max(len(str1), len(str2))
    similarity = 1 - (lev / max_length)
    return similarity

In [10]:
def process_data(df):
    dataset = df.iloc[3:]
    dataset.reset_index(drop=True, inplace=True)
    dataset.columns = dataset.iloc[0]
    dataset = dataset.iloc[1:]
    dataset.reset_index(drop=True, inplace=True)
    dataset.head(10)

    dataset.replace(['--', 'NaN'], pd.NA, inplace=True)
    dataset.fillna('Not Provided', inplace=True)

    dataset.fillna('Not Provided', inplace=True)
    dataset['accuracy_normal'] = dataset.apply(lambda row: similarity_score(row['Source Data'], row['LLM Output (Normal Image)']), axis=1)
    dataset['accuracy_200'] = dataset.apply(lambda row: similarity_score(row['Source Data'], row['LLM Output (200% Resized)']), axis=1)
    dataset['accuracy_400'] = dataset.apply(lambda row: similarity_score(row['Source Data'], row['LLM Output (400% Resized)']), axis=1)
    dataset['accuracy_normal'] = dataset['accuracy_normal']*100
    dataset['accuracy_200'] = dataset['accuracy_200']*100
    dataset['accuracy_400'] = dataset['accuracy_400']*100

    dataset['accuracy_normal'] = dataset['accuracy_normal'].clip(lower=0)
    dataset['accuracy_200'] = dataset['accuracy_200'].clip(lower=0)
    dataset['accuracy_400'] = dataset['accuracy_400'].clip(lower=0)

    mean_normal = np.mean(dataset.loc[dataset['accuracy_normal'] != 0, 'accuracy_normal'])
    mean_200 = np.mean(dataset.loc[dataset['accuracy_200'] != 0, 'accuracy_200'])
    mean_400 = np.mean(dataset.loc[dataset['accuracy_400'] != 0, 'accuracy_400'])

    return mean_normal, mean_200, mean_400

In [11]:
import pandas as pd

def process_excel_sheet():
    df = pd.read_excel(r'C:\\Users\\Admin\\Desktop\\Intemo DA Project\\IDP Analytics Sheet.xlsx')
    starting_columns = [col for col in df.columns if col.startswith("INV X")]
    output_dataframes = []

    for start_col in starting_columns:
        start_index = df.columns.get_loc(start_col)
        columns = [start_col] + [
            df.columns[start_index + offset] for offset in [1, 3, 5] if start_index + offset < len(df.columns)
        ]
        dataset = df[columns]
        # print(dataset.head(5))

        output = process_data(dataset)

        output_df = pd.DataFrame({
            'INV Column': [start_col],
            'Accuracy (Normal)': [output[0]],
            'Accuracy (200%)': [output[1]],
            'Accuracy (400%)': [output[2]],
            'Max Accuracy': max(output[0],output[1],output[2])
        })

        output_dataframes.append(output_df)

    return output_dataframes

output_dataframes = process_excel_sheet()

In [12]:
final_dataframe = pd.concat(output_dataframes, ignore_index=True)
final_dataframe

Unnamed: 0,INV Column,Accuracy (Normal),Accuracy (200%),Accuracy (400%),Max Accuracy
0,INV X27.1,98.79836,95.73581,100.0,100.0
1,INV X5,98.61015,98.852322,99.205263,99.205263
2,INV X19,99.489796,97.936338,98.572422,99.489796
3,INV X6,99.22449,98.455882,100.0,100.0
4,INV X8,99.790921,99.944097,100.0,100.0
5,INV X9,99.679487,99.585595,100.0,100.0
6,INV X1,95.953733,97.529453,97.787193,97.787193
7,INV X2,97.339296,99.786325,99.786325,99.786325
8,INV X3,93.961601,97.802451,97.099749,97.802451
9,INV X4,95.346493,97.891003,97.913627,97.913627


In [13]:
final_dataframe.to_csv('all_processed.csv')