In [48]:
import joblib
import numpy as np
import regex as re
from fuzzywuzzy import fuzz
import xgboost as xgb
from collections import Counter

In [38]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [36]:
# Load the serialized classifier that get_output() calls .predict() on.
# The path is relative, so the file must sit in the kernel's working directory.
# Presumably an XGBoost model, judging by the file name — confirm with the training notebook.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load a model file that comes from a trusted source.
model = joblib.load('xgboost_model1.pkl')

In [37]:
# Vocabulary used by preprocess_data() to bucket each question token as
# "common" (listed in common_words), "interrogative", or "unique" (anything else).
# The set is assembled from one tuple per grammatical category.
common_words = set().union(
    # Punctuation
    ('.', ',', '!', '?', ';', ':', '"', "'", '-', '...',
     '(', ')', '[', ']', '{', '}'),

    # Prepositions
    ('in', 'on', 'at', 'of', 'to', 'for', 'with', 'by', 'from', 'about',
     'into', 'onto', 'upon', 'within', 'without'),

    # Articles
    ('a', 'an', 'the'),

    # Conjunctions ('for' is already covered by the prepositions)
    ('and', 'or', 'but', 'so', 'yet', 'nor'),

    # Helping verbs
    ('am', 'is', 'are', 'was', 'were', 'be', 'been',
     'have', 'has', 'had', 'do', 'does', 'did',
     'can', 'could', 'will', 'would', 'shall', 'should',
     'may', 'might', 'must'),

    # Common pronouns
    ('i', 'you', 'he', 'she', 'it', 'we', 'they',
     'me', 'him', 'her', 'us', 'them',
     'my', 'your', 'his', 'its', 'our', 'their',
     'mine', 'yours', 'hers', 'ours', 'theirs',
     'this', 'that', 'these', 'those'),

    # Common adverbs
    ('now', 'then', 'here', 'there', 'quickly', 'slowly', 'very', 'too'),

    # Quantifiers
    # NOTE(review): the multi-word entries look unreachable, because the
    # membership test in preprocess_data() is applied to single tokens — confirm.
    ('all', 'some', 'any', 'no', 'few', 'many', 'several', 'much', 'most',
     'more', 'less', 'little', 'a lot of', 'a few', 'a little', 'enough',
     'lots of', 'plenty of', 'each', 'every', 'either', 'neither', 'both',
     'none'),
)

# Question words; a token found here (and not in common_words) is counted as interrogative.
interogative_words = set(
    ('what', 'how', 'why', 'when', 'where', 'which', 'who', 'whom', 'whose')
)

# Auxiliary verbs that count as interrogative only when they are the first token of a question.
extra_intero_words = set(
    ('are', 'is', 'am', 'was', 'were', 'do', 'does', 'did', 'can', 'could',
     'will', 'would', 'shall', 'should', 'have', 'has', 'had')
)

In [39]:
def unique_word_out(text1,text2,unique_word_set):
    """Compare two texts with ``fuzz.ratio`` after dropping every word that is
    not in ``unique_word_set``.

    Each text is lower-cased and split into word tokens; the tokens found in
    ``unique_word_set`` are kept in their original order (repeats included),
    each followed by a single space. The two filtered strings are then handed
    to ``fuzz.ratio`` and its result is returned unchanged.
    """
    def keep_listed_words(text):
        # Every surviving token carries a trailing space, including the last one.
        tokens = re.findall(r'\b\w+\b', text.lower())
        return ''.join(token + ' ' for token in tokens if token in unique_word_set)

    filtered1 = keep_listed_words(text1)
    filtered2 = keep_listed_words(text2)
    return fuzz.ratio(filtered1, filtered2)

In [40]:
def singularize_words(sentence):
    """Tokenize ``sentence`` and lemmatize each token, returning a list of tokens.

    Tokens whose lower-cased form is one of a fixed set of auxiliary verbs are
    passed through untouched; every other token goes through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.
    Despite the name, the result is a list of tokens, not a joined sentence.
    """
    # Auxiliary verbs that must bypass the lemmatizer.
    untouched = ("is", "am", "are", "was", "were", "has", "have", "had", "does", "do", "did")

    pieces = word_tokenize(sentence)
    wordnet = WordNetLemmatizer()
    return [
        piece if piece.lower() in untouched else wordnet.lemmatize(piece)
        for piece in pieces
    ]

In [55]:
def _bucket_tokens(tokens):
    """Split tokens into (common, unique, interrogative) lists, keeping their order."""
    common, unique, intero = [], [], []

    # An auxiliary verb in first position ("is", "can", "do", ...) is counted
    # as interrogative instead of common — presumably because it opens a
    # yes/no question. Only the first token gets this treatment.
    if tokens and tokens[0] in extra_intero_words:
        intero.append(tokens[0])
        tokens = tokens[1:]

    for token in tokens:
        if token in common_words:
            common.append(token)
        elif token in interogative_words:
            intero.append(token)
        else:
            unique.append(token)
    return common, unique, intero


def _shared_count(tokens_a, tokens_b):
    """Count tokens present in both lists, honouring repeats (multiset intersection)."""
    return sum((Counter(tokens_a) & Counter(tokens_b)).values())


def _share(shared, bucket_size):
    """Return shared / bucket_size, treating an empty bucket as size 1 (result 0.0)."""
    return shared / (bucket_size or 1)


def preprocess_data(text1,text2):
    """Turn a pair of questions into the 7 numeric features fed to the model.

    Both inputs are coerced with ``str()``, lower-cased, stripped of
    punctuation, and passed through ``singularize_words``. Each question's
    tokens are then bucketed as common / unique / interrogative.

    Returns a 7-tuple of floats:
        0-2: fraction of question 1's common / unique / interrogative tokens
             that also occur in the same bucket of question 2
        3-5: the same three fractions relative to question 2's bucket sizes
        6:   ``unique_word_out`` score over the two texts, divided by 100

    An empty bucket yields 0.0 for its fraction rather than dividing by zero.
    """
    text1 = str(text1)
    text2 = str(text2)

    buckets = []
    for text in (text1, text2):
        # Drop punctuation by keeping only word tokens, then re-join for the tokenizer.
        cleaned = " ".join(re.findall(r'\b\w+\b', text.lower()))
        buckets.append(_bucket_tokens(singularize_words(cleaned)))
    (q1_common, q1_unique, q1_intero), (q2_common, q2_unique, q2_intero) = buckets

    # The overlap is symmetric, so one count per bucket serves both questions;
    # only the denominators differ.
    common_shared = _shared_count(q1_common, q2_common)
    unique_shared = _shared_count(q1_unique, q2_unique)
    intero_shared = _shared_count(q1_intero, q2_intero)

    # NOTE(review): unique_set holds lemmatized tokens while unique_word_out
    # re-tokenizes the raw text, so a word whose lemma differs from its surface
    # form will not be matched there. Left as-is so the features stay identical
    # to what the saved model presumably was trained on — confirm before changing.
    unique_set = set(q1_unique) | set(q2_unique)
    fuzzy_ratio = unique_word_out(text1, text2, unique_set)

    return (
        _share(common_shared, len(q1_common)),
        _share(unique_shared, len(q1_unique)),
        _share(intero_shared, len(q1_intero)),
        _share(common_shared, len(q2_common)),
        _share(unique_shared, len(q2_unique)),
        _share(intero_shared, len(q2_intero)),
        fuzzy_ratio / 100.0,
    )


In [64]:
def get_output(x_test):
    """Run the loaded model on one feature vector and return its label as an int.

    ``x_test`` is reshaped into a single row, printed, and passed to
    ``model.predict``. The returned labels are tallied and the most frequent
    one is returned (the first one seen wins a tie).
    """
    row = x_test.reshape(1, -1)
    print(row)

    # Tally the predicted labels and pick the one with the highest count.
    votes = Counter(model.predict(row))
    winner, _count = max(votes.items(), key=lambda item: item[1])
    return int(winner)

In [65]:
import tkinter as tk

def concatenate_strings():
    """Callback for the "Check" button: classify the two entered questions.

    Reads both text boxes, builds the feature vector with ``preprocess_data``,
    asks ``get_output`` for the label, and writes the verdict to
    ``output_label``. If either box is empty or whitespace-only, a prompt is
    shown instead and the model is not called.

    The name is historical (nothing is concatenated); it is kept because the
    button is wired to it by name.
    """
    # "end-1c" drops the trailing newline that a Text widget always appends.
    ques1 = entry1.get("1.0", "end-1c")
    ques2 = entry2.get("1.0", "end-1c")

    # Without this guard a blank box produces an all-zero feature vector and
    # the model still reports a verdict on it.
    if not ques1.strip() or not ques2.strip():
        output_label.config(text="Please enter both questions.")
        return

    print(ques1)
    print(ques2)
    x_test = np.array(preprocess_data(ques1, ques2))
    res = get_output(x_test)
    print(res)

    # Label 1 is reported as a duplicate; any other label as not a duplicate.
    ans = "Duplicate" if res == 1 else "Not Duplicate"
    output_label.config(text="Question1 and Question2 are " + ans)

# Build the Tk window. Widgets are packed top to bottom in the order created:
# question 1 label and box, question 2 label and box, the Check button, then the result label.
# Create main window
root = tk.Tk()
root.title("Quora Question Duplicacy Checker")

# Set window size (width x height, in pixels)
root.geometry("500x400")

# Label for question 1
label1 = tk.Label(root, text="Question 1:")
label1.pack()

# Create input text boxes
entry1 = tk.Text(root, height=4, width=50)  # 4 text lines tall, 50 characters wide
entry1.pack(pady=5)

# Label for question 2
label2 = tk.Label(root, text="Question 2:")
label2.pack()

entry2 = tk.Text(root, height=4, width=50)  # 4 text lines tall, 50 characters wide
entry2.pack(pady=5)

# Create button; clicking it runs concatenate_strings(), which reads both boxes and updates output_label
check_button = tk.Button(root, text="Check", command=concatenate_strings)
check_button.pack(pady=5)

# Create output label (starts empty; the button callback fills in the verdict)
output_label = tk.Label(root, text="")
output_label.pack(pady=5)

# Run the main event loop; this call blocks the cell until the window is closed
root.mainloop()


fgdg fdg dfgd fgd
fg dfg dfg 
[0.   0.   0.   0.   0.   0.   0.69]
[[0.   0.   0.   0.   0.   0.   0.69]]
0
What is like to have sex with cousin?
What is it like to have sex with your cousin?
[1.         1.         1.         0.66666667 1.         1.
 1.        ]
[[1.         1.         1.         0.66666667 1.         1.
  1.        ]]
1
What should I do to be a great geologist?
What is it like to have sex with your cousin?
[0.16666667 0.         1.         0.16666667 0.         1.
 0.38      ]
[[0.16666667 0.         1.         0.16666667 0.         1.
  0.38      ]]
0
What should I do to be a great geologist?
How can I be a good geologist?
[0.5  0.5  0.   0.75 0.5  0.   0.77]
[[0.5  0.5  0.   0.75 0.5  0.   0.77]]
1
