# **Use of Levenshtein Distance to predict correct answer.**

Libraries used
1. pandas
2. numpy
3. Levenshtein

In [None]:
import pandas as pd 
import numpy as np 
# Load the three competition CSV files into DataFrames: the training set, the
# test set, and the sample submission file.
train = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv")
test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
submission = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv")

In [None]:
# Preview the first rows of the training DataFrame.
train.head()

In [None]:
# Preview the first rows of the test DataFrame.
test.head()

In [None]:
# Convert the question and answer columns to plain Python lists for
# inspection. Neither result is bound to a name, so nothing is kept for
# later cells.
train["question"].tolist()
train["answer_text"].tolist()

**Function Decorator**

In [None]:
# Log of every call that raised inside a decorated function. Each entry is a
# tuple ``(func, (args, kwargs), exception)`` so failures can be inspected
# after a batch run.
fails = []


def return_default_value_if_fails(default_value):
    """Build a decorator that turns exceptions into a fallback return value.

    The decorated function is called normally. If it raises any
    ``Exception``, the failure is appended to the module-level ``fails``
    list and ``default_value`` is returned instead of propagating the error.

    Args:
        default_value: Object returned whenever the wrapped call raises.
            The same object is returned for every failure.

    Returns:
        A decorator that wraps a callable with the behaviour above.
    """
    # Local import keeps this block self-contained.
    from functools import wraps

    def decorator(func):
        # wraps() preserves the wrapped function's __name__ and __doc__, so
        # entries in ``fails`` and tracebacks stay readable.
        @wraps(func)
        def inner(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Deliberate best-effort: record the failure and carry on.
                fails.append((func, (args, kwargs), e))
                return default_value
        return inner

    return decorator

**Creating Dataframe to store train or test data**

In [None]:
def getResults(questions, fn):
    """Run ``fn`` on every question and collect the results in a DataFrame.

    Args:
        questions: Iterable of question strings.
        fn: Callable taking one question and returning a 3-tuple
            ``(answer, score, prediction)``.

    Returns:
        A DataFrame with one row per question and the columns
        ``"Question"``, ``"Prediction"``, ``"Correct Answer"`` and
        ``"Score"``. When ``fn`` raises for a question, the failure is
        recorded by ``return_default_value_if_fails`` and the row becomes
        ``[question, "", "", 0.1]``.
    """
    # None marks a failed call. A scalar fallback placed directly in the row
    # list would not match the four declared columns and would break the
    # DataFrame construction below.
    @return_default_value_if_fails(default_value=None)
    def getResult(q):
        answer, score, prediction = fn(q)
        return [q, prediction, answer, score]

    rows = []
    for q in questions:
        row = getResult(q)
        if row is None:
            # Keep the question and use 0.1 as the fallback score.
            row = [q, "", "", 0.1]
        rows.append(row)

    output = pd.DataFrame(
        rows, columns=["Question", "Prediction", "Correct Answer", "Score"]
    )
    return output
# Training questions as a plain Python list.
train_data=train["question"].tolist()


# Levenshtein Distance
The Levenshtein distance between two sequences measures how many edit operations are needed to turn one sequence into the other. In the simplest weighting scheme, each insertion and each deletion has a cost of 1. A substitution is treated as a pair of operations (a deletion followed by an insertion), so it has a cost of 2.

The Levenshtein Python C extension module contains functions for fast computation of:

* Levenshtein (edit) distance, and edit operations
* string similarity
* approximate median strings, and generally string averaging
* string sequence and set similarity

**Installation**

`pip install levenshtein`


# Training the model

In [None]:
from Levenshtein import ratio
# Reference corpus: gettingApproximateAnswer iterates over this module-level
# name, so it can be pointed at a different DataFrame.
data=train
def gettingApproximateAnswer(q):
    """Answer ``q`` with the answer of the most similar question in ``data``.

    Each row of the module-level ``data`` frame is compared with ``q`` using
    ``ratio`` on the row's ``"question"`` value.

    Args:
        q: The question text to look up.

    Returns:
        A tuple ``(answer, score, prediction)``:

        * If a row scores ``>= 0.9``, the scan stops and that row's
          ``"answer_text"`` is returned as both answer and prediction.
        * Otherwise the best-scoring row is used. Its answer is returned
          when the best score exceeds ``0.3``; if not, the answer is a fixed
          apology string while the prediction is still the best candidate
          (or ``""`` when no row scored above zero).
    """
    best_score = 0
    best_answer = ""

    for _, candidate in data.iterrows():
        similarity = ratio(candidate["question"], q)

        # Close enough to count as the same question: stop scanning.
        if similarity >= 0.9:
            matched = candidate["answer_text"]
            return matched, similarity, matched

        # Otherwise remember the strongest candidate seen so far.
        if similarity > best_score:
            best_score, best_answer = similarity, candidate["answer_text"]

    # Below the confidence floor, apologise but still report the best guess.
    if best_score > 0.3:
        reply = best_answer
    else:
        reply = "Apology.I couldn't get you."
    return reply, best_score, best_answer

# Testing

In [None]:
# Take the test questions as a list, run the approximate matcher on each one
# through getResults, and show the resulting DataFrame.
test_data=test['question'].tolist()
output=getResults(test_data, gettingApproximateAnswer)
output

# Converting output to csv

In [None]:
# Write only the columns at positions 1-3 ("Prediction", "Correct Answer",
# "Score"). The slice has to feed to_csv directly: as a bare expression its
# result is discarded, and the full frame including "Question" is written.
output.iloc[:, 1:4].to_csv('submission.csv', index=False)