In [0]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [0]:
import re, math
import nltk
import numpy as np
import wikipedia

from dateutil import parser
from collections import Counter
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gingerit.gingerit import GingerIt
from difflib import SequenceMatcher

# n-gram individual BLEU---Word Order
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
    
#importing libraries
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
def lsa(text):
    
    stopset = set(stopwords.words('english'))
    text=[text]
    
    #scikit-learn's TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
    X = vectorizer.fit_transform(text)
    #X[0]   #sparse matrix
    #print(X[0])
    
    lsa = TruncatedSVD(n_components=2, n_iter=100)
    lsa.fit(X)
    terms = vectorizer.get_feature_names()
    for i, comp in enumerate(lsa.components_): 
        termsInComp = zip (terms,comp)
        sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:20]
        
    return sortedTerms

In [0]:
def text_to_vector(text):
    WORD = re.compile(r'\w+')
    words = WORD.findall(text)
    return Counter(words)

In [0]:
def get_model_answer(key_text):
    key_wiki = (wikipedia.page(key_text))
    model_answer = key_wiki.content
    #print(modelanswer)    
    return model_answer

In [0]:
def cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

In [0]:
def gram(text):
    
    parser = GingerIt()
    a=parser.parse(text+".")
    
    check=a['result']
    return check
    

In [0]:
def cal_spgm(user_answer):
    regex = re.compile("([\(\[]).*?([\)\]])")
    ratio_spgm=[]
    
    for sent in user_answer.split("."):
        if( len(sent) != 0):
            result = re.sub( regex, "", sent)
            corrected_user_answer = gram(result)
            ratio_spgm.append( SequenceMatcher(None, corrected_user_answer, result).ratio())
    
    ratio_spgm = np.mean(ratio_spgm)
    print("Spelling and Grammer Score: %s"%(ratio_spgm)) 
    return ratio_spgm, corrected_user_answer

In [0]:
def cal_lsa(model_answer, user_answer):
    ma = dict(lsa(model_answer))
    #print(ma)
    ua = dict(lsa(user_answer))
    #print(ua)
    
    num_items = {k: ua[k] for k in ma if k in ua}
    score_lsa=0
    sum=0
    for key in num_items:

        if(num_items[key]>ma[key]):
            sum=sum+((2*ma[key])-num_items[key])
        else:
            sum=sum+(num_items[key]/ma[key])
    ratio_lsa=sum/(len(ua)-1)
    print("LSA Score: %s"%(ratio_lsa)) 

    return ratio_lsa

In [0]:
def cal_SentenceSimilarity( modelanswer, useranswer):
    #Sentence similarity
    list_ma = modelanswer.split(".")
    list_ua = useranswer.split(".") 

    vectorizer = TfidfVectorizer()
    ss_mean=0
    sum=0
    
    for j in range(0,len(list_ua)):
        if (len(list_ua[j])>10):
            max=0
            for i in range(0,len(list_ma)):
                corpus = [list_ma[i],list_ua[j]]
                tfidf = vectorizer.fit_transform(corpus)
                words = vectorizer.get_feature_names()
                similarity_matrix = cosine_similarity(tfidf)
                if(similarity_matrix[0][1]>max):
                    max=similarity_matrix[0][1]
            sum = sum + max

    ss_mean = sum /(len(list_ua) - 1)
    print("Sentence Similarity Score: %s"%(ss_mean))
    
    chencherry = SmoothingFunction()
    result=0.0
    sum=0.0
    for j in range(0,len(u)):
        if(len(u[j])!=0):
            max=0
            for i in range(0,len(m)):
                mm=list(m[i].split(" "))
            result=sentence_bleu(m[i], u[j], weights=(1, 0, 0, 0),smoothing_function=chencherry.method1)           
            sum=sum+result

    result=sum/(len(u)-1)
    print("BLEU Score: %s"%(result)) 
    
    return ss_mean, result

In [0]:
def cal_date( modelanswer, useranswer):
    re_day = r"0[1-9]|[12][0-9]|3[01]"
    re_days = r"[1-9]|[12][0-9]|3[01]"
    re_month_digits = r"0[1-9]|1[0-2]"
    re_month_char = r"January|Febuary|March|April|May|June|July|August|September|October|November|December"
    re_month_char_short = r"Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
    re_year_4_digits = r"0[0-9][0-9][1-9]|[1-9][0-9][0-9][0-9]"
    re_year_2_digits = r"0[1-9]|[1-9][0-9]"

    pattern_ddmmyy = r"(?:(?:%s)[/\.-](?:%s)[/\.-](?:%s))"%( re_day, re_month_digits, re_year_2_digits)
    pattern_ddmmyyyy = r"(?:(?:%s)[/\.-](?:%s)[/\.-](?:%s))"%( re_day, re_month_digits, re_year_4_digits)
    pattern_yyyymmdd = r"(?:(?:%s)[/\.-](?:%s)[/\.-](?:%s))"%( re_year_4_digits, re_month_digits, re_day)
    pattern_mddyyyy = r"(?:(?:%s) (?:%s), (?:%s))"%( re_month_char, re_day, re_year_4_digits)
    pattern_ddmmmyyyy = r"(?:(?:%s) (?:%s), (?:%s))"%( re_day, re_month_char_short, re_year_4_digits)
    repl_pat=r"(?:(?:%s) (?:%s) (?:%s))"%( re_day, re_month_char, re_year_4_digits)
    repl_pat_n=r"(?:(?:%s) (?:%s) (?:%s))"%( re_days, re_month_char, re_year_4_digits)
    pattern_date_all = r"%s|%s|%s|%s|%s|%s|%s"%(pattern_ddmmyy, pattern_ddmmyyyy, pattern_yyyymmdd, pattern_mddyyyy, pattern_ddmmmyyyy,repl_pat,repl_pat_n)
    
    lst_ul=[]
    lst_ml=[]
    sent_dt_lst={}
    user_dt_lst={}
    date_res=0.0
    final_dt_res=0.0
    countt=0.0 
    
    lst_m=re.findall(pattern_date_all,modelanswer)
    for i in lst_m:
        d = parser.parse(i)
        lst_ml.append(d.strftime("%Y-%m-%d"))

    lst_u=re.findall(pattern_date_all,useranswer)
    for i in lst_u:
        d = parser.parse(i)
        #if(d.strftime("%Y-%m-%d") in lst_ml)

    for sent in modelanswer.split("."):   
        dates=re.findall(pattern_date_all,sent)
        if(dates):
            for date in dates:
                if (date in sent_dt_lst):
                    sent_dt_lst[date]=sent_dt_lst[date]+"."+sent
                else:
                    sent_dt_lst[date]=sent



    for sent in useranswer.split("."): 
        dates=re.findall(pattern_date_all,sent)

        if len(dates)!=0:
            for date in dates:
                if date in sent_dt_lst.keys():
                    countt=countt+1
                    date_res+=SequenceMatcher(None, sent, sent_dt_lst[date]).ratio()
    if(countt!=0):
        final_dt_res=date_res/countt
    else:
        final_dt_res = 0
        
    print("Date Score: %s"%(final_dt_res)) 
    return final_dt_res

In [0]:
def cal_score( model_answer, user_answer):
    final_score = 0.0
    spgm_score = 0.0
    lsa_score = 0.0
    ss_score = 0.0
    bleu_score = 0.0
    dt_score = 0.0
    
    #Check Grammer
    spgm_score, corrected_user_answer = cal_spgm(user_answer)
    
    #Calculate LSA Score
    lsa_score = cal_lsa(model_answer, user_answer)
    
    #Calculate Sentence Similarity Score
    ss_score, bleu_score = cal_SentenceSimilarity(model_answer, user_answer)

    #Calculate Numerical Score
    dt_score = cal_date( model_answer, user_answer)
    
    if(dt_score==0.0):
        final_score = spgm_score*40 + lsa_score*20 + ss_score*20 + bleu_score*20
    else:
        final_score = spgm_score*40 + lsa_score*20 + ss_score*10 + bleu_score*20 + dt_score*10
        
    return final_score

In [0]:
def take_input():
    ann=[]
    topic=[]
    with open('/home/phaniraj/Desktop/RKD/5.txt') as f:
        for line in f:
            res=line.split("\t")
            if(len(res)>1):
                #print(res[1])
                if(res[0]=='Q1'):
                    topic.append("Non-Cooperation Movement")
                elif(res[0]=='Q2'):
                    topic.append("Simon Commission")
                elif(res[0]=='Q3'):
                    topic.append("Chauri Chaura")
                elif(res[0]=='Q4'):
                    topic.append("Rowlatt Act")
                elif(res[0]=='Q5'):
                    topic.append("Swaraj Party")
                ann.append(res[1])
    return topic, ann

In [0]:
try:
    topic, user_answers = take_input()
    for idx, user_answer in enumerate(user_answers):
        #print(topic[idx],user_answer)
        #model_answer = get_model_answer("Salt March")
        #user_answer = 'The accused were denied the right to know the accusers and the evidence used in the trial. Gandhi and others thought that constitutional opposition to the measure was fruitless, so on 6 April, a hartal was organised where Indians would suspend all business and would fast, pray and hold public meetings against the Black Act as a sign of their opposition and civil disobedience would be offered against the law. This event was known as the Rowlatt Satyagraha.'
        model_answer = get_model_answer(topic[idx])
        #print(model_answer)
     
        score = cal_score( model_answer, user_answer)
        print(score)
        print("\n")
except ArithmeticError:  
        print ("" )

Spelling and Grammer Score: 1.0
LSA Score: 0.1601501041955279
Sentence Similarity Score: 0.47061557365984547
BLEU Score: 0.3757335346580497
Date Score: 0
60.12998425026846


Spelling and Grammer Score: 0.9974358974358974
LSA Score: 0.1491152598124726
Sentence Similarity Score: 1.0000000000000002
BLEU Score: 0.3757335346580497
Date Score: 0
70.39441178684635


Spelling and Grammer Score: 1.0
LSA Score: 0.07873446687990465
Sentence Similarity Score: 0.35651892294281046
BLEU Score: 0.3757335346580497
Date Score: 0
56.219738489615295


Spelling and Grammer Score: 0.9912425045321435
LSA Score: 0.08855190725799711
Sentence Similarity Score: 0.3610331238585783
BLEU Score: 0.3757335346580497
Date Score: 0
56.15607149677824


Spelling and Grammer Score: 0.9938922151793702
LSA Score: 0.1024667921910661
Sentence Similarity Score: 0.8518826827522146
BLEU Score: 0.3757335346580497
Date Score: 0
66.35734879920142




In [0]:
#for user_answer in user_answers:
#    print(user_answer)
#    score = cal_score( model_answer, user_answer)
#    print(score)

score = cal_score( model_answer, user_answers)
print(score)

Spelling and Grammer Score: 1.0
LSA Score: 0.21446393454161006


  self.explained_variance_ratio_ = exp_var / full_var


Sentence Similarity Score: 0.32979600888138694
BLEU Score: 0.3757335346580497
Date Score: 0
58.39986956162094


In [0]:
normalise all section marks to out of 10 or 100