In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from pymagnitude import *
vectors = Magnitude("glove-lemmatized.6B.100d.magnitude")
#Run these once
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')


PUNCT_TO_REMOVE = string.punctuation


#-------------- Preprocessing -------------

#Removing punctuations
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

#Stopwords removal
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])


nouns_abv=['NN', 'NNS', 'NNP', 'NNPS']

#Stemming
stemmer = PorterStemmer()
def stem_words(text, check_for_noun=False):
    res=[]
    for word in text.split():
        if check_for_noun:
            #don't stem nouns
            tag=nltk.pos_tag([word])
            if tag[0][1] in nouns_abv:
                res.append(word)
            else:
                res.append(stemmer.stem(word))
        else:
            #stem
            r=stemmer.stem(word)
            res.append(r)
    return " ".join(res)

#Spell Checking
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
    
#Preprocess every short answer
def preprocess_data(data):
    n=len(data)
    res=[]
    for i in range(n):
        s=data[i].lower()
        s=remove_punctuation(s)
        s=stem_words(s, True)
        s=correct_spellings(s)
        res.append(s)
    return res

#-------------- Feature Engineering -------------

#assuming text is preprocessed then applying these
def num_of_words(text):
    return len(text.split())

def average_word_length(text):
    return len(text)/num_of_words(text)

def pos_tagging(text):
    #get the frequency of each tag after NER
    words=text.split()
    tagged=nltk.pos_tag(words)
    result=[]
    freq=defaultdict(int)
    for word, noun in tagged:
        freq[noun]+=1
    return freq

vectorizer = TfidfVectorizer(max_df=0.85, min_df=1, max_features=5000, stop_words='english')



def generate_vectors(documents):
    vectorizer.fit(documents)
    return vectorizer.transform(documents)




def tfidf_weighted_avg_vec(para, idx):
    s=0
    for i, word in enumerate(para.split(' ')):
        weight=tfidf_vec[idx].toarray().flatten()[i]
        v=weight*vectors.query(word)
        s+=v
    s/=len(para.split(' '))
    return s

In [6]:
answers=[
    'the combination of a clash symbol be accompany by a bass drum play an dent part played loudly',
    'it strategy location and excellent infrastructure with the largest airport in scandinavian local 14 minutes by train from the city centre',
    'a threshold flute made from a mammoth tusk and two flutes made from swans bones are among the oldest known music instruments']


answers = preprocess_data(answers)
try:
    tfidf_vec=generate_vectors(answers)
except ValueError:
    vectorizer=TfidfVectorizer(max_df=1.00, min_df=1, max_features=5000, stop_words='english')
    tfidf_vec=generate_vectors(answers)

In [7]:
for ans in answers:
    print(tfidf_weighted_avg_vec(answers[0], 0))

[-2.9300817e-03  2.2931504e-03  2.0506685e-03  1.7305912e-04
  5.3348509e-03  8.0365445e-03  4.6011037e-03  3.6264947e-03
 -7.5042523e-03 -8.3637651e-04 -5.7599521e-03 -4.1863513e-03
  2.4262592e-03 -1.2024435e-03  7.7131431e-04 -7.7635673e-04
 -7.8372488e-04 -7.5147748e-03 -7.6499890e-04 -2.0555598e-03
  1.6220044e-03 -6.0447939e-03 -2.1608723e-03 -2.1090482e-04
  6.2284865e-03 -7.3548069e-04 -6.7968196e-03 -3.6635110e-03
 -9.7914273e-04  2.5120957e-04 -5.2638063e-03  7.5143450e-03
 -3.0982709e-03 -6.3116653e-03  7.3193019e-04  5.6260079e-04
  8.2324981e-04  2.9192551e-03  5.9801214e-03  2.1169686e-05
 -3.1945626e-03  1.4871681e-03  7.3021716e-03 -5.4528721e-04
  7.3080678e-03  1.0240175e-03  1.9371760e-03 -3.6564542e-04
 -2.8312274e-03 -9.1403015e-03 -4.3323082e-03  1.1148909e-03
  3.7294303e-03  1.0253319e-02  1.4556984e-03 -3.7853051e-02
 -5.0113484e-04 -9.6221617e-04  2.2317898e-02  5.1350478e-04
  2.2571003e-03  9.5198061e-03  5.5621821e-04  3.1827656e-03
  8.1997477e-03 -1.09492

In [8]:
answers

['the combination of a clash symbol be accompany by a bass drum play an dent part played loudly',
 'it strategy location and excellent infrastructure with the largest airport in scandinavian local 14 minutes by train from the city centre',
 'a threshold flute made from a mammoth tusk and two flutes made from swans bones are among the oldest known music instruments']