This notebook extracts the lexical features from each query

# Load Libraries

In [1]:
import os
import pickle
import random
import re
import stanza
import string
import sys
import warnings

import numpy as np
import pandas as pd

from math import sqrt
from math import log
from nltk import word_tokenize
from nltk.tokenize import SyllableTokenizer
from tqdm import tqdm

stanza.download('en') 

# Functions for Lexical Complexity

Code taken from:

This code is the lexical complexity analyzer described in

Lu, Xiaofei (2012). The relationship of lexical richness to the quality 
of ESL speakers' oral narratives. The Modern Language Journal, 96(2), 190-208. 

Version 1.1 Released on February 12, 2013

Which can be found at:

http://www.personal.psu.edu/xxl13/download.html

It has been modified to work with search queries, as it was initially designed for sentences.

All but getLex() were imported from that code.

In [8]:
# NDW for first z words in a sample
def getndwfirstz(z,lemmalist):
    """Return the number of distinct lemmas among the first ``z`` items of ``lemmalist``."""
    return len(set(lemmalist[:z]))

In [9]:
# adjust minimum sample size here
standard=50

# Returns the keys of dictionary d sorted by their values
def sort_by_value(d):
    """Return the keys of ``d`` ordered by ascending value.

    Keys whose values are equal are ordered by the keys themselves, because
    the sort compares ``[value, key]`` pairs.
    """
    value_key_pairs = sorted([value, key] for key, value in d.items())
    return [key for _, key in value_key_pairs]
 
# NDW for first z words in a sample
def getndwfirstz(z,lemmalist):
    """Count the different lemmas that occur in ``lemmalist[:z]``."""
    distinct = set()
    distinct.update(lemmalist[:z])
    return len(distinct)

# NDW expected random z words, 10 trials
def getndwerz(z,lemmalist):
    """Average number of distinct lemmas over 10 random draws of ``z`` lemmas.

    Each draw uses ``random.sample`` (sampling positions without replacement),
    so ``z`` larger than ``len(lemmalist)`` raises ``ValueError``.
    """
    distinct_total = 0
    for _ in range(10):
        distinct_total += len(set(random.sample(lemmalist, z)))
    return distinct_total / 10.0

# NDW expected random sequences of z words, 10 trials
def getndwesz(z,lemmalist):
    """Average number of distinct lemmas over 10 random contiguous windows of ``z`` lemmas.

    The window start is drawn with ``random.randint(0, len(lemmalist) - z)``,
    so ``z`` larger than ``len(lemmalist)`` raises ``ValueError``.
    """
    distinct_total = 0
    for _ in range(10):
        start = random.randint(0, len(lemmalist) - z)
        window = lemmalist[start:start + z]
        distinct_total += len(set(window))
    return distinct_total / 10.0

# MSTTR
def getmsttr(z,lemmalist):
    """Mean segmental type-token ratio (MSTTR).

    ``lemmalist`` is cut into consecutive, non-overlapping segments of ``z``
    lemmas; a trailing segment shorter than ``z`` is ignored. The result is
    the mean of (distinct lemmas / z) over the full segments.

    Parameters
    ----------
    z : int
        Segment length; must be positive.
    lemmalist : list
        Lemmas of the sample, in order.

    Returns
    -------
    float
        The mean segment TTR, or 0.0 when the sample is shorter than ``z``
        (previously this case raised ZeroDivisionError).
    """
    segment_ratios = []
    # Stop before any start index that would leave fewer than z lemmas.
    for start in range(0, len(lemmalist) - z + 1, z):
        segment = lemmalist[start:start + z]
        segment_ratios.append(len(set(segment)) / float(z))
    if not segment_ratios:
        return 0.0
    return sum(segment_ratios) / len(segment_ratios)

def isLetterNumber(character):
    """Return 1 if ``character`` is in ``string.printable`` but not in ``string.punctuation``, else 0.

    Whitespace characters are printable and are not punctuation, so they also yield 1.
    """
    printable_non_punct = (character in string.printable
                           and character not in string.punctuation)
    return int(printable_non_punct)

def isSentence(line):
    """Return 1 if any character of ``line`` passes ``isLetterNumber``, else 0."""
    return 1 if any(isLetterNumber(ch) for ch in line) else 0

# Accepts a list of queries, returns a dataframe of extracted lexical features 
# that correspond to each query.
#
# param getLex: list of queries to extract lexical features from
# returns lexical: dataframe containing lexical features

def getLex(queries):
    """Extract lexical-complexity features for each query.

    Every query is run through a stanza English pipeline; each word becomes a
    lowercased ``lemma_xpos`` token. Tokens whose tag is punctuation, ``sent``
    or ``sym`` are ignored. Nouns (tag starts with ``n``), adjectives (``j``),
    adverbs (``r``, only when the word or its form without a final ``ly`` is
    listed as an adjective in the frequency list) and verbs (``v``, excluding
    ``be`` and ``have``) count as lexical words. A word is "sophisticated" when
    it is not among the 2000 highest-frequency lemmas of
    ``DataSets/BNC/bnc_all_filtered.txt``.

    Parameters
    ----------
    queries : list of str
        Queries to extract lexical features from.

    Returns
    -------
    pandas.DataFrame
        One row per query, in input order. Every ratio is 0 when its
        denominator is 0. Columns:

        - ``query``: the query text
        - ``ld``: lexical tokens / word tokens
        - ``ls1``: sophisticated lexical tokens / lexical tokens
        - ``ls2``: sophisticated word types (excluding tag ``cd``) / word types
        - ``vs1``, ``vs2``, ``cvs1``: sophisticated verb types over verb
          tokens (plain, squared numerator, and divided by sqrt(2 * tokens))
        - ``ndw``: number of word types
        - ``ttr``, ``cttr``, ``rttr``: word types over word tokens (plain,
          divided by sqrt(2 * tokens), divided by sqrt(tokens))
        - ``lv``: lexical types / lexical tokens
        - ``vv1``, ``svv1``, ``cvv1``: verb types over verb tokens (plain,
          squared numerator, and divided by sqrt(2 * tokens))
        - ``vv2``: verb types / lexical tokens
        - ``nv``: noun types / noun tokens
        - ``adjv``: adjective types / lexical tokens
    """
    processor_dict = {
        'tokenize': 'gsd',
        'pos': 'bnc',
        'lemma': 'default'
    }
    # NOTE(review): confirm these package names resolve for English in the
    # installed stanza version.
    nlp = stanza.Pipeline('en', processors=processor_dict)

    # Read the frequency list: each data line is "<lemma> <pos> <frequency>".
    adjdict = {}
    worddict = {}
    with open("DataSets/BNC/bnc_all_filtered.txt", "r") as wordlistfile:
        for line in wordlistfile:
            wordinfo = line.strip()
            if not wordinfo or "Total words" in wordinfo:
                continue
            infolist = wordinfo.split()
            lemma = infolist[0]
            pos = infolist[1]
            frequency = int(infolist[2])
            worddict[lemma] = worddict.get(lemma, 0) + frequency
            if pos == "Adj":
                adjdict[lemma] = adjdict.get(lemma, 0) + frequency

    # sort_by_value orders ascending, so the tail holds the most frequent
    # lemmas. A set makes the per-token membership test constant time.
    wordranks = sort_by_value(worddict)
    common_words = set(wordranks[-2000:])

    lexFeat = []
    with tqdm(total=len(queries)) as pbar:
        for query in queries:
            doc = nlp(query)
            # Collect the tokens of ALL sentences of the query (the buffer is
            # created once per query, not once per sentence).
            tagged = ['{}_{}'.format(word.lemma, word.xpos)
                      for sentence in doc.sentences
                      for word in sentence.words]

            wordtypes = set()
            swordtypes = set()
            lextypes = set()
            verbtypes = set()
            sverbtypes = set()
            adjtypes = set()
            nountypes = set()
            wordtokens = 0
            lextokens = 0
            slextokens = 0
            verbtokens = 0
            nountokens = 0

            # Join then split on whitespace so a lemma that itself contains
            # whitespace is broken up exactly as the string-based code did.
            for token in ' '.join(tagged).split():
                token = token.lower()
                if not isSentence(token):
                    continue
                parts = token.split("_")
                word = parts[0]
                pos = parts[-1]
                # An empty tag is a substring of string.punctuation, so it is
                # skipped here and pos[0] below is always valid.
                if pos in string.punctuation or pos == "sent" or pos == "sym":
                    continue

                wordtokens += 1
                wordtypes.add(word)
                sophisticated = word not in common_words
                if sophisticated and pos != "cd":
                    swordtypes.add(word)

                tag = pos[0]
                is_adverb = tag == "r" and (
                    word in adjdict
                    or (word[-2:] == "ly" and word[:-2] in adjdict)
                )
                is_verb = tag == "v" and word not in ("be", "have")

                if tag == "n":
                    nountypes.add(word)
                    nountokens += 1
                elif tag == "j":
                    adjtypes.add(word)
                elif is_verb:
                    verbtypes.add(word)
                    verbtokens += 1
                    if sophisticated:
                        sverbtypes.add(word)
                elif not is_adverb:
                    # Not a lexical word: nothing more to count.
                    continue

                lextypes.add(word)
                lextokens += 1
                if sophisticated:
                    slextokens += 1

            ndw = len(wordtypes)

            # lexical density
            ld = lextokens / wordtokens if wordtokens > 0 else 0

            # lexical sophistication
            ls1 = slextokens / lextokens if lextokens != 0 else 0
            ls2 = len(swordtypes) / ndw if ndw > 0 else 0

            # verb sophistication
            vs1, vs2, cvs1 = 0, 0, 0
            if verbtokens > 0:
                nsverbs = len(sverbtypes)
                vs1 = nsverbs / verbtokens
                vs2 = (nsverbs * nsverbs) / verbtokens
                cvs1 = nsverbs / sqrt(2 * verbtokens)

            # type-token ratios
            ttr, cttr, rttr = 0, 0, 0
            if wordtokens > 0:
                ttr = ndw / wordtokens
                cttr = ndw / sqrt(2 * wordtokens)
                rttr = ndw / sqrt(wordtokens)

            # verb diversity
            vv1, svv1, cvv1 = 0, 0, 0
            if verbtokens > 0:
                nverbs = len(verbtypes)
                vv1 = nverbs / verbtokens
                svv1 = nverbs * nverbs / verbtokens
                cvv1 = nverbs / sqrt(2 * verbtokens)

            # lexical diversity
            lv, vv2, adjv = 0, 0, 0
            if lextokens != 0:
                lv = len(lextypes) / lextokens
                vv2 = len(verbtypes) / lextokens
                adjv = len(adjtypes) / lextokens

            nv = len(nountypes) / nountokens if nountokens != 0 else 0

            lexFeat.append([query, ld, ls1, ls2, vs1, vs2, cvs1, ndw, ttr,
                            cttr, rttr, lv, vv1, svv1, cvv1, vv2, nv, adjv])
            pbar.update()

    lexical = pd.DataFrame(
        data=lexFeat,
        columns=["query", "ld", "ls1", "ls2", "vs1", "vs2", "cvs1", "ndw", "ttr",
                 "cttr", "rttr", "lv", "vv1", "svv1", "cvv1", "vv2", "nv", "adjv"],
    )
    return lexical

# Load Data Sets

This block of code loads the data.

In [4]:

# NOTE: unpickling can execute arbitrary code; only load a trusted file here.
# The context manager closes the file handle once the object is loaded.
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqs_file:
    allSessionsSQS = pickle.load(sqs_file)
allQueries = allSessionsSQS['query'].tolist()
# Same list object as allQueries (no de-duplication is applied).
setQueries = allQueries

In [11]:
# `queries` is not defined anywhere in this notebook; use the loaded query list.
lexical = getLex(setQueries)

2023-03-09 00:38:15 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-03-09 00:38:16 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2023-03-09 00:38:16 INFO: Use device: cpu
2023-03-09 00:38:16 INFO: Loading: tokenize
2023-03-09 00:38:16 INFO: Loading: pos
2023-03-09 00:38:17 INFO: Loading: lemma
2023-03-09 00:38:17 INFO: Loading: depparse
2023-03-09 00:38:17 INFO: Loading: sentiment
2023-03-09 00:38:17 INFO: Loading: constituency
2023-03-09 00:38:17 INFO: Loading: ner
2023-03-09 00:38:18 INFO: Done loading processors!
100%|██████████| 1505/1505 [05:27<00:00,  4.60it/s]


In [12]:
lexical.head()

Unnamed: 0,query,ld,ls1,ls2,vs1,vs2,cvs1,ndw,ttr,cttr,rttr,lv,vv1,svv1,cvv1,vv2,nv,adjv
0,becoming a fireman,0.666667,0.5,0.333333,0.0,0.0,0.0,3,1.0,1.224745,1.732051,1.0,1.0,1.0,0.707107,0.5,1.0,0.0
1,hotel in Pocono Mountains,0.75,0.666667,0.5,0.0,0.0,0.0,4,1.0,1.414214,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,wedding traditions buddhism,1.0,0.666667,0.666667,0.0,0.0,0.0,3,1.0,1.224745,1.732051,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,diversification in hiring,0.666667,1.0,0.666667,0.0,0.0,0.0,3,1.0,1.224745,1.732051,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,traiditional swahili recipes,1.0,1.0,1.0,0.0,0.0,0.0,3,1.0,1.224745,1.732051,1.0,0.0,0.0,0.0,0.0,1.0,0.333333


In [13]:
lexical.columns

Index(['query', 'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1', 'ndw', 'ttr', 'cttr',
       'rttr', 'lv', 'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv'],
      dtype='object')

In [14]:
lexical.shape

(1505, 18)

# Extract Lexical Characteristics

This block of code extracts lexical characteristics from each query and starts building a dataframe for the values of these features. Warnings are filtered out because numbers, as well as characters with modifiers such as umlauts, cause the syllable tokenizer to emit a 

*UserWarning: Character not defined in sonority_hierarchy*

which leads to the character/number being recast as the same symbol, but in a way that is recognized by NLTK.

In [15]:
# Silence only UserWarning (the category of the sonority-hierarchy warning)
# instead of every warning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

totalSyl = []
avgSyl = []
simWords = []
comWords = []
mostSyl = []
leastSyl = []
SSP = SyllableTokenizer()

for text in tqdm(setQueries):
    # Syllable count of each whitespace-separated word. split() without an
    # argument skips empty strings produced by repeated, leading or trailing
    # whitespace, which keeps the word count consistent with numWords below.
    syllableCounts = [len(SSP.tokenize(word)) for word in text.split()]
    running = sum(syllableCounts)
    # A word is simple when it has fewer than 3 syllables, otherwise complex.
    complexWords = sum(1 for syl in syllableCounts if syl >= 3)
    simpleWords = len(syllableCounts) - complexWords

    totalSyl.append(running)
    # A query without any word gets 0 for the average and the extremes.
    avgSyl.append(running / len(syllableCounts) if syllableCounts else 0)
    simWords.append(simpleWords)
    comWords.append(complexWords)
    mostSyl.append(max(syllableCounts, default=0))
    leastSyl.append(min(syllableCounts, default=0))

lexChar = pd.DataFrame({'query': setQueries})

lexChar['totalSyl'] = totalSyl
lexChar['avgSyl'] = avgSyl
lexChar['simWords'] = simWords
lexChar['comWords'] = comWords
lexChar['greatestSyl'] = mostSyl
lexChar['leastSyl'] = leastSyl
lexChar['numChars'] = lexChar['query'].str.len()
lexChar['numWords'] = lexChar['query'].str.split().str.len()
lexChar['avgLenWord'] = lexChar['numChars']/lexChar['numWords']

# lexChar['qID'] = qID


100%|██████████| 1505/1505 [00:00<00:00, 9084.33it/s] 


In [16]:
lexChar.shape


(1505, 10)

# Extract Lexical Complexity

This block of code runs the previously defined functions that extract the features corresponding to lexical complexity. This can be very slow/time consuming.

In [17]:
allSessionsSQS

Unnamed: 0,query,class,sID
0,becoming a fireman,0,3199
1,hotel in Pocono Mountains,0,2515
2,wedding traditions buddhism,0,2823
3,diversification in hiring,0,3033
4,traiditional swahili recipes,0,3145
...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,1,3256
1501,What is a fox's favorite kind of food?,1,2859
1502,"Show me the movie called ""The Martian""",1,3208
1503,What is the biggest rock found on Mars?,1,2676


In [18]:
lexComplex = getLex(setQueries)


2023-03-09 00:43:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-03-09 00:43:47 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2023-03-09 00:43:47 INFO: Use device: cpu
2023-03-09 00:43:47 INFO: Loading: tokenize
2023-03-09 00:43:47 INFO: Loading: pos
2023-03-09 00:43:47 INFO: Loading: lemma
2023-03-09 00:43:47 INFO: Loading: depparse
2023-03-09 00:43:47 INFO: Loading: sentiment
2023-03-09 00:43:48 INFO: Loading: constituency
2023-03-09 00:43:48 INFO: Loading: ner
2023-03-09 00:43:48 INFO: Done loading processors!
100%|██████████| 1505/1505 [05:43<00:00,  4.38it/s]


In [19]:
# -- S


In [20]:
lexComplex.columns


Index(['query', 'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1', 'ndw', 'ttr', 'cttr',
       'rttr', 'lv', 'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv'],
      dtype='object')

In [21]:
lexComplex.shape


(1505, 18)

In [22]:
lexComplex.head(2)


Unnamed: 0,query,ld,ls1,ls2,vs1,vs2,cvs1,ndw,ttr,cttr,rttr,lv,vv1,svv1,cvv1,vv2,nv,adjv
0,becoming a fireman,0.666667,0.5,0.333333,0.0,0.0,0.0,3,1.0,1.224745,1.732051,1.0,1.0,1.0,0.707107,0.5,1.0,0.0
1,hotel in Pocono Mountains,0.75,0.666667,0.5,0.0,0.0,0.0,4,1.0,1.414214,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
lexChar.shape


(1505, 10)

In [24]:
lexComplex.columns


Index(['query', 'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1', 'ndw', 'ttr', 'cttr',
       'rttr', 'lv', 'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv'],
      dtype='object')

In [25]:
lexChar.head(2)


Unnamed: 0,query,totalSyl,avgSyl,simWords,comWords,greatestSyl,leastSyl,numChars,numWords,avgLenWord
0,becoming a fireman,7,2.333333,1,2,3,1,18,3,6.0
1,hotel in Pocono Mountains,8,2.0,3,1,3,1,25,4,6.25


In [26]:
lexChar.columns

Index(['query', 'totalSyl', 'avgSyl', 'simWords', 'comWords', 'greatestSyl',
       'leastSyl', 'numChars', 'numWords', 'avgLenWord'],
      dtype='object')

In [27]:
# Join the two feature frames row-by-row on their index; both frames keep
# their own 'query' column (suffixed _x / _y).
ddd = lexComplex.merge(lexChar, left_index=True, right_index=True)
ddd.shape


(1505, 28)

In [28]:
# Join on the query text instead of the index. The recorded output has more
# rows than either input -- presumably repeated query strings match
# many-to-many; verify before using this form.
aabbcc = lexComplex.merge(lexChar, on='query')
aabbcc.shape


(1527, 27)

In [29]:
aabbcc.columns


Index(['query', 'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1', 'ndw', 'ttr', 'cttr',
       'rttr', 'lv', 'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv', 'totalSyl',
       'avgSyl', 'simWords', 'comWords', 'greatestSyl', 'leastSyl', 'numChars',
       'numWords', 'avgLenWord'],
      dtype='object')

In [30]:
# -- E

In [31]:
# lexChar.to_csv('lexCharTest.csv', index = False)
# lexComplex.to_csv('lexComplexTest.csv', index = False)


In [32]:
pwd


'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

# Combine And Return Feature Set

This block of code combines all features into one dataframe and outputs that combination as a pickle.

In [33]:
# lexicalFeatures = pd.merge(lexComplex, lexChar, on='query')
# pickle.dump(lexicalFeatures, open( "Pickles/LexFeat.p", "wb" ) )

# Both frames were built from setQueries in the same order, so rows are joined
# by position. Verify that assumption instead of relying on it silently.
assert lexComplex['query'].equals(lexChar['query']), \
    "lexComplex and lexChar rows are not aligned on 'query'"

# Drop the duplicate 'query' column before joining so no _x/_y suffixes are
# created and no in-place clean-up is needed afterwards.
lexicalFeatures = lexComplex.merge(
    lexChar.drop(columns=['query']), left_index=True, right_index=True
)

In [34]:
lexicalFeatures.head(3)

Unnamed: 0,query,ld,ls1,ls2,vs1,vs2,cvs1,ndw,ttr,cttr,...,adjv,totalSyl,avgSyl,simWords,comWords,greatestSyl,leastSyl,numChars,numWords,avgLenWord
0,becoming a fireman,0.666667,0.5,0.333333,0.0,0.0,0.0,3,1.0,1.224745,...,0.0,7,2.333333,1,2,3,1,18,3,6.0
1,hotel in Pocono Mountains,0.75,0.666667,0.5,0.0,0.0,0.0,4,1.0,1.414214,...,0.0,8,2.0,3,1,3,1,25,4,6.25
2,wedding traditions buddhism,1.0,0.666667,0.666667,0.0,0.0,0.0,3,1.0,1.224745,...,0.0,7,2.333333,2,1,3,2,27,3,9.0


In [35]:
lexicalFeatures.columns


Index(['query', 'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1', 'ndw', 'ttr', 'cttr',
       'rttr', 'lv', 'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv', 'totalSyl',
       'avgSyl', 'simWords', 'comWords', 'greatestSyl', 'leastSyl', 'numChars',
       'numWords', 'avgLenWord'],
      dtype='object')

In [36]:
lexicalFeatures.shape

(1505, 27)

In [37]:
# Write through a context manager so the file is flushed and closed.
with open("Pickles/LexFeat.p", "wb") as feature_file:
    pickle.dump(lexicalFeatures, feature_file)

In [38]:
print('done')


done


In [39]:
# Expected shape: 1505, 27

In [40]:
27 + 5 + 41 + 34

107

In [3]:
sss = SyllableTokenizer()

In [4]:
sss.tokenize('one')

['o', 'ne']

In [5]:
sss.tokenize('athlete')

['at', 'hle', 'te']