This notebook extracts syntactic features from the queries found in SWC and SQS, returning a data frame containing those features.

# Import Libraries
The following block of code loads all libraries needed for this notebook.

In [1]:
import nltk
import os
import pickle
import re
import shlex
import stanza
import subprocess
import time

import pandas as pd
import numpy as np

from nltk import word_tokenize
from nltk.tokenize import SyllableTokenizer
from subprocess import Popen, PIPE
from tqdm import tqdm

# Declare Functions

The following block of code declares functions used in this notebook.

In [2]:
def generate_ngrams(s, n):
    """Generate word-level n-grams from a string.

    The string is lowercased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and the result is split
    into tokens on any run of whitespace.

    param s: is the string passed into this function
    param n: is the n in n-grams (number of consecutive tokens per n-gram)
    returns: the n-grams, as a list of strings whose tokens are joined by a
             single space; the list is empty when s has fewer than n tokens
    """
    # Convert to lowercase and replace all non-alphanumeric characters with spaces
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())

    # The regex above keeps tabs and newlines, so split on ANY whitespace run
    # (a plain split(" ") would leave "a\tb" fused as a single token).
    # split() with no argument also discards empty tokens.
    tokens = cleaned.split()

    # Zip n progressively shifted views of the token list so that each tuple
    # holds n consecutive tokens, then concatenate each tuple into an n-gram.
    shifted = [tokens[offset:] for offset in range(n)]
    return [" ".join(ngram) for ngram in zip(*shifted)]

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [3]:
# Load both pickled data sets. The files are opened with context managers so
# the handles are closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../Data/DataSets/SWC/SWC.p", "rb") as swcFile:
    allSessions = pickle.load(swcFile)
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqsFile:
    allSessionsSQS = pickle.load(sqsFile)

# Pool the 'query' column of both data sets and keep each distinct query once.
allQueries = allSessions['query'].tolist() + allSessionsSQS['query'].tolist()
setQueries = set(allQueries)

# Extract D-Level Features

The following block of code extracts D-Level features from each query. This code is extremely slow because it makes system calls that execute an external program for every query. I have encountered difficulties getting this code to run before, as COLLINS-PARSER/code is compiled C code that may need to be recompiled to ensure compatibility with the processor. The solution is to run `make clean` and then `make` again. Further information about this suite of code can be found at:

http://www.personal.psu.edu/xxl13/downloads/d-level.html

In [None]:
# Scratch file that receives the tagged sentences of the query currently being
# processed. input_file is the path as seen from the notebook; loc_file is the
# same file as seen from inside COLLINS-PARSER, where the parser is run.
input_file = 'DLA/data/lemmatize_pos_sentences.tagged'
loc_file =  '../../data/lemmatize_pos_sentences.tagged'

# NOTE(review): the saved output of this cell lists the 'ewt' package for every
# processor, which does not match this dictionary -- confirm which packages
# are actually intended.
processor_dict = {
    'tokenize': 'gsd',
    'pos': 'bnc',
    'lemma': 'default'
}

nlp = stanza.Pipeline('en', processors=processor_dict)

# The shell pipeline is the same for every query (only the contents of the
# scratch files change), so it is built once: parse the tagged sentences with
# the Collins parser, then run d-level.py on the parse.
cmd = 'cd DLA/d-level-analyzer/COLLINS-PARSER;'
cmd += ' code/parser {} models/model2/grammar 10000 1 1 1 1 > ../../data/parsed.m2;'.format(loc_file)
cmd += 'cd ..;'
cmd += 'python d-level.py ../data/parsed.m2 > ../data/dlevel.dla;'

# One data frame per query; they are concatenated once after the loop instead
# of growing a frame with DataFrame.append (quadratic, removed in pandas 2.0).
# If the loop is interrupted, the frames collected so far remain in this list.
dLevelFrames = []

for text in tqdm(setQueries):
    doc = nlp(text)

    # One line per sentence: "<word count> <lemma> <tag> <lemma> <tag> ...".
    with open(input_file, 'w') as out:
        for sentence in doc.sentences:
            # needs to be xpos so it uses Penn Treebank
            pairs = ['{} {}'.format(word.lemma, word.xpos) for word in sentence.words]
            out.write('{} {}\n'.format(len(pairs), ' '.join(pairs).strip()))

    # subprocess.run drains both pipes while waiting; Popen(...).wait() with
    # PIPE can deadlock once the child fills a pipe buffer.
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # With ';' separators the exit status is that of d-level.py. Failing
        # loudly avoids reading an empty or stale dlevel.dla.
        raise RuntimeError(
            'D-level analysis failed for query {!r} (exit status {}): {}'.format(
                text, result.returncode, result.stderr.decode(errors='replace')
            )
        )

    dl = pd.read_csv('DLA/data/dlevel.dla')
    dl['query'] = text
    dLevelFrames.append(dl)

# Each frame keeps its own index, exactly as repeated append did.
dLevel = pd.concat(dLevelFrames) if dLevelFrames else pd.DataFrame(columns=['query'])

2021-11-02 13:57:32 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-11-02 13:57:32 INFO: Use device: cpu
2021-11-02 13:57:32 INFO: Loading: tokenize
2021-11-02 13:57:32 INFO: Loading: pos
2021-11-02 13:57:33 INFO: Loading: lemma
2021-11-02 13:57:33 INFO: Loading: depparse
2021-11-02 13:57:34 INFO: Loading: sentiment
2021-11-02 13:57:35 INFO: Loading: ner
2021-11-02 13:57:36 INFO: Done loading processors!
 19%|█▉        | 13184/70112 [4:33:53<14:01:53,  1.13it/s]    

# Extract Part of Speech Features

The following block of code first generates part-of-speech uni-grams, bi-grams, and tri-grams for each query, then keeps the top 10 most common bi-grams and the top 5 most common tri-grams (as determined by initial research), returning, for each query, the ratio of each n-gram's count to the length of the query.

In [None]:
def _ngram_count_frame(frame, column):
    """Count how often each n-gram occurs in every query.

    param frame: data frame with a 'query' column and a column of n-gram lists
    param column: name of the column holding the n-gram lists
    returns: a data frame with 'query' first, followed by one count column per
             distinct n-gram; a query without any n-gram gets 0 in every
             count column
    """
    # explode() gives one row per n-gram while keeping the row label of the
    # query, so grouping on that label sums the one-hot rows back per query.
    # (sum(level=0) did the same but was removed in pandas 2.0.)
    counts = pd.get_dummies(frame[column].explode()).groupby(level=0).sum()
    return pd.concat([frame[['query']], counts], axis=1)


# Freeze the iteration order once so every list below lines up row by row.
queries = list(setQueries)

# POS tags of each query, in token order.
posData = [
    [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(query))]
    for query in queries
]

# The tags of each query as one string, every tag followed by a space.
posMod = ["".join(str(tag) + " " for tag in tags) for tags in posData]

# generate_ngrams lowercases the tags and strips punctuation from them.
posUni = [generate_ngrams(tagString, 1) for tagString in posMod]
posBi = [generate_ngrams(tagString, 2) for tagString in posMod]
posTri = [generate_ngrams(tagString, 3) for tagString in posMod]

posDF = pd.DataFrame({'query': queries})
posDF['all'] = posMod
posDF['uniPos'] = posUni
posDF['biPos'] = posBi
posDF['triPos'] = posTri

# One frame per n-gram order: 'query' plus the per-query n-gram counts.
allSessionsuni = _ngram_count_frame(posDF, 'uniPos')
allSessionsbi = _ngram_count_frame(posDF, 'biPos')
allSessionstri = _ngram_count_frame(posDF, 'triPos')

In [None]:
# Keep only the ten selected POS bi-gram columns, plus the 'query' key that
# the later merges rely on.
allSessionsbiLanding = allSessionsbi.loc[:, [
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
    'query',
]]

In [None]:
# Keep only the five selected POS tri-gram columns, plus the 'query' key that
# the later merges rely on.
allSessionstriLanding = allSessionstri.loc[:, [
    'jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn',
    'query',
]]

In [None]:
# Join the uni-gram counts with the selected bi-gram and tri-gram counts.
# 'query' is the only column the three frames share, so it is named
# explicitly as the key. The tri-gram frame is merged once; the original
# second merge of the same frame added nothing.
synFeats = (
    allSessionsuni
    .merge(allSessionsbiLanding, on='query')
    .merge(allSessionstriLanding, on='query')
    .fillna(0)
)

# Every column except the 'query' key holds an n-gram count.
listCols = [col for col in synFeats.columns if col != 'query']

# Query length in whitespace-separated words.
# NOTE(review): the POS tags come from nltk.word_tokenize, whose token count
# can differ from this whitespace count -- confirm this is the intended
# denominator. A query with no words gives a length of 0 and therefore
# NaN/inf ratios.
synFeats['length'] = synFeats['query'].str.split().str.len()

# Turn the counts into ratios: divide every count column by the query length.
synFeats[listCols] = synFeats[listCols].div(synFeats['length'], axis=0)

# Return Feature Set

Combines all data frames into one, preprocesses out extraneous information and returns the cleaned data frame.

In [None]:
# Attach the D-Level features to the POS features (only queries present in
# both frames are kept) and drop the columns that are not features.
synFeats = (
    synFeats
    .merge(dLevel, on = 'query')
    .drop(columns = [' Sentences', 'length', 'Filename'])
)

# Write through a context manager so the pickle is flushed and the file
# handle closed before the cell finishes.
with open("Pickles/SynFeat.p", "wb") as synFeatFile:
    pickle.dump(synFeats, synFeatFile)