This notebook extracts syntatical features from the queries found in SQS, returning a data frame containing those features.

# Import Libraries
The following block of code loads all libraries needed for this notebook.

In [219]:
import nltk
import os
import pickle
import re
import shlex
import stanza
import subprocess
import time

import pandas as pd
import numpy as np

from nltk import word_tokenize
from nltk.tokenize import SyllableTokenizer
from subprocess import Popen, PIPE
from tqdm import tqdm

# Declare Functions

The following block of code declares functions used in this notebook.

In [220]:
# This function generates n-grams generated from a string.
#
# param s: is the string passed into this function
# param n: is the n in n-grams
# returns: the n-grams

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s) # \s white, ^ : negation
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [221]:
n = "aa bb cc"
token=n.split(' ')
token

['aa', 'bb', 'cc']

In [222]:
jj=list(zip(*[token[i:] for i in range(2)]))
jj

[('aa', 'bb'), ('bb', 'cc')]

In [223]:
list(jj)

[('aa', 'bb'), ('bb', 'cc')]

In [224]:
jj

[('aa', 'bb'), ('bb', 'cc')]

In [225]:
[' '.join(n) for n in jj]

['aa bb', 'bb cc']

In [226]:
jj=[token[i:] for i in range(5)]
jj

[['aa', 'bb', 'cc'], ['bb', 'cc'], ['cc'], [], []]

In [227]:
zip(*jj)

<zip at 0x7fab8876f840>

In [228]:
list(zip(*jj))

[]

In [229]:
'_'.join(['Redempta', 'is', 'from', 'Kigali'])

'Redempta_is_from_Kigali'

In [230]:
list(zip([1,2,3], [5,6,7]))

[(1, 5), (2, 6), (3, 7)]

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [231]:

allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
allSessionsSQS = allSessionsSQS.drop(['sID'], axis=1) # ---- added
allQueries = allSessionsSQS['query'].tolist()
# allQueries = allSessionsSQS.tolist()
setQueries = allQueries


In [232]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

In [233]:
allSessionsSQS

Unnamed: 0,query,class
0,becoming a fireman,0
1,hotel in Pocono Mountains,0
2,wedding traditions buddhism,0
3,diversification in hiring,0
4,traiditional swahili recipes,0
...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,1
1501,What is a fox's favorite kind of food?,1
1502,"Show me the movie called ""The Martian""",1
1503,What is the biggest rock found on Mars?,1


In [234]:
allQueries

['becoming a fireman',
 'hotel in Pocono Mountains',
 'wedding traditions buddhism',
 'diversification in hiring',
 'traiditional swahili recipes',
 'Boolywood in hollywood',
 'week long vacation idea',
 'efficiency of solar panels',
 "bollywood's increasing popularity",
 'New York City',
 'growth in bollywood government',
 'regular phones vs internet phones voip experiences cheapest',
 'designer dogs moral',
 'best cities planning a trip to the United States',
 'healthcare UK',
 'Eurozone crisis history',
 'what is wrong with designer dogs',
 'business phd',
 'main Swahili dishes',
 'American Cities close to tourist attractions',
 'merck scandal',
 'Pocono Mountains  things to do',
 'Connecticut Fire Academy  ',
 'AIDS, Africa, charities',
 'Kenya',
 'causes for pseudocyesis',
 'best vacation spots',
 'Water turbines types',
 'History of Churchill downs',
 'cross breeding dogs',
 'silicone rook coating',
 'jersey shore',
 'Martin Bryant homicide',
 'students evaluating teachers',
 'sw

In [235]:
len(allQueries)

1505

In [236]:
len(setQueries)


1505

# Extract D-Level Features

The following block of code extracts D-Level features from each query. This code is extremely slow as it is making system calls which execute another block of code. I have encountered difficulties with getting this code to run before, as COLLINS-PARSER/code is compiled C code that may need to be recompiled to ensure compatibility with processor. The solution is to run the make clean, and then make again. Further information about this suite of code can be found at:

http://www.personal.psu.edu/xxl13/downloads/d-level.html

In [237]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

The following code compute D-level

In [238]:
# count = 0

# input_file = 'DLA/data/lemmatize_pos_sentences.tagged'
# loc_file =  '../../data/lemmatize_pos_sentences.tagged'

# processor_dict = {
#     'tokenize': 'gsd',
#     'pos': 'bnc',
#     'lemma': 'default'
# }

# nlp = stanza.Pipeline('en', processors=processor_dict)

# from tqdm import tqdm
# with tqdm(total = len(setQueries) ) as pbar:
#     for text in setQueries:
#         doc = nlp(text)
#         out = open(input_file, 'w')
        
#         for sentence in doc.sentences:
#             s = ''
#             l = 0
#             for word in sentence.words:
#                 s+='{} {}'.format(word.lemma, word.xpos) + ' ' # needs to be xpos so it uses Penn Treebank
#                 l+=1
#             out.write('{} {}\n'.format(l, s.strip()))
#         out.close()
        
#         cmd = 'cd DLA/d-level-analyzer/COLLINS-PARSER;'
#         cmd += ' code/parser {} models/model2/grammar 10000 1 1 1 1 > ../../data/parsed.m2;'.format(loc_file)
#         cmd += 'cd ..;'
#         cmd += 'python d-level.py ../data/parsed.m2 > ../data/dlevel.dla;'
#         proc = subprocess.Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True).wait()
#         if count == 0:
#             dl = pd.read_csv('DLA/data/dlevel.dla')
#             dl['query'] = text
#             dLevel = dl
#             count += 1
#         else:
#             dl = pd.read_csv('DLA/data/dlevel.dla')
#             dl['query'] = text
#             dLevel = dLevel.append(dl)
#         pbar.update()
        

In [239]:
# dLevel 

# Extract Part of Speech Features

The following block of code first generates part of speech uni-gram, bi-gram, and tri-gram for each query, then takes the top 10 most common bi-grams and top 5 most common tri-grams (was determined be initial research); returning the ratio of all n-grams for each query.

In [240]:
posData = [] # -- stores list of pos of each word per a query eg: [array(['NN', 'NN', 'NNS'], dtype='<U8'),
for document in setQueries:
    text = nltk.word_tokenize(document)
    tags = np.array(nltk.pos_tag(text)).flatten()
    posData.append(tags[1::2]) # -- select a pos 
#     print(document)
#     print(text)
#     print(tags)
#     print(posData)


posMod = [] # -- well presented pos of each word per a query eg: 'NN NN NNS '. --> the pos of each query is an element of the list

for pos in posData: 
    string = []
    for entry in pos:
        string += str(entry) + " "
#         print(string)
    posMod.append("".join(string))

    
posUni = []
posBi = []
posTri = []

for document in posMod:
    doc = generate_ngrams(document,1)
    posUni.append(doc)

for document in posMod:
    doc = generate_ngrams(document,2)
    posBi.append(doc)

for document in posMod:
    doc = generate_ngrams(document,3)
    posTri.append(doc)  
    
posDF = pd.DataFrame(setQueries)

posDF['all'] = posMod
posDF['uniPos'] = posUni
posDF['biPos'] = posBi
posDF['triPos']= posTri
posDF = posDF.rename(columns={0: "query"})

allSessionsuni = pd.concat([posDF,pd.get_dummies(posDF['uniPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['uniPos', 'all', 'biPos', 'triPos'],axis=1)
allSessionsbi = pd.concat([posDF,pd.get_dummies(posDF['biPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['biPos', 'uniPos', 'all', 'triPos'],axis=1)
allSessionstri = pd.concat([posDF,pd.get_dummies(posDF['triPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['uniPos', 'all', 'biPos', 'triPos'],axis=1)

  allSessionsuni = pd.concat([posDF,pd.get_dummies(posDF['uniPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['uniPos', 'all', 'biPos', 'triPos'],axis=1)
  allSessionsbi = pd.concat([posDF,pd.get_dummies(posDF['biPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['biPos', 'uniPos', 'all', 'triPos'],axis=1)
  allSessionstri = pd.concat([posDF,pd.get_dummies(posDF['triPos'].apply(pd.Series).stack()).sum(level=0)],axis=1).drop(['uniPos', 'all', 'biPos', 'triPos'],axis=1)


In [241]:
# --- S

In [242]:
print(document)


WP VBZ DT JJ NN DT NN . 


In [243]:
print(text)


['What', 'is', 'the', 'top', 'game', 'this', 'week', '?']


In [244]:
print(tags)

['What' 'WP' 'is' 'VBZ' 'the' 'DT' 'top' 'JJ' 'game' 'NN' 'this' 'DT'
 'week' 'NN' '?' '.']


In [245]:
print(tags[1::2])


['WP' 'VBZ' 'DT' 'JJ' 'NN' 'DT' 'NN' '.']


In [246]:
print(posData)

[array(['VBG', 'DT', 'NN'], dtype='<U8'), array(['NN', 'IN', 'NNP', 'NNP'], dtype='<U9'), array(['VBG', 'NNS', 'NN'], dtype='<U10'), array(['NN', 'IN', 'VBG'], dtype='<U15'), array(['JJ', 'NN', 'NNS'], dtype='<U12'), array(['NN', 'IN', 'NN'], dtype='<U9'), array(['NN', 'JJ', 'NN', 'NN'], dtype='<U8'), array(['NN', 'IN', 'JJ', 'NNS'], dtype='<U10'), array(['NN', 'POS', 'VBG', 'NN'], dtype='<U10'), array(['NNP', 'NNP', 'NNP'], dtype='<U4'), array(['NN', 'IN', 'JJ', 'NN'], dtype='<U10'), array(['JJ', 'NNS', 'FW', 'NN', 'NNS', 'JJ', 'NNS', 'VBP'], dtype='<U11'), array(['NN', 'NNS', 'JJ'], dtype='<U8'), array(['JJS', 'NNS', 'VBG', 'DT', 'NN', 'TO', 'DT', 'NNP', 'NNPS'],
      dtype='<U8'), array(['NN', 'NNP'], dtype='<U10'), array(['NN', 'NN', 'NN'], dtype='<U8'), array(['WP', 'VBZ', 'JJ', 'IN', 'JJ', 'NNS'], dtype='<U8'), array(['NN', 'NN'], dtype='<U8'), array(['JJ', 'NNP', 'NNS'], dtype='<U7'), array(['NNP', 'NNP', 'RB', 'TO', 'VB', 'NNS'], dtype='<U11'), array(['NN', 'NN'], dtype='<U7')

In [247]:
print(posData.append(tags[1::2]))

None


In [248]:
print(posMod)

['VBG DT NN ', 'NN IN NNP NNP ', 'VBG NNS NN ', 'NN IN VBG ', 'JJ NN NNS ', 'NN IN NN ', 'NN JJ NN NN ', 'NN IN JJ NNS ', 'NN POS VBG NN ', 'NNP NNP NNP ', 'NN IN JJ NN ', 'JJ NNS FW NN NNS JJ NNS VBP ', 'NN NNS JJ ', 'JJS NNS VBG DT NN TO DT NNP NNPS ', 'NN NNP ', 'NN NN NN ', 'WP VBZ JJ IN JJ NNS ', 'NN NN ', 'JJ NNP NNS ', 'NNP NNP RB TO VB NNS ', 'NN NN ', 'NNP NNP NNS TO VB ', 'NNP NNP NNP ', 'NNP , NNP , NNS ', 'NNP ', 'NNS IN NN ', 'JJS NN NNS ', 'NNP NNS NNS ', 'NN IN NNP NNS ', 'NN NN NNS ', 'NN NN NN ', 'NN NN ', 'NNP NNP NN ', 'NNS VBG NNS ', 'NN NN ', 'WP VBZ VBG VBN IN NN ', 'NN NNS VBP ', 'NN NNS ', 'NNS VBP NN NNS ', 'NN NNS ', 'JJS NNS IN NN NN NN ', 'NNP NNP NNP JJ NN NN ', 'JJ NN IN JJ NN NNS ', 'NN JJ NN ', 'NN ', 'JJ NN NN JJ ', 'JJ NN NN ', 'NN VBZ NN ', 'NN NNS NN NNS ', 'JJ NN ', 'NNS NN NN NN ', 'JJ NN ', 'NN RB JJS NNS NN NN ', 'WP VBD DT NN POS NN . ', 'NN NN NNS ', 'JJ NN ', 'NN NN NN CD ', 'NNP NN NNS ', 'JJ VBG NN NN CD NN NN ', 'NN NN NNS ', 'JJ NN NNS ', 

In [249]:
print(posUni)

[['vbg', 'dt', 'nn'], ['nn', 'in', 'nnp', 'nnp'], ['vbg', 'nns', 'nn'], ['nn', 'in', 'vbg'], ['jj', 'nn', 'nns'], ['nn', 'in', 'nn'], ['nn', 'jj', 'nn', 'nn'], ['nn', 'in', 'jj', 'nns'], ['nn', 'pos', 'vbg', 'nn'], ['nnp', 'nnp', 'nnp'], ['nn', 'in', 'jj', 'nn'], ['jj', 'nns', 'fw', 'nn', 'nns', 'jj', 'nns', 'vbp'], ['nn', 'nns', 'jj'], ['jjs', 'nns', 'vbg', 'dt', 'nn', 'to', 'dt', 'nnp', 'nnps'], ['nn', 'nnp'], ['nn', 'nn', 'nn'], ['wp', 'vbz', 'jj', 'in', 'jj', 'nns'], ['nn', 'nn'], ['jj', 'nnp', 'nns'], ['nnp', 'nnp', 'rb', 'to', 'vb', 'nns'], ['nn', 'nn'], ['nnp', 'nnp', 'nns', 'to', 'vb'], ['nnp', 'nnp', 'nnp'], ['nnp', 'nnp', 'nns'], ['nnp'], ['nns', 'in', 'nn'], ['jjs', 'nn', 'nns'], ['nnp', 'nns', 'nns'], ['nn', 'in', 'nnp', 'nns'], ['nn', 'nn', 'nns'], ['nn', 'nn', 'nn'], ['nn', 'nn'], ['nnp', 'nnp', 'nn'], ['nns', 'vbg', 'nns'], ['nn', 'nn'], ['wp', 'vbz', 'vbg', 'vbn', 'in', 'nn'], ['nn', 'nns', 'vbp'], ['nn', 'nns'], ['nns', 'vbp', 'nn', 'nns'], ['nn', 'nns'], ['jjs', 'nns'

In [250]:
print(posBi)

[['vbg dt', 'dt nn'], ['nn in', 'in nnp', 'nnp nnp'], ['vbg nns', 'nns nn'], ['nn in', 'in vbg'], ['jj nn', 'nn nns'], ['nn in', 'in nn'], ['nn jj', 'jj nn', 'nn nn'], ['nn in', 'in jj', 'jj nns'], ['nn pos', 'pos vbg', 'vbg nn'], ['nnp nnp', 'nnp nnp'], ['nn in', 'in jj', 'jj nn'], ['jj nns', 'nns fw', 'fw nn', 'nn nns', 'nns jj', 'jj nns', 'nns vbp'], ['nn nns', 'nns jj'], ['jjs nns', 'nns vbg', 'vbg dt', 'dt nn', 'nn to', 'to dt', 'dt nnp', 'nnp nnps'], ['nn nnp'], ['nn nn', 'nn nn'], ['wp vbz', 'vbz jj', 'jj in', 'in jj', 'jj nns'], ['nn nn'], ['jj nnp', 'nnp nns'], ['nnp nnp', 'nnp rb', 'rb to', 'to vb', 'vb nns'], ['nn nn'], ['nnp nnp', 'nnp nns', 'nns to', 'to vb'], ['nnp nnp', 'nnp nnp'], ['nnp nnp', 'nnp nns'], [], ['nns in', 'in nn'], ['jjs nn', 'nn nns'], ['nnp nns', 'nns nns'], ['nn in', 'in nnp', 'nnp nns'], ['nn nn', 'nn nns'], ['nn nn', 'nn nn'], ['nn nn'], ['nnp nnp', 'nnp nn'], ['nns vbg', 'vbg nns'], ['nn nn'], ['wp vbz', 'vbz vbg', 'vbg vbn', 'vbn in', 'in nn'], ['nn

In [251]:
print(posTri)

[['vbg dt nn'], ['nn in nnp', 'in nnp nnp'], ['vbg nns nn'], ['nn in vbg'], ['jj nn nns'], ['nn in nn'], ['nn jj nn', 'jj nn nn'], ['nn in jj', 'in jj nns'], ['nn pos vbg', 'pos vbg nn'], ['nnp nnp nnp'], ['nn in jj', 'in jj nn'], ['jj nns fw', 'nns fw nn', 'fw nn nns', 'nn nns jj', 'nns jj nns', 'jj nns vbp'], ['nn nns jj'], ['jjs nns vbg', 'nns vbg dt', 'vbg dt nn', 'dt nn to', 'nn to dt', 'to dt nnp', 'dt nnp nnps'], [], ['nn nn nn'], ['wp vbz jj', 'vbz jj in', 'jj in jj', 'in jj nns'], [], ['jj nnp nns'], ['nnp nnp rb', 'nnp rb to', 'rb to vb', 'to vb nns'], [], ['nnp nnp nns', 'nnp nns to', 'nns to vb'], ['nnp nnp nnp'], ['nnp nnp nns'], [], ['nns in nn'], ['jjs nn nns'], ['nnp nns nns'], ['nn in nnp', 'in nnp nns'], ['nn nn nns'], ['nn nn nn'], [], ['nnp nnp nn'], ['nns vbg nns'], [], ['wp vbz vbg', 'vbz vbg vbn', 'vbg vbn in', 'vbn in nn'], ['nn nns vbp'], [], ['nns vbp nn', 'vbp nn nns'], [], ['jjs nns in', 'nns in nn', 'in nn nn', 'nn nn nn'], ['nnp nnp nnp', 'nnp nnp jj', 'nn

In [252]:
print(posDF)

                                                  query  \
0                                    becoming a fireman   
1                             hotel in Pocono Mountains   
2                           wedding traditions buddhism   
3                             diversification in hiring   
4                          traiditional swahili recipes   
...                                                 ...   
1500  Who plays the bad guy in Star Wars the Horde a...   
1501             What is a fox's favorite kind of food?   
1502             Show me the movie called "The Martian"   
1503            What is the biggest rock found on Mars?   
1504                    What is the top game this week?   

                                           all  \
0                                   VBG DT NN    
1                               NN IN NNP NNP    
2                                  VBG NNS NN    
3                                   NN IN VBG    
4                                   JJ NN

In [253]:
print(posDF)

                                                  query  \
0                                    becoming a fireman   
1                             hotel in Pocono Mountains   
2                           wedding traditions buddhism   
3                             diversification in hiring   
4                          traiditional swahili recipes   
...                                                 ...   
1500  Who plays the bad guy in Star Wars the Horde a...   
1501             What is a fox's favorite kind of food?   
1502             Show me the movie called "The Martian"   
1503            What is the biggest rock found on Mars?   
1504                    What is the top game this week?   

                                           all  \
0                                   VBG DT NN    
1                               NN IN NNP NNP    
2                                  VBG NNS NN    
3                                   NN IN VBG    
4                                   JJ NN

In [254]:
dt = pd.DataFrame(posDF)
dt

Unnamed: 0,query,all,uniPos,biPos,triPos
0,becoming a fireman,VBG DT NN,"[vbg, dt, nn]","[vbg dt, dt nn]",[vbg dt nn]
1,hotel in Pocono Mountains,NN IN NNP NNP,"[nn, in, nnp, nnp]","[nn in, in nnp, nnp nnp]","[nn in nnp, in nnp nnp]"
2,wedding traditions buddhism,VBG NNS NN,"[vbg, nns, nn]","[vbg nns, nns nn]",[vbg nns nn]
3,diversification in hiring,NN IN VBG,"[nn, in, vbg]","[nn in, in vbg]",[nn in vbg]
4,traiditional swahili recipes,JJ NN NNS,"[jj, nn, nns]","[jj nn, nn nns]",[jj nn nns]
...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,WP VBZ DT JJ NN IN NNP NNP DT NNP NNS .,"[wp, vbz, dt, jj, nn, in, nnp, nnp, dt, nnp, nns]","[wp vbz, vbz dt, dt jj, jj nn, nn in, in nnp, ...","[wp vbz dt, vbz dt jj, dt jj nn, jj nn in, nn ..."
1501,What is a fox's favorite kind of food?,WP VBZ DT NN POS JJ NN IN NN .,"[wp, vbz, dt, nn, pos, jj, nn, in, nn]","[wp vbz, vbz dt, dt nn, nn pos, pos jj, jj nn,...","[wp vbz dt, vbz dt nn, dt nn pos, nn pos jj, p..."
1502,"Show me the movie called ""The Martian""",VB PRP DT NN VBN `` DT JJ '',"[vb, prp, dt, nn, vbn, dt, jj]","[vb prp, prp dt, dt nn, nn vbn, vbn dt, dt jj]","[vb prp dt, prp dt nn, dt nn vbn, nn vbn dt, v..."
1503,What is the biggest rock found on Mars?,WP VBZ DT JJS NN VBD IN NNS .,"[wp, vbz, dt, jjs, nn, vbd, in, nns]","[wp vbz, vbz dt, dt jjs, jjs nn, nn vbd, vbd i...","[wp vbz dt, vbz dt jjs, dt jjs nn, jjs nn vbd,..."


In [255]:
#-- chech what get_dummies() do
li = ['s', 'a', 't', np.nan]
print(pd.get_dummies(li))

   a  s  t
0  0  1  0
1  1  0  0
2  0  0  1
3  0  0  0


In [256]:
print(allSessionsuni)

                                                  query  cc  cd  dt  ex  fw  \
0                                    becoming a fireman   0   0   1   0   0   
1                             hotel in Pocono Mountains   0   0   0   0   0   
2                           wedding traditions buddhism   0   0   0   0   0   
3                             diversification in hiring   0   0   0   0   0   
4                          traiditional swahili recipes   0   0   0   0   0   
...                                                 ...  ..  ..  ..  ..  ..   
1500  Who plays the bad guy in Star Wars the Horde a...   0   0   2   0   0   
1501             What is a fox's favorite kind of food?   0   0   1   0   0   
1502             Show me the movie called "The Martian"   0   0   2   0   0   
1503            What is the biggest rock found on Mars?   0   0   1   0   0   
1504                    What is the top game this week?   0   0   2   0   0   

      in  jj  jjr  jjs  ...  to  vb  vbd  vbg  vbn 

In [257]:
dt1 = pd.DataFrame(allSessionsuni)
dt1

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,to,vb,vbd,vbg,vbn,vbp,vbz,wdt,wp,wrb
0,becoming a fireman,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,hotel in Pocono Mountains,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,wedding traditions buddhism,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,diversification in hiring,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,traiditional swahili recipes,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0,0,2,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1501,What is a fox's favorite kind of food?,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1502,"Show me the movie called ""The Martian""",0,0,2,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
1503,What is the biggest rock found on Mars?,0,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0


In [258]:
allSessionsuni.loc[allSessionsuni['wp']==1]

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,to,vb,vbd,vbg,vbn,vbp,vbz,wdt,wp,wrb
16,what is wrong with designer dogs,0,0,0,0,0,1,2,0,0,...,0,0,0,0,0,0,1,0,1,0
35,what is being done about eurozone,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,1,0,1,0
53,What was the killer's nationality?,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
84,what is collagen disease,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
130,What countries have banned the use of Red Bull,0,0,1,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499,What is the capitol of New York?,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1500,Who plays the bad guy in Star Wars the Horde a...,0,0,2,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1501,What is a fox's favorite kind of food?,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1503,What is the biggest rock found on Mars?,0,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0


In [259]:
allSessionsbi

Unnamed: 0,query,cc cd,cc dt,cc in,cc jj,cc nn,cc nnp,cc nns,cc vb,cc wp,...,wrb md,wrb nn,wrb nnp,wrb prp,wrb rb,wrb to,wrb vb,wrb vbd,wrb vbp,wrb vbz
0,becoming a fireman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hotel in Pocono Mountains,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wedding traditions buddhism,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,diversification in hiring,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,traiditional swahili recipes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501,What is a fox's favorite kind of food?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1502,"Show me the movie called ""The Martian""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1503,What is the biggest rock found on Mars?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [260]:
allSessionsbi.columns

Index(['query', 'cc cd', 'cc dt', 'cc in', 'cc jj', 'cc nn', 'cc nnp',
       'cc nns', 'cc vb', 'cc wp',
       ...
       'wrb md', 'wrb nn', 'wrb nnp', 'wrb prp', 'wrb rb', 'wrb to', 'wrb vb',
       'wrb vbd', 'wrb vbp', 'wrb vbz'],
      dtype='object', length=296)

In [261]:
allSessionsbi['cc cd'].unique()

array([ 0., nan,  1.])

In [262]:
allSessionsbi['nn nn'].unique()

array([ 0.,  1.,  2., nan,  3.,  4.])

In [263]:
allSessionsbi['wrb to'].unique()

array([ 0., nan,  1.])

In [264]:
allSessionsbi.loc[allSessionsbi['nn nn']==4]

Unnamed: 0,query,cc cd,cc dt,cc in,cc jj,cc nn,cc nnp,cc nns,cc vb,cc wp,...,wrb md,wrb nn,wrb nnp,wrb prp,wrb rb,wrb to,wrb vb,wrb vbd,wrb vbp,wrb vbz
305,internet phone service selection advice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
794,austerity unemployment debt eurozone crisis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
810,france world cup economy reaction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
936,france world cup 98 reaction stock market,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1233,monster high move boo york boo york,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [265]:
# allSessionsbi['wrb to'].is_unique

In [266]:
# allSessionsbi.loc[allSessionsbi['wrb to']==1]

### trigrams

In [267]:
allSessionstri

Unnamed: 0,query,cc dt jj,cc dt nn,cc in cd,cc in dt,cc in nnp,cc jj cd,cc jj nn,cc nn jj,cc nn nn,...,wrb vbp nns,wrb vbp prp,wrb vbp rb,wrb vbz dt,wrb vbz jj,wrb vbz nn,wrb vbz nnp,wrb vbz prp,wrb vbz rb,wrb vbz vbg
0,becoming a fireman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hotel in Pocono Mountains,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wedding traditions buddhism,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,diversification in hiring,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,traiditional swahili recipes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501,What is a fox's favorite kind of food?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1502,"Show me the movie called ""The Martian""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1503,What is the biggest rock found on Mars?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [268]:
# allSessionstri['jj to vb']

In [269]:
allSessionsuni.shape

(1505, 32)

In [270]:
allSessionsbi.shape

(1505, 296)

In [271]:
allSessionstri.shape

(1505, 856)

In [272]:
allSessionsuniLanding = allSessionsuni

In [273]:
allSessionsbiLanding = allSessionsbi[[
'nn nn',
'jj nn',
'nn nns',
'to vb',
'jj nns',
'jj to',
'nn in',
'nns in',
'in nn',
'dt nn',
'query']]

In [274]:
allSessionsbiLanding.shape


(1505, 11)

In [275]:
allSessionsbiLanding.head(2)

Unnamed: 0,nn nn,jj nn,nn nns,to vb,jj nns,jj to,nn in,nns in,in nn,dt nn,query
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,becoming a fireman
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,hotel in Pocono Mountains


In [276]:
allSessionstriLanding = allSessionstri[[
'jj nn nn',
'nn nn nn',
# 'jj to vb', #-- this was considered by MG, but it does not appear in our triagrams
'nn nn nns',
'to vb nn',
'query']]

In [277]:
allSessionstriLanding.shape


(1505, 5)

In [278]:
allSessionstriLanding.head(2)

Unnamed: 0,jj nn nn,nn nn nn,nn nn nns,to vb nn,query
0,0.0,0.0,0.0,0.0,becoming a fireman
1,0.0,0.0,0.0,0.0,hotel in Pocono Mountains


In [279]:
#-- We fill the nan with 0 before merging

allSessionsuniLanding = allSessionsuniLanding.fillna(0)
allSessionsbiLanding = allSessionsbiLanding.fillna(0)
allSessionstriLanding = allSessionstriLanding.fillna(0)


In [280]:
allSessionsuniLanding.columns

Index(['query', 'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
       'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp', 'rb', 'rbr', 'rbs',
       'rp', 'to', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp',
       'wrb'],
      dtype='object')

In [281]:
# -- E - Verified 

In [282]:
# -- Confirm their shape

allSessionsuniLanding.shape, allSessionsbiLanding.shape, allSessionstriLanding.shape

((1505, 32), (1505, 11), (1505, 5))

In [283]:
# -- Merge 

posFeat = pd.merge(allSessionsuniLanding, allSessionsbiLanding, left_index=True, right_index=True)
posFeat.drop(columns = ['query_y'], inplace = True)
posFeat.rename(columns = {'query_x':'query'}, inplace = True)

In [284]:
# synFeats = posFeat

# listCols = list(synFeats.columns)
# listCols.pop(0) #removes 'query' from the list of columns

# synFeats['length'] = synFeats['query'].str.split().str.len() # count number of words in each query

# for col in listCols:
#     synFeats[col] = synFeats[col]/synFeats['length'] # calculate the ratio of grams (each colums) per 

In [285]:
posFeat.columns, posFeat.shape

(Index(['query', 'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
        'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp', 'rb', 'rbr', 'rbs',
        'rp', 'to', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
        'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns', 'jj to', 'nn in',
        'nns in', 'in nn', 'dt nn'],
       dtype='object'),
 (1505, 42))

In [286]:
# -- Add triagrams

posFeat = pd.merge(posFeat, allSessionstriLanding, left_index=True, right_index=True)
posFeat.drop(columns = ['query_y'], inplace = True)
posFeat.rename(columns = {'query_x':'query'}, inplace = True)

In [287]:
posFeat.columns, synFeats.shape

(Index(['query', 'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
        'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp', 'rb', 'rbr', 'rbs',
        'rp', 'to', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
        'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns', 'jj to', 'nn in',
        'nns in', 'in nn', 'dt nn', 'jj nn nn', 'nn nn nn', 'nn nn nns',
        'to vb nn'],
       dtype='object'),
 (1505, 42))

In [288]:
posFeat

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,jj nns,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn
0,becoming a fireman,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,hotel in Pocono Mountains,0,0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wedding traditions buddhism,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,diversification in hiring,0,0,0,0,0,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,traiditional swahili recipes,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0,0,2,0,0,1,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501,What is a fox's favorite kind of food?,0,0,1,0,0,1,1,0,0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1502,"Show me the movie called ""The Martian""",0,0,2,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1503,What is the biggest rock found on Mars?,0,0,1,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [289]:
# synFeats = allSessionsuni.merge(allSessionsbiLanding)
# synFeats = synFeats.merge(allSessionstriLanding)
# synFeats = synFeats.fillna(0)

# listCols = list(synFeats.columns)
# listCols.pop(0) ##removes 'query' from the list of columns

# synFeats['length'] = synFeats['query'].str.split().str.len()


# for col in listCols:
#     synFeats[col] = synFeats[col]/synFeats['length']
    

In [290]:
synFeats = posFeat

listCols = list(synFeats.columns)
listCols.pop(0) #removes 'query' from the list of columns

synFeats['length'] = synFeats['query'].str.split().str.len() # count number of words in each query

for col in listCols:
    synFeats[col] = synFeats[col]/synFeats['length'] # calculate the ratio of grams (each colums) per 

In [291]:
synFeats.shape

(1505, 47)

In [292]:
# -- remove 'length column'
synFeats.drop(columns = ['length'], inplace = True)

In [293]:
synFeats

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,jj nns,jj to,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,nn nn nns,to vb nn
0,becoming a fireman,0.0,0.0,0.333333,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.333333,0.0,0.0,0.0,0.0
1,hotel in Pocono Mountains,0.0,0.0,0.000000,0.0,0.0,0.250000,0.000000,0.0,0.000,...,0.0,0.0,0.250000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
2,wedding traditions buddhism,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
3,diversification in hiring,0.0,0.0,0.000000,0.0,0.0,0.333333,0.000000,0.0,0.000,...,0.0,0.0,0.333333,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
4,traiditional swahili recipes,0.0,0.0,0.000000,0.0,0.0,0.000000,0.333333,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0.0,0.0,0.181818,0.0,0.0,0.090909,0.090909,0.0,0.000,...,0.0,0.0,0.090909,0.0,0.000,0.000000,0.0,0.0,0.0,0.0
1501,What is a fox's favorite kind of food?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.125000,0.0,0.000,...,0.0,0.0,0.125000,0.0,0.125,0.125000,0.0,0.0,0.0,0.0
1502,"Show me the movie called ""The Martian""",0.0,0.0,0.285714,0.0,0.0,0.000000,0.142857,0.0,0.000,...,0.0,0.0,0.000000,0.0,0.000,0.142857,0.0,0.0,0.0,0.0
1503,What is the biggest rock found on Mars?,0.0,0.0,0.125000,0.0,0.0,0.125000,0.000000,0.0,0.125,...,0.0,0.0,0.000000,0.0,0.000,0.000000,0.0,0.0,0.0,0.0


In [294]:
synFeats.columns

Index(['query', 'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
       'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp', 'rb', 'rbr', 'rbs',
       'rp', 'to', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
       'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns', 'jj to', 'nn in',
       'nns in', 'in nn', 'dt nn', 'jj nn nn', 'nn nn nn', 'nn nn nns',
       'to vb nn'],
      dtype='object')

In [295]:
synFeats.shape

(1505, 46)

# Return Feature Set

Combines all data frames into one, preprocesses out extraneous information and returns the cleaned data frame.

In [296]:
pickle.dump(synFeats, open( "Pickles/SynFeat.p", "wb" ) )

In [297]:
print('done')

done


In [214]:
(71155 + 39629 + 82066 + 42377)/62002

3.7938614883390858

In [215]:
(58624 + 33459 + 66489 + 26315)/50835 

3.637002065506049

My notes:

    - 'jj to' is not found in svenSQS

In [216]:
# Expected shape: (1505, 43 or 44)