The following code in this notebook extracts vocabulary features from each query and then returns a data frame of those extracted features. Only **SQS dataset** used

In [218]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

# Load Libraries

The following block of code loads all libraries needed for this notebook. Numpy has an established to ensure that the random selection of queries drawn to establish certain features, such as top word n-grams; is consistent across this code and future execution.

In [219]:
import csv
import nltk
import pickle
import re
import string
import warnings
import random
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

np.random.seed(20200522)

stopwords = list(stopwords.words('english'))

# Functions for Vocabulary Features

Features used in the following code.

In [220]:
# Converts a list into a dictionary.
# param lst: is the list that is convert into dictionary
# returns resDct: the converted list

def convert(lst):
    
    resDct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    
    return resDct

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [221]:
# allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/casttrecSQS.p", "rb" ) )

allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
allQueries = allSessionsSQS['query'].tolist()
qID = allSessionsSQS['sID'].tolist()

In [222]:
allSessionsSQS

Unnamed: 0,query,class,sID
0,US civil war causes,0,6352
1,scooter brands,0,8305
2,scooter brands reliable,0,6814
3,scooter,0,7688
4,scooter cheap,0,6221
...,...,...,...
4741,House of dreams,1,5975
4742,When did Desmond doss get married,1,5233
4743,H,1,7864
4744,find fact about dog,1,5316


In [223]:
allSessionsSQS['sID'].is_unique

True

In [224]:
# # -- add a column of randomly selected unique values to represent qID that can be used to represent sID
# n = len(allSessionsSQS)
# allSessionsSQS['sID'] = random.sample(range(1,n+1),n)
# allQueries = allSessionsSQS['query'].tolist()
# qID = allSessionsSQS['sID'].tolist()

In [225]:
len(allSessionsSQS)

4746

In [226]:
allSessionsSQS

Unnamed: 0,query,class,sID
0,US civil war causes,0,6352
1,scooter brands,0,8305
2,scooter brands reliable,0,6814
3,scooter,0,7688
4,scooter cheap,0,6221
...,...,...,...
4741,House of dreams,1,5975
4742,When did Desmond doss get married,1,5233
4743,H,1,7864
4744,find fact about dog,1,5316


In [227]:
# -- S

In [228]:
# ss = np.random.randint(1, 1000, size=1000)
# ss = pd.DataFrame(ss, columns=['random_numbers'])
# ss

In [229]:
# len(ss['random_numbers'].unique())

In [230]:
# ss['rn'] = random.sample(range(1,1001),1000)
# ss

In [231]:
# len(ss['rn'].unique())

In [232]:
# ss['random_numbers'].unique()

In [233]:

# random.sample(range(1, 11), 10)

In [234]:
# -- E

In [235]:
allSessionsSQS['query'].is_unique

False

In [236]:
len(allQueries)

4746

# Core Vocab

Loads all vocabulary expected to be learned between Kindergarten to Seventh grade based on Common Core Curriculum, before extracting the ratio of words in each query that are, and are not; found in this list.

In [237]:
kd = ['a', 'all', 'am', 'an', 'and', 'are', 'as', 'at', 'away', 'back', 'ball', 'bell', 'big', 'bird', 'blue', 'book', 'boot', 'box', 'boy', 'brown', 'but', 'by', 'can', 'car', 'cat', 'come', 'cow', 'day', 'do', 'dog', 'down', 'end', 'fall', 'fan', 'fish', 'fly', 'food', 'for', 'from', 'fun', 'get', 'go', 'good', 'gray', 'green', 'groundhog', 'hat', 'he', 'here', 'hill', 'I', 'in', 'into', 'is', 'it', 'inside', 'kitten', 'little', 'look', 'mad', 'me', 'mud', 'my', 'name', 'no', 'not', 'of', 'on', 'orange', 'out', 'paint', 'pet', 'pin', 'play', 'put', 'rain', 'red', 'run', 'sad', 'say', 'see', 'she', 'sing', 'sit', 'so', 'stay', 'stop', 'story', 'sun', 'take', 'that', 'the', 'them', 'then', 'there', 'they', 'this', 'to', 'too', 'up', 'we', 'wet', 'what', 'where', 'who', 'will', 'with', 'work', 'yellow', 'yes', 'you', 'zoo', 'orange', 'white', 'black', 'monday', 'tuesday', 'wednesday','thursday','friday', 'saturday','sunday']
oned = ['a', 'all', 'am', 'and', 'at', 'ball', 'be', 'bed', 'big', 'book', 'box', 'boy', 'but', 'came', 'can', 'car', 'cat', 'come', 'cow', 'dad', 'day', 'did', 'do', 'dog', 'fat', 'for', 'fun', 'get', 'go', 'good', 'got', 'had', 'hat', 'he', 'hen', 'here', 'him', 'his', 'home', 'hot', 'I', 'if', 'in', 'into', 'is', 'it', 'its', 'let', 'like', 'look', 'man', 'may', 'me', 'mom', 'my', 'no', 'not', 'of', 'oh', 'old', 'on', 'one', 'out', 'pan', 'pet', 'pig', 'play', 'ran', 'rat', 'red', 'ride', 'run', 'sat', 'see', 'she', 'sit', 'six', 'so', 'stop', 'sun', 'ten', 'the', 'this', 'to', 'top', 'toy', 'two', 'up', 'us', 'was', 'we', 'will', 'yes', 'you' ]
twod = ['about', 'add', 'after', 'ago', 'an ', 'any', 'apple', 'are ', 'as', 'ask', 'ate', 'away', 'baby ', 'back', 'bad', 'bag', 'base', 'bat', 'bee', 'been', 'before', 'being', 'best', 'bike', 'bill', 'bird', 'black', 'blue', 'boat', 'both', 'bring', 'brother ', 'brown', 'bus', 'buy ', 'by', 'cake', 'call', 'candy', 'change', 'child', 'city', 'clean', 'club', 'coat', 'cold', 'coming ', 'corn', 'could', 'cry', 'cup', 'cut', 'daddy ', 'dear', 'deep', 'deer', 'doing', 'doll', 'door', 'down ', 'dress', 'drive', 'drop', 'dry', 'duck', 'each', 'eat', 'eating', 'egg', 'end', 'fall', 'far', 'farm', 'fast', 'father ', 'feed', 'feel', 'feet', 'fell ', 'find', 'fine ', 'fire', 'first ', 'fish', 'five', 'fix', 'flag', 'floor', 'fly', 'food', 'foot', 'four', 'fox', 'from ', 'full', 'funny', 'game', 'gas', 'gave', 'girl', 'give', 'glad', 'goat', 'goes ', 'going ', 'gold', 'gone', 'grade ', 'grass', 'green', 'grow', 'hand', 'happy', 'hard', 'has ', 'have ', 'hear ', 'help', 'here ', 'hill', 'hit', 'hold', 'hole', 'hop', 'hope ', 'horse', 'house ', 'how ', 'ice', 'inch', 'inside ', 'job', 'jump', 'just ', 'keep', 'king', 'know ', 'lake', 'land', 'last', 'late', 'lay', 'left', 'leg', 'light', 'line', 'little ', 'live', 'lives', 'long', 'looking', 'lost', 'lot', 'love', 'mad', 'made ', 'make ', 'many ', 'meat', 'men', 'met', 'mile', 'milk', 'mine', 'miss', 'moon', 'more', 'most', 'mother ', 'move', 'much ', 'must', 'myself ', 'nail', 'name ', 'need', 'new ', 'next', 'nice ', 'night', 'nine', 'north', 'now ', 'nut', 'off ', 'only', 'open', 'or ', 'other', 'our', 'outside ', 'over', 'page', 'park', 'part', 'pay', 'pick', 'plant', 'playing', 'pony', 'post', 'pull', 'put', 'rabbit', 'rain', 'read', 'rest', 'riding', 'road', 'rock', 'room', 'said ', 'same', 'sang', 'saw ', 'say', 'school ', 'sea', 'seat', 'seem', 'seen', 'send', 'set', 'seven', 'sheep', 'ship', 'shoe', 'show ', 'sick', 'side', 'sing', 'sky', 'sleep', 'small', 'snow', 'some ', 'soon ', 'spell', 'start', 'stay', 'still', 'store ', 'story', 'take', 'talk', 'tall', 'teach', 'tell', 'than ', 'thank', 'that', 'them ', 'then ', 'there ', 'they ', 'thing', 'think ', 'three', 'time ', 'today ', 'told', 'too ', 'took', 'train ', 'tree', 'truck', 'try', 'use', 'very ', 'walk', 'want ', 'warm', 'wash', 'way', 'week', 'well ', 'went ', 'were ', 'wet', 'what', 'when ', 'while ', 'white', 'who', 'why', 'wind', 'wish', 'with ', 'woke', 'wood', 'work', 'yellow', 'yet', 'your', 'zoo']
threed = ['able', 'above', 'afraid', 'afternoon', 'again', 'age', 'air', 'airplane', 'almost', 'alone', 'along', 'already', 'also', 'always', 'animal', 'another', 'anything', 'around', 'art', 'aunt', 'balloon', 'bark', 'barn', 'basket', 'beach', 'bear', 'because', 'become', 'began', 'begin', 'behind', 'believe', 'below', 'belt', 'better', 'birthday', 'body', 'bones', 'born', 'bought', 'bread', 'bright', 'broke', 'brought', 'busy', 'cabin', 'cage', 'camp', 'can\'t', 'care', 'carry', 'catch', 'cattle', 'cave', 'children', 'class', 'close', 'cloth', 'coal', 'color', 'corner', 'cotton', 'cover', 'dark', 'desert', 'didn\'t', 'dinner', 'dishes', 'does', 'done', 'don\'t', 'dragon', 'draw', 'dream', 'drink', 'early', 'earth', 'east', 'eight', 'even', 'ever', 'every', 'everyone', 'everything', 'eyes', 'face', 'family', 'feeling', 'felt', 'few', 'fight', 'fishing', 'flower', 'flying', 'follow', 'forest', 'forgot', 'form', 'found', 'fourth', 'free', 'Friday', 'friend', 'front', 'getting', 'given', 'grandmother', 'great', 'grew', 'ground', 'guess', 'hair', 'half', 'having', 'head', 'heard', 'he\'s', 'heat', 'hello', 'high', 'himself', 'hour', 'hundred', 'hurry', 'hurt', 'I\'d', 'I\'ll', 'I\'m', 'inches', 'isn\'t', 'it\'s', 'I\'ve', 'kept', 'kids', 'kind', 'kitten', 'knew', 'knife', 'lady', 'large', 'largest', 'later', 'learn', 'leave', 'let\'s', 'letter', 'life', 'list', 'living', 'lovely', 'loving', 'lunch', 'mail', 'making', 'maybe', 'mean', 'merry', 'might', 'mind', 'money', 'month', 'morning', 'mouse', 'mouth', 'Mr.', 'Mrs.', 'Ms.', 'music', 'near', 'nearly', 'never', 'news', 'noise', 'nothing', 'number', 'o\'clock', 'often', 'oil', 'once', 'orange', 'order', 'own', 'pair', 'paint', 'paper', 'party', 'pass', 'past', 'penny', 'people', 'person', 'picture', 'place', 'plan', 'plane', 'please', 'pocket', 'point', 'poor', 'race', 'reach', 'reading', 'ready', 'real', 'rich', 'right', 'river', 'rocket', 'rode', 'round', 'rule', 'running', 'salt', 'says', 'sending', 'sent', 'seventh', 'sew', 'shall', 'short', 'shot', 'should', 'sight', 'sister', 'sitting', 'sixth', 'sled', 'smoke', 'soap', 'someone', 'something', 'sometime', 'song', 'sorry', 'sound', 'south', 'space', 'spelling', 'spent', 'sport', 'spring', 'stairs', 'stand', 'state', 'step', 'stick', 'stood', 'stopped', 'stove', 'street', 'strong', 'study', 'such', 'sugar', 'summer', 'Sunday', 'supper', 'table', 'taken', 'taking', 'talking', 'teacher', 'team', 'teeth', 'tenth', 'that\'s', 'their', 'these', 'thinking', 'third', 'those', 'thought', 'throw', 'tonight', 'trade', 'trick', 'trip', 'trying', 'turn', 'twelve', 'twenty', 'uncle', 'under', 'upon', 'wagon', 'wait', 'walking', 'wasn\'t', 'watch', 'water', 'weather', 'we\'re', 'west', 'wheat', 'where', 'which', 'wife', 'wild', 'win', 'window', 'winter', 'without', 'woman', 'won', 'won\'t', 'wool', 'word', 'working', 'world', 'would', 'write', 'wrong', 'yard', 'year', 'yesterday', 'you\'re'  ]
fourd = ['across', 'against', 'answer', 'awhile', 'between', 'board', 'bottom', 'breakfast', 'broken', 'build', 'building', 'built', 'captain', 'carried', 'caught', 'charge', 'chicken', 'circus', 'cities', 'clothes', 'company', 'couldn\'t', 'country', 'discover', 'doctor', 'doesn\'t', 'dollar', 'during', 'eighth', 'else', 'enjoy', 'enough', 'everybody', 'example', 'except', 'excuse', 'field', 'fifth', 'finish', 'following', 'good-by', 'group', 'happened', 'harden', 'haven\'t', 'heavy', 'held', 'hospital', 'idea', 'instead', 'known', 'laugh', 'middle', 'minute', 'mountain', 'ninth', 'ocean', 'office', 'parent', 'peanut', 'pencil', 'picnic', 'police', 'pretty', 'prize', 'quite', 'radio', 'raise', 'really', 'reason', 'remember', 'return', 'Saturday', 'scare', 'second', 'since', 'slowly', 'stories', 'student', 'sudden', 'suit', 'sure', 'swimming', 'though', 'threw', 'tired', 'together', 'tomorrow', 'toward', 'tried', 'trouble', 'truly', 'turtle', 'until', 'village', 'visit', 'wear', 'we\'ll', 'whole', 'whose', 'women', 'wouldn\'t', 'writing', 'written', 'wrote', 'yell', 'young']
fived = ['although', 'America', 'among', 'arrive', 'attention', 'beautiful', 'countries', 'course', 'cousin', 'decide', 'different', 'evening', 'favorite', 'finally', 'future', 'happiest', 'happiness', 'important', 'interest', 'piece', 'planet', 'present', 'president', 'principal', 'probably', 'problem', 'receive', 'sentence', 'several', 'special', 'suddenly', 'suppose', 'surely', 'surprise', 'they\'re', 'through', 'usually', 'action', 'actor', 'actually', 'addition', 'agreed', 'allowed', 'aloud', 'amendment', 'amount', 'amusement', 'annual', 'appointed', 'arrange', 'attention', 'awhile', 'beginning', 'bruise', 'business', 'calves', 'capital', 'capitol', 'captain', 'carefully', 'caught', 'cause', 'celebrate', 'century', 'chemical', 'chocolate', 'circle', 'climate', 'climbed', 'collar', 'column', 'company', 'condition', 'consider', 'consonant', 'constant', 'continent', 'continued', 'country', 'course', 'crystal', 'current', 'curtain', 'daughter', 'daytime', 'decided', 'decimal', 'delicious', 'desert', 'dessert', 'details', 'determine', 'dictionary', 'difference', 'different', 'difficult', 'direction', 'disappoint', 'division', 'eighth', 'election', 'elements', 'energy', 'enjoyment', 'equal', 'equation', 'errands', 'exact', 'except', 'expect', 'explain', 'explode', 'express', 'factory', 'fault', 'favorite', 'finally', 'finished', 'forward', 'fought', 'fraction', 'furniture', 'future', 'general', 'government', 'graceful', 'graph', 'grasp', 'grease', 'grown-ups', 'guest', 'guide', 'happened', 'happily', 'harvest', 'healthy', 'height', 'hoarse', 'human', 'idea', 'imagine', 'include', 'increase', 'indicate', 'information', 'instrument', 'intention', 'interesting', 'inventor', 'island', 'jewel', 'journey', 'jungle', 'knives', 'known', 'language', 'laughter', 'length', 'limb', 'located', 'lumber', 'major', 'mammal', 'manufacture', 'material', 'mayor', 'measure', 'melody', 'members', 'memories', 'message', 'method', 'million', 'minor', 'modern', 'mountain', 'music', 'natural', 'necessary', 'neither', 'newspaper', 'northern', 'notebook', 'notice', 'noun', 'numeral', 'object', 'observe', 'opposite', 'orphan', 'ought', 'outside', 'oxygen', 'paid', 'paint', 'paragraph', 'pattern', 'pause', 'payment', 'perhaps', 'period', 'permit', 'phone', 'phrase', 'pleasant', 'pleasure', 'plural', 'poison', 'position', 'possible', 'practice', 'prepared', 'president', 'probably', 'problem', 'process', 'produce', 'program', 'promise', 'property', 'protection', 'provide', 'puzzle', 'quickly', 'quietly', 'radio', 'raise', 'rarely', 'rather', 'reached', 'receive', 'record', 'region', 'relax', 'remain', 'remove', 'repay', 'repeat', 'report', 'represent', 'respond', 'result', 'rhythm', 'rising', 'ruin', 'salad', 'sandal', 'scale', 'scent', 'schedule', 'science', 'section', 'separate', 'service', 'settled', 'several', 'shadow', 'shelter', 'shoulder', 'shouted', 'shower', 'signal', 'similar', 'sincerely', 'single', 'size', 'slippery', 'soar', 'soil', 'solution', 'solve', 'southern', 'split', 'spoiled', 'sports', 'square', 'squeeze', 'stain', 'state', 'statement', 'station', 'steer', 'stomach', 'stopping', 'straight', 'straighten', 'stream', 'stretched', 'suggest', 'suitcase', 'sunset', 'supply', 'sure', 'surface', 'surprise', 'surround', 'sweater', 'syllable', 'syrup', 'tablet', 'tasty', 'teaspoon', 'terrible', 'though', 'thoughtful', 'thrown', 'tornado', 'toward', 'traffic', 'trail', 'treasure', 'treatment', 'triangle', 'trouble', 'tunnel', 'type', 'understood', 'unknown', 'usually', 'value', 'various', 'warn', 'weigh', 'weight', 'weird', 'western', 'whisper', 'whoever', 'whole', 'whose', 'wives', 'women', 'wonderful', 'wound', 'wreck', 'x-ray', 'yesterday']
sixd = ['Abandon', 'abundant', 'access', 'accommodate', 'accumulate', 'adapt', 'adhere', 'agony', 'allegiance', 'ambition', 'ample', 'anguish', 'anticipate', 'anxious', 'apparel', 'appeal', 'apprehensive', 'arid', 'arrogant', 'awe', 'Barren', 'beacon', 'beneficial', 'blunder', 'boisterous', 'boycott', 'burden', 'Campaign', 'capacity', 'capital', 'chronological', 'civic', 'clarity', 'collaborate', 'collide', 'commend', 'commentary', 'compact', 'composure', 'concise', 'consent', 'consequence', 'conserve', 'conspicuous', 'constant', 'contaminate', 'context', 'continuous', 'controversy', 'convenient', 'cope', 'cordial', 'cultivate', 'cumulative', '', 'Declare', 'deluge', 'dense', 'deplete', 'deposit', 'designate', 'desperate', 'deteriorate', 'dialogue', 'diligent', 'diminish', 'discretion', 'dissent', 'dissolve', 'distinct', 'diversity', 'domestic', 'dominate', 'drastic', 'duration', 'dwell', 'Eclipse', 'economy', 'eerie', 'effect', 'efficient', 'elaborate', 'eligible', 'elude', 'encounter', 'equivalent', 'erupt', 'esteem', 'evolve', 'exaggerate', 'excel', 'exclude', 'expanse', 'exploit', 'extinct', 'extract', 'Factor', 'former', 'formulates', 'fuse', 'futile', 'Generate', 'genre', 'Habitat', 'hazardous', 'hoax', 'hostile', 'Idiom', 'ignite', 'immense', 'improvises', 'inept', 'inevitable', 'influence', 'ingenious', 'innovation', 'intimidate', 'Jovial', 'Knack', 'Leeway', 'legislation', 'leisure', 'liberate', 'likeness', 'linger', 'literal', 'loathe', 'lure', 'Majority', 'makeshift', 'manipulate', 'marvel', 'massive', 'maximum', 'meager', 'mere', 'migration', 'mimic', 'minute', 'monotonous', 'Negotiate', 'Objective', 'obstacle', 'omniscient', 'onset', 'optimist', 'originate', 'Painstaking', 'paraphrase', 'parody', 'persecute', 'plummet', 'possess', 'poverty', 'precise', 'predicament', 'predict', 'prejudice', 'preliminary', 'primitive', 'priority', 'prominent', 'propel', 'prosecute', 'prosper', 'provoke', 'pursue', 'Quest', 'Recount', 'refuge', 'reinforce', 'reluctant', 'remorse', 'remote', 'resolute', 'restrain', 'retaliate', 'retrieve', 'rigorous', 'rural', 'Salvage', 'sanctuary', 'siege', 'significant', 'solar', 'soothe', 'stationary', 'stifle', 'strive', 'subordinate', 'subsequent', 'superior', 'supplement', 'swarm', 'Tangible', 'terminate', 'terrain', 'trait', 'transform', 'transport', 'treacherous', 'Unanimous', 'unique', 'unruly', 'urban', 'Vacate', 'verdict', 'verge', 'vibrant', 'vital', 'vow', 'accept', 'accidentally', 'acquire', 'ambulance', 'ancient', 'appearance', 'appointment', 'arithmetic', 'audience', 'autumn', 'beautifully', 'beliefs', 'blown', 'bough', 'bows', 'calendar', 'canyon', 'capable', 'capacity', 'caution', 'ceiling', 'champion', 'choir', 'cleanse', 'combination', 'comfortable', 'community', 'complain', 'concentration', 'concern', 'connection', 'constitution', 'contagious', 'conversation', 'cooperation', 'correct', 'coupon', 'creative', 'creature', 'crisis', 'culture', 'curious', 'dangerous', 'decision', 'demonstrate', 'denominator', 'department', 'departure', 'depth', 'descendant', 'disagreement', 'disastrous', 'discussion', 'distance', 'distributed', 'earliest', 'echoes', 'edition', 'educate', 'electricity', 'element', 'elevator', 'emergency', 'employer', 'emptiness', 'encouragement', 'encyclopedia', 'entire', 'entrance', 'envelope', 'equator', 'especially', 'establish', 'example', 'excellent', 'excitement', 'exercise', 'experience', 'exterior', 'familiar', 'faucet', 'fierce', 'fireproof', 'following', 'forgetting', 'forgiveness', 'fossil', 'freight', 'frighten', 'fuel', 'further', 'gallon', 'gaze', 'gesture', 'governor', 'graduation', 'grateful', 'grief', 'halves', 'hamburger', 'hangar', 'hanger', 'happiness', 'headache', 'heroes', 'history', 'honorable', 'horizon', 'hunger', 'hyphen', 'ignore', 'imagination', 'immediate', 'importance', 'improvement', 'independence', 'ingredient', 'injury', 'inquire', 'instead', 'instruction', 'intermission', 'interview', 'invisible', 'invitation', 'involve', 'jealous', 'junior', 'knowledge', 'lawyer', 'league', 'legal', 'liberty', 'liquid', 'listening', 'loaves', 'location', 'luggage', 'manager', 'manner', 'manor', 'marriage', 'meant', 'mechanic', 'medicine', 'mention', 'minus', 'minute', 'mistaken', 'misunderstand', 'mixture', 'mourn', 'multiple', 'muscle', 'museum', 'musician', 'mute', 'myth', 'nationality', 'negative', 'noisy', 'noticeable', 'novel', 'numerator', 'obtain', 'occur', 'official', 'operate', 'original', 'outline', 'partial', 'passenger', 'patient', 'penalty', 'penguin', 'percent', 'performance', 'personal', 'persuade', 'physical', 'piano', 'plumber', 'poem', 'poet', 'policy', 'pollute', 'pollution', 'positive', 'potatoes', 'predict', 'prefer', 'pressure', 'prevent', 'principal', 'private', 'project', 'pumpkins', 'purchase', 'purse', 'quote', 'radius', 'rapid', 'ratio', 'realize', 'recently', 'recycle', 'reduce', 'referred', 'regardless', 'regular', 'rehearse', 'relief', 'relieve', 'remarkable', 'remind', 'remote', 'replacement', 'replied', 'reply', 'requirement', 'rescue', 'resident', 'resources', 'respectful', 'review', 'roam', 'routine', 'rumor', 'rural', 'safety', 'sailor', 'salute', 'satisfy', 'scarcely', 'scientific', 'scissors', 'selection', 'senior', 'sentence', 'separately', 'serious', 'session', 'shampoo', 'shelves', 'shorten', 'silent', 'simply', 'sketch', 'skillful', 'solar', 'sought', 'spaghetti', 'sponge', 'squawk', 'storage', 'strain', 'strategy', 'strength', 'strive', 'struggle', 'studios', 'success', 'suggestion', 'support', 'surrounded', 'sword', 'system', 'telephone', 'television', 'temperature', 'theme', 'themselves', 'therefore', 'thicken', 'thousand', 'threat', 'tomatoes', 'trophies', 'tutor', 'unbelievable', 'underneath', 'unite', 'vacuum', 'vain', 'variety', 'vary', 'vault', 'vegetable', 'vein', 'violence', 'visible', 'vision', 'waste', 'who\'s', 'whose', 'wrestle', 'wrinkle', 'yield']
# sevend = ['abbreviation', 'absence', 'absolutely', 'absorb', 'abundant', 'accessible', 'accompanied', 'accomplishment', 'accurate', 'achievement', 'acres', 'adequate', 'adjustable', 'admit', 'admittance', 'advice', 'advise', 'afghan', 'alternate', 'alternative', 'amusement', 'analysis', 'analyze', 'ancestor', 'anniversary', 'appreciate', 'artificial', 'assistance', 'association', 'athlete', 'atmosphere', 'attendance', 'authority', 'bacteria', 'bagel', 'baggage', 'benefited', 'benefiting', 'bicycle', 'biscuit', 'bizarre', 'boulevard', 'boundary', 'bouquet', 'brilliant', 'brochure', 'bulletin', 'bureau', 'campaign', 'cancellation', 'candidate', 'capable', 'capital', 'capitol', 'category', 'celery', 'cemetery', 'changeable', 'chaperone', 'character', 'cinnamon', 'civilize', 'commercial', 'committed', 'committee', 'commotion', 'companion', 'competent', 'competition', 'complement', 'complex', 'compliment', 'compressor', 'concentrate', 'concentration', 'conductor', 'confetti', 'congratulations', 'consequently', 'controlling', 'cringe', 'culminate', 'culprit', 'deceive', 'delayed', 'democracy', 'deodorant', 'descendent', 'description', 'diameter', 'diamond', 'discourage', 'disgraceful', 'dismissal', 'distinguished', 'dreadful', 'economics', 'economy', 'elementary', 'embarrass', 'emotion', 'emphasize', 'encircle', 'enclosing', 'encounter', 'endurance', 'engineer', 'environment', 'episode', 'erosion', 'eruption', 'evident', 'exchange', 'executive', 'exhibit', 'expensive', 'extinct', 'extinguish', 'extraordinary', 'extremely', 'fabricate', 'failure', 'fascinating', 'fatigue', 'flagrant', 'foreign', 'forfeit', 'frequently', 'fundamental', 'genuine', 'ghetto', 'gossiping', 'gradual', 'graffiti', 'grammar', 'grievance', 'guarantee', 'harass', 'havoc', 'heroic', 'hesitate', 'horrify', 'hospital', 'humid', 'humility', 'hygiene', 'identical', 'idle', 'idol', 'illegal', 'illustration', 'imaginary', 'immediately', 'immobilize', 'impossibility', 'inconvenient', 'incredible', 'individual', 'infamous', 'influence', 'informant', 'inhabit', 'inherit', 'innocence', 'innocent', 'instructor', 'intelligent', 'interruption', 'introduction', 'involvement', 'irate', 'irresistible', 'jealousy', 'judgment', 'juvenile', 'kettle', 'knitting', 'laboratory', 'language', 'legibly', 'liquidation', 'management', 'maneuver', 'media', 'mileage', 'miniature', 'misbehaved', 'morale', 'mortgage', 'movement', 'murmur', 'musician', 'mysterious', 'negotiate', 'nervous', 'nuisance', 'nurture', 'oases', 'oasis', 'obedient', 'obstacle', 'obviously', 'occasion', 'ordinarily', 'ordinary', 'organization', 'pamphlet', 'panic', 'panicked', 'panicky', 'parallel', 'paralysis', 'paralyze', 'penicillin', 'pedestrian', 'phantom', 'pheasant', 'phrase', 'politely', 'popular', 'precipitation', 'principal', 'principle', 'privilege', 'procedure', 'pronunciation', 'psychology', 'puny', 'qualified', 'qualifying', 'quotation', 'raspberry', 'reasonable', 'receipt', 'receiving', 'recipe', 'recognition', 'recommend', 'recruit', 'reddest', 'reprimand', 'resigned', 'restaurant', 'rotten', 'sandwich', 'scarcity', 'scenery', 'secretary', 'securing', 'significance', 'simile', 'sincerely', 'sincerity', 'situation', 'skeptical', 'slumber', 'smudge', 'solemn', 'souvenir', 'spacious', 'specific', 'stationary', 'stationery', 'statistics', 'subscription', 'substitute', 'superintendent', 'supervisor', 'supposedly', 'threatening', 'tolerate', 'tongue', 'tournament', 'tragedy', 'traitor', 'transferred', 'transferring', 'transmitted', 'traveled', 'traveling', 'unfortunately', 'uniform', 'university', 'unnecessary', 'valuable', 'various', 'vehicle', 'version', 'vertical', 'victim', 'vigorously', 'violation', 'visualize', 'volcano', 'voyage', 'wealthy', 'weapon', 'wheeze', 'wilderness', 'Abate', 'abnormal', 'abode', 'abrupt', 'accelerate', 'acclaim', 'acknowledge', 'acquire', 'aspire', 'acrid', 'addict', 'adjacent', 'admonish', 'affliction', 'agitate', 'ajar', 'akin', 'allege', 'annihilate', 'anonymous', 'antagonize', 'apathy', 'arbitrate', 'astute', 'authentic', 'avert', 'Bellow', 'beseech', 'bestow', 'bewilder', 'bigot', 'blatant', 'bleak', 'braggart', 'brawl', 'browse', 'bystander', 'Candid', 'canine', 'canny', 'capricious', 'capsize', 'casual', 'casualty', 'catastrophe', 'cater', 'chorus', 'citrus', 'clamber', 'climax', 'compromise', 'concur', 'confront', 'congested', 'conjure', 'consult', 'corrupt', 'counterfeit', 'covet', 'customary', 'Debut', 'deceased', 'dependent', 'despondent', 'detach', 'devour', 'dishearten', 'dismal', 'dismantle', 'distraught', 'docile', 'downright', 'drone', 'dumbfound', 'Emblem', 'endure', 'ensue', 'enthrall', 'epidemic', 'erode', 'exuberant', 'Fathom', 'feud', 'figment', 'firebrand', 'flabbergast', 'flagrant', 'flaw', 'fruitless', 'Gaudy', 'geography', 'gratify', 'gravity', 'grim', 'grimy', 'grueling', 'gruesome', 'Haggle', 'headlong', 'hilarious', 'homage', 'homicide', 'hospitable', 'hurtle', 'hybrid', 'Illiterate', 'impede', 'implore', 'incident', 'incredulous', 'infamous', 'infuriate', 'insinuate', 'intensified', 'inundate', 'irate', 'Lavish', 'legacy', 'legitimate', 'lethal', 'loath', 'lurk', 'Magnetic', 'mirth', 'quench', 'magnitude', 'maternal', 'maul', 'melancholy', 'mellow', 'momentum', 'mortify', 'mull', 'murky', 'Narrative', 'negligent', 'nimble', 'nomadic', 'noteworthy', 'notify', 'notorious', 'nurture', 'Obnoxious', 'oration', 'orthodox', 'overwhelm', 'Pamper', 'patronize', 'peevish', 'pelt', 'pending', 'perceived', 'perjury', 'permanent', 'persist', 'perturb', 'pique', 'pluck', 'poised', 'ponder', 'potential', 'predatory', 'presume', 'preview', 'prior', 'prowess', 'Radiant', 'random', 'rant', 'recede', 'reprimand', 'resume', 'retort', 'robust', 'rupture', 'Saga', 'sequel', 'sham', 'shirk', 'simultaneously', 'snare', 'species', 'status', 'stodgy', 'substantial', 'subtle', 'sullen', 'supervise', 'Tamper', 'throb', 'toxic', 'tragedy', 'trickle', 'trivial', 'Uncertainty', 'unscathed', 'upright', 'urgent', 'utmost', 'Vengeance', 'vicious', 'vindictive', 'vista', 'vocation', 'void', 'Wary', 'whim', 'wince', 'wrath', 'Yearn']

# Note: seven is taken out because the age we are considering 

coreVocab = []
vocab = []
nonVocab = []

st = WordNetLemmatizer()
"""
The following lammatize the splitted words and put them in lower case 

"""
for word in kd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()])) 
for word in oned:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in twod:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in threed:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in fourd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in fived:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in sixd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
# for word in sevend:
#     coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
    

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:

        splitQuery = [st.lemmatize(i.lower()) for i in query.split(' ')]

        queryVocab = 0
        nonqueryVocab = 0
        totalVocab = 0

        for word in splitQuery:
            if word in coreVocab:
                queryVocab  +=1
                totalVocab  +=1
            else:
                nonqueryVocab +=1
                totalVocab  +=1

        vocab.append(queryVocab/totalVocab) 
        nonVocab.append(nonqueryVocab/totalVocab) 
        pbar.update() 

Vocab = pd.DataFrame(data=vocab, columns = ['coreVocab'])
Vocab['query'] = allQueries
Vocab['nonCoreVocab'] = nonVocab
# Vocab['qID'] = qID
# Vocab = Vocab.set_index('query')


100%|██████████| 4746/4746 [00:00<00:00, 18293.47it/s]


In [238]:
Vocab.head(2)

Unnamed: 0,coreVocab,query,nonCoreVocab
0,0.5,US civil war causes,0.5
1,0.0,scooter brands,1.0


In [239]:
Vocab.shape

(4746, 3)

In [240]:
if "results" in coreVocab:
    print("it's there")
else:
    print("not there")

not there


In [241]:
print("Core Vocab = ", len(coreVocab))
# print( "Sum words in all grades = ", len(kd + sevend + oned + twod + threed + fourd + fived + sixd ))

Core Vocab =  1780


In [242]:
Vocab

Unnamed: 0,coreVocab,query,nonCoreVocab
0,0.50,US civil war causes,0.50
1,0.00,scooter brands,1.00
2,0.00,scooter brands reliable,1.00
3,0.00,scooter,1.00
4,0.00,scooter cheap,1.00
...,...,...,...
4741,1.00,House of dreams,0.00
4742,0.50,When did Desmond doss get married,0.50
4743,0.00,H,1.00
4744,0.75,find fact about dog,0.25


In [243]:
len(Vocab['coreVocab'])

4746

In [244]:
len(kd + oned + twod + threed + fourd + fived + sixd )

1780

In [245]:
# Comment:

# 1st query coreVocab is 0.5 because 'consulting' is in core vocab
# Sum of coreVocab and nonCoreVocab should = 1


# Age of Acquisition features

In this block of code we first load up the Age of Acquistion data set (which is a csv with multiple columns representing a variety of information) and process it into a dictionary where the key is the word, and the value is AoA rating. We then find the AoA rating for each word in the query, extracting the min, max, average (known as query complexity), and ratio of words expected to be learned by the age of 12.

In [246]:
# look at the info in the data

# the words expected to be known at a certain age

dtAoA = pd.read_csv('DataSets/AoA/AoA_51715_words.csv')
dtAoA.head()

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,
2,abacus,abacus,0.24,Noun,6,6,3,abacus,8.69,0.65,8.69,0.65,,,,
3,abacuses,abacuses,0.02,Noun,8,9,4,abacus,,,8.69,0.65,,,,
4,abalone,abalone,0.51,Verb,7,7,4,abalone,12.23,0.72,12.23,0.72,,,,


In [247]:
dtAoA.loc[dtAoA['Lemma_highest_PoS']=='it', 'AoA_Kup_lem']

24706    4.4
Name: AoA_Kup_lem, dtype: float64

In [248]:
dtAoA.loc[dtAoA['Lemma_highest_PoS']=='consult', 'AoA_Kup_lem']

9667    10.95
9672    10.95
9673    10.95
9674    10.95
Name: AoA_Kup_lem, dtype: float64

In [249]:
AoAvocab = [] # a word and it's corresponding age

with open('DataSets/AoA/AoA_51715_words.csv') as csvFile:
    csvReader = csv.reader(csvFile)
    lineCount = 0
    for row in csvReader:
        if lineCount == 0:
            lineCount += 1
        else:
            AoAvocab.append(row[7]) # row[7]: Column (Lemma_highest_PoS) with lemma of the words
            AoAvocab.append(row[10]) # row[10]: Column 
            
AoAVConv = convert(AoAvocab) # make 'AoAvocab' a dict of words and corresponding age

minAoA = []
maxAoA = []
averageVocab = []
ratioAoA = []

st = WordNetLemmatizer()


with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        count = 0
        vocab = []

        for word in query.split(' '):
            word = word.lower().strip()
            word = re.sub(r'[^\w\s]','',word) 
            word = st.lemmatize(word)
            if word in AoAVConv:
                vocab.append(float(AoAVConv[word])) # if a word in query is found in listed vocabulary we save it's value
            else:
                vocab.append(0)


        vocab = np.array(vocab)
        
        if vocab.size == 0:
            minAoA.append(-1) 
            maxAoA.append(-1) 
            averageVocab.append(-1)
            ratioAoA.append(0)
        elif vocab.size > 0:
            minAoA.append(np.min(vocab))
            maxAoA.append(np.max(vocab))
            averageVocab.append(np.mean(vocab))
            for entry in vocab:
                if entry < 13 and entry > 0: # 13 = limited age
                    count +=1
            ratioAoA.append(count/len(vocab))
        
        pbar.update()

Vocab['minAoA'] = minAoA
Vocab['maxAoA'] = maxAoA
Vocab['ratioAoA'] = ratioAoA
Vocab['queryComplexity'] = averageVocab # mean of age

100%|██████████| 4746/4746 [00:00<00:00, 38669.87it/s]


In [250]:
Vocab.shape

(4746, 7)

In [251]:
# --- Start verify ---

In [252]:
#  vocab[1]

In [253]:
np.min(vocab)

4.28

In [254]:
AoAvocab

['a',
 '2.89',
 'aardvark',
 '9.89',
 'abacus',
 '8.69',
 'abacus',
 '8.69',
 'abalone',
 '12.23',
 'abalone',
 '12.23',
 'abandon',
 '8.32',
 'abandon',
 '8.32',
 'abandoner',
 '11.89',
 'abandon',
 '8.32',
 'abandonment',
 '10.27',
 'abandon',
 '8.32',
 'abase',
 '14.57',
 'abasement',
 '15.13',
 'abate',
 '14.44',
 'abate',
 '14.44',
 'abatement',
 '15.12',
 'abate',
 '14.44',
 'abate',
 '14.44',
 'abattoir',
 '15.17',
 'abbacy',
 '14.50',
 'abbess',
 '15.43',
 'abbess',
 '15.43',
 'abbey',
 '13.06',
 'abbot',
 '12.10',
 'abbreviate',
 '9.95',
 'abbreviated',
 '10.50',
 'abbreviation',
 '9.11',
 'abbreviation',
 '9.11',
 'abdicate',
 '12.60',
 'abdicate',
 '12.60',
 'abdicate',
 '12.60',
 'abdicate',
 '12.60',
 'abdication',
 '14.94',
 'abdomen',
 '8.61',
 'abdomen',
 '8.61',
 'abdominal',
 '10.24',
 'abduct',
 '11.26',
 'abduct',
 '11.26',
 'abduct',
 '11.26',
 'abduction',
 '11.94',
 'abduction',
 '11.94',
 'abductor',
 '11.11',
 'abductor',
 '11.11',
 'abduct',
 '11.26',
 'abeam'

In [255]:
AoAVConv

{'a': '2.89',
 'aardvark': '9.89',
 'abacus': '8.69',
 'abalone': '12.23',
 'abandon': '8.32',
 'abandoner': '11.89',
 'abandonment': '10.27',
 'abase': '14.57',
 'abasement': '15.13',
 'abate': '14.44',
 'abatement': '15.12',
 'abattoir': '15.17',
 'abbacy': '14.50',
 'abbess': '15.43',
 'abbey': '13.06',
 'abbot': '12.10',
 'abbreviate': '9.95',
 'abbreviated': '10.50',
 'abbreviation': '9.11',
 'abdicate': '12.60',
 'abdication': '14.94',
 'abdomen': '8.61',
 'abdominal': '10.24',
 'abduct': '11.26',
 'abduction': '11.94',
 'abductor': '11.11',
 'abeam': '13.40',
 'aberrant': '13.31',
 'aberration': '12.69',
 'abet': '12.15',
 'abettor': '15.55',
 'abeyance': '15.00',
 'abhor': '13.76',
 'abhorrent': '12.14',
 'abide': '9.50',
 'abiding': '10.30',
 'ability': '8.84',
 'abject': '14.00',
 'abjuration': '17.12',
 'abjure': '14.60',
 'ablation': '13.29',
 'ablaze': '10.83',
 'able': '7.79',
 'ablution': '13.64',
 'abnormal': '10.05',
 'abnormality': '11.58',
 'aboard': '8.66',
 'abode'

In [256]:
Vocab.shape


(4746, 7)

In [257]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity'],
      dtype='object')

# Sven Features

In this block of code we load the list of commonly searched for terms used by children as established in the Is Sven Seven data set, and then determines the ratio of those terms occuring in each query. #--- from thesis: children tend to use different vocabulary than adults when searching therefore we count the occurrence of words per  query found in the most common words found in children's websites from the Sven Children dictionary dataset ---

In [258]:
# looking into the data that we are loading 
svendict = pd.read_csv('DataSets/Sven/ChildrenDict.tsv', sep = '\t')
svendict.head()

Unnamed: 0,1,family,71289
0,2,story,43562
1,3,life,29678
2,4,�,26454
3,5,trtd,20449
4,6,love,19698


In [259]:
SVENwords = []
st = WordNetLemmatizer()
with open('DataSets/Sven/ChildrenDict.tsv') as csvFile:
    csvReader = csv.reader(csvFile, delimiter = '\t')
    lineCount = 0
    for row in csvReader:
#         print(row)
        if lineCount == 0:
            lineCount +=1
        else:
            SVENwords.append((row[1]))
            
SVENcount = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        for word in query.split(' '):
#             print(word)
            wordCount +=1
            if word in SVENwords:
                countWord +=1

        SVENcount.append(countWord/wordCount)
        pbar.update()
        
Vocab['SVEN'] = SVENcount

100%|██████████| 4746/4746 [00:04<00:00, 1042.33it/s]


In [260]:
Vocab.shape

(4746, 8)

In [261]:
len(SVENcount)

4746

## Concrete/Abstract

In [262]:
word_concreteness = pickle.load( open( "DataSets/word_concreteness.p", "rb" ) )

In [408]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

In [263]:
word_concreteness['word']=word_concreteness['word'].str.lower()

In [264]:
# abstractWords = word_concreteness.loc[word_concreteness['label']=='abstract', 'word']
# concreteWords = word_concreteness.loc[word_concreteness['label']=='concrete', 'word']

In [393]:

aw = word_concreteness[word_concreteness['label']=='abstract']
cw = word_concreteness[word_concreteness['label']=='concrete']

abW = []
coW = []
for w_a in aw['word']:
    abW.append(w_a)
for w_c in cw['word']:
    coW.append(w_c)

absrtCount = []
concCount = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
#         vocab = []
        a_countWord = 0
        c_countWord = 0
        wordCount = 0
        for word in query.split(' '):
            word.lower()
#             print(word)
            wordCount +=1
            if word in abW:
                a_countWord +=1
            if word in coW:
                c_countWord +=1

        absrtCount.append(a_countWord/wordCount)
        concCount.append(c_countWord/wordCount)
        pbar.update()
        
Vocab['ratioAbs'] = absrtCount
Vocab['ratioConc'] = concCount

100%|██████████| 4746/4746 [00:04<00:00, 952.74it/s] 


In [None]:
# save abs and con

In [403]:
aC =  absrtCount.copy()
cC = concCount.copy()

In [409]:
dt = {'ratioAbs': aC,
      'ratioConc': cC}
abs_conc = pd.DataFrame(dt)
# abs_conc.to_csv('abs_conc.csv')

pickle.dump(abs_conc, open( "Pickles/4746Abs_concFeat.p", "wb" ) )


In [405]:
pwd

'/Users/assoumerredempta/Documents/aSpring_2023/RYSe_Final/FeatureExtraction'

In [394]:
Vocab

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,com,net,org,edu,gov,http,AND,OR,quotes,inter
0,0.50,US civil war causes,0.50,0.00,10.89,0.75,6.100000,0.500000,0.250000,0.250000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,scooter brands,1.00,6.68,7.72,1.00,7.200000,0.500000,0.000000,0.500000,...,0,0,0,0,0,0,0,0,0,0
2,0.00,scooter brands reliable,1.00,6.68,9.32,1.00,7.906667,0.666667,0.333333,0.333333,...,0,0,0,0,0,0,0,0,0,0
3,0.00,scooter,1.00,6.68,6.68,1.00,6.680000,1.000000,0.000000,1.000000,...,0,0,0,0,0,0,0,0,0,0
4,0.00,scooter cheap,1.00,6.68,7.11,1.00,6.895000,1.000000,0.500000,0.500000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,1.00,House of dreams,0.00,3.16,4.88,1.00,4.196667,0.333333,0.333333,0.000000,...,0,0,0,0,0,0,0,0,0,0
4742,0.50,When did Desmond doss get married,0.50,0.00,5.16,0.50,2.095000,0.333333,0.500000,0.000000,...,0,0,0,0,0,0,0,0,0,1
4743,0.00,H,1.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
4744,0.75,find fact about dog,0.25,2.80,6.47,1.00,5.030000,0.500000,0.750000,0.250000,...,0,0,0,0,0,0,0,0,0,0


# Top Stereotype Uni-Grams

In this block of code we take 80% of the sessions generated by users who belong to our stereotype, extract the 250 most common word uni-grams found in that sample, and then calculate the number of those words found in each query as well as the antecedent and the consequent ratio of words per query that are found in the top 250.


In [268]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
allSessions = allSessions[allSessions['class'] == 1]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()

text = ''

for query in queries:
    text += query.lower() + " "
    
queryWords = text.split()

resultWords  = [word for word in queryWords if word.lower() not in stopwords]
text = ' '.join(resultWords)
text = text.split(' ')
fdist1 = nltk.FreqDist(text)
top250 = []

for x in fdist1.most_common(250):
    top250.append(x[0])
    
top250count = []
top250avg = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        for word in query.split(' '):
            wordCount +=1
            if word in top250:
                countWord +=1
            else:
                pass
        top250count.append(countWord)
        top250avg.append(countWord/wordCount)
        pbar.update()


Vocab['top250SterCount'] = top250count
Vocab['top250SterRatAnt'] = top250avg
Vocab['top250SterRatCon'] = 1-Vocab['top250SterRatAnt']

100%|██████████| 4746/4746 [00:00<00:00, 88243.20it/s]


In [269]:
allSessions

Unnamed: 0,query,class,sID
3797,why is idaho called the gem state,1,5179
3799,a famos mathution,1,8600
3801,Victorea stillwells Big moments,1,6411
3802,Information about tigers,1,9041
3803,science,1,8574
...,...,...,...
4740,Kids facts about Norway,1,7234
4741,House of dreams,1,5975
4742,When did Desmond doss get married,1,5233
4743,H,1,7864


In [270]:
Vocab.shape

(4746, 13)

In [271]:
# len(queries)

In [272]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon'],
      dtype='object')

# Top Non-Stereotype Uni-Grams

In this block of code we take 80% of the sessions generated by users who do not belong to our stereotype, extract the 250 most common word uni-grams found in that sample, and then calculate the number of those words found in each query as well as the antecedent and the consequent ratio of words per query that are found in the top 250.

In [273]:
# n-grams: once of ways helping machines understand the a word and it's contex

In [274]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
allSessions = allSessions[allSessions['class'] == 0]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()



text = ''

for query in queries:
    text += query.lower() + " "
    
queryWords = text.split()

resultWords  = [word for word in queryWords if word.lower() not in stopwords]
text = ' '.join(resultWords)
text = text.split(' ')
fdist1 = nltk.FreqDist(text)
top250 = []

for x in fdist1.most_common(250):
    top250.append(x[0])
    
top250Count = []
top250Avg = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        for word in query.split(' '):
            wordCount +=1
            if word in top250:
                countWord +=1
            else:
                pass
        top250Count.append(countWord)
        top250Avg.append(countWord/wordCount)
        pbar.update()
    
Vocab['top250NonSterCount'] = top250Count
Vocab['top250NonSterRatAnt'] = top250Avg
Vocab['top250NonSterRatCon'] = 1-Vocab['top250NonSterRatAnt']

100%|██████████| 4746/4746 [00:00<00:00, 105130.06it/s]


In [275]:
Vocab.shape

(4746, 16)

In [276]:
# len(queries)

In [277]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon'],
      dtype='object')

In [278]:
Vocab.head(2)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,top250SterCount,top250SterRatAnt,top250SterRatCon,top250NonSterCount,top250NonSterRatAnt,top250NonSterRatCon
0,0.5,US civil war causes,0.5,0.0,10.89,0.75,6.1,0.5,0.25,0.25,0,0.0,1.0,1,0.25,0.75
1,0.0,scooter brands,1.0,6.68,7.72,1.0,7.2,0.5,0.0,0.5,0,0.0,1.0,1,0.5,0.5


# Top Stereotype Bi-Grams

In this block of code we take 80% of the sessions generated by users who belong to our stereotype, extract the 50 most common word bi-grams found in that sample, and then calculate the number of those words found in each query as well as the antecedent and the consequent ratio of words per query that are found in the top 50.


In [279]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
allSessions = allSessions[allSessions['class'] == 1]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()


queries2 = []
for query in queries:
    queries2.append(query.lower())
queries = queries2

bigrams = [b for l in queries for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]

fdist1 = nltk.FreqDist(bigrams)

top50 = []

for x in fdist1.most_common(50):
     top50.append(x[0])
        
top50count = []
top50avg = []

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        query = query.lower()
        query = query.split(" ")
        split = nltk.bigrams(query)
        for word in split:
            wordCount +=1
            if word in top50:
                countWord +=1
            else:
                pass
        top50count.append(countWord)
        if wordCount > 0:
            top50avg.append(countWord/wordCount)
        else:
            top50avg.append(0) 
        pbar.update()
        
Vocab['top50SterCount'] = top50count
Vocab['top50SterRatAnt'] = top50avg
Vocab['top50SterAntCon'] = 1-Vocab['top50SterRatAnt']

100%|██████████| 4746/4746 [00:00<00:00, 210297.88it/s]


In [280]:
allSessions

Unnamed: 0,query,class,sID
3797,why is idaho called the gem state,1,5179
3798,alcoholic,1,9005
3799,a famos mathution,1,8600
3801,Victorea stillwells Big moments,1,6411
3802,Information about tigers,1,9041
...,...,...,...
4738,How many Star Wars movies will be made?,1,9488
4739,Mount St. Helens,1,9233
4740,Kids facts about Norway,1,7234
4742,When did Desmond doss get married,1,5233


In [281]:
Vocab.shape

(4746, 19)

In [282]:
# len(queries)

In [283]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon'],
      dtype='object')

# Top Non-Stereotype Bi-Grams

In this block of code we take 80% of the sessions generated by users who do not belong to our stereotype, extract the 50 most common word bi-grams found in that sample, and then calculate the number of those words found in each query as well as the antecedent and the consequent ratio of words per query that are found in the top 50.

In [284]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
allSessions = allSessions[allSessions['class'] == 0]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()


queries2 = []
for query in queries:
    queries2.append(query.lower())
queries = queries2

bigrams = [b for l in queries for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]

fdist1 = nltk.FreqDist(bigrams)

top50 = []

for x in fdist1.most_common(50):
     top50.append(x[0])
        
top50count = []
top50avg = []

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        query = query.lower()
        query = query.split(" ")
        split = nltk.bigrams(query)
        for word in split:
            wordCount +=1
            if word in top50:
                countWord +=1
            else:
                pass
        top50count.append(countWord)
        if wordCount > 0:
            top50avg.append(countWord/wordCount)
        else:
            top50avg.append(0)
        pbar.update()
        
Vocab['top50NonSterCount'] = top50count
Vocab['top50NonSterRatAnt'] = top50avg
Vocab['top50NonSterAntCon'] = 1-Vocab['top50NonSterRatAnt']

100%|██████████| 4746/4746 [00:00<00:00, 195476.63it/s]


In [285]:
Vocab.shape

(4746, 22)

# TF-IDF All

In the following block of code we take an 80% sample of all sessions found in SWC, and then calculate the TF-IDF values for each all queries with each query in every session seen as an individual document. 

----------**TF-IDF** stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc) in a document amongst a collection of documents-------

In [286]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()


text = ''
for query in queries:
    text += query + " "

querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessions['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(queries)
vectors = vector.transform(allQueries)

listTFIDF = []

with tqdm(total=len(allQueries)) as pbar:
    for m in vectors:
        if(m.sum() != 0):
            listTFIDF.append(m.sum() / m.count_nonzero())
        else:
            listTFIDF.append(-1)
        pbar.update()
        
VocabTFIDFAll = pd.DataFrame(data=listTFIDF, columns = ['tfidfAll']).fillna(-1)
# VocabTFIDFAll['query'] = allQueries
VocabTFIDFAll['qID'] = qID
Vocab['qID'] = qID #-----------------------------------------added
Vocab = pd.merge(Vocab, VocabTFIDFAll, on='qID')
# Vocab = Vocab.merge(VocabTFIDFAll, on = 'query')

100%|██████████| 4746/4746 [00:00<00:00, 12157.67it/s]


In [287]:
# ----------S

In [288]:
VocabTFIDFAll

Unnamed: 0,tfidfAll,qID
0,0.491051,6352
1,0.704467,8305
2,0.574193,6814
3,1.000000,7688
4,0.706889,6221
...,...,...
4741,0.552780,5975
4742,0.405230,5233
4743,-1.000000,7864
4744,0.498821,5316


In [289]:
Vocab

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,top250NonSterRatAnt,top250NonSterRatCon,top50SterCount,top50SterRatAnt,top50SterAntCon,top50NonSterCount,top50NonSterRatAnt,top50NonSterAntCon,qID,tfidfAll
0,0.50,US civil war causes,0.50,0.00,10.89,0.75,6.100000,0.500000,0.250000,0.250000,...,0.250000,0.750000,0,0.000000,1.000000,0,0.0,1.0,6352,0.491051
1,0.00,scooter brands,1.00,6.68,7.72,1.00,7.200000,0.500000,0.000000,0.500000,...,0.500000,0.500000,0,0.000000,1.000000,0,0.0,1.0,8305,0.704467
2,0.00,scooter brands reliable,1.00,6.68,9.32,1.00,7.906667,0.666667,0.333333,0.333333,...,0.333333,0.666667,0,0.000000,1.000000,0,0.0,1.0,6814,0.574193
3,0.00,scooter,1.00,6.68,6.68,1.00,6.680000,1.000000,0.000000,1.000000,...,1.000000,0.000000,0,0.000000,1.000000,0,0.0,1.0,7688,1.000000
4,0.00,scooter cheap,1.00,6.68,7.11,1.00,6.895000,1.000000,0.500000,0.500000,...,1.000000,0.000000,0,0.000000,1.000000,0,0.0,1.0,6221,0.706889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,1.00,House of dreams,0.00,3.16,4.88,1.00,4.196667,0.333333,0.333333,0.000000,...,0.000000,1.000000,0,0.000000,1.000000,0,0.0,1.0,5975,0.552780
4742,0.50,When did Desmond doss get married,0.50,0.00,5.16,0.50,2.095000,0.333333,0.500000,0.000000,...,0.166667,0.833333,1,0.200000,0.800000,0,0.0,1.0,5233,0.405230
4743,0.00,H,1.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,0,0.000000,1.000000,0,0.0,1.0,7864,-1.000000
4744,0.75,find fact about dog,0.25,2.80,6.47,1.00,5.030000,0.500000,0.750000,0.250000,...,0.250000,0.750000,1,0.333333,0.666667,0,0.0,1.0,5316,0.498821


In [290]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon', 'top50NonSterCount',
       'top50NonSterRatAnt', 'top50NonSterAntCon', 'qID', 'tfidfAll'],
      dtype='object')

In [291]:
len(VocabTFIDFAll), len(Vocab)

(4746, 4746)

In [292]:
VocabTFIDFAll['qID'].equals(Vocab['qID'])

True

In [293]:
# pd.merge(Vocab, VocabTFIDFAll, on='qID')

In [294]:
# Vocab['query']

In [295]:
# a=Vocab.copy()
# b=VocabTFIDFAll.copy()

In [296]:
# a.rename(columns={'query': 'key'}, inplace=True)
# b.rename(columns={'query': 'key'}, inplace=True)

In [297]:
# a[a.key == 'youtube']

In [298]:
# b[b.key == 'youtube']

In [299]:
# Vocab.merge(VocabTFIDFAll)['query'][Vocab.merge(VocabTFIDFAll)['query'] == 'youtube'].values

In [300]:
# a.join(b, lsuffix='_Vocab', rsuffix= '_VocabTFIDFAll', how='inner')

In [301]:
# a=Vocab['query'].map(lambda x: len(x)).sort_values()

In [302]:
# b=VocabTFIDFAll['query'].map(lambda x: len(x)).sort_values()

In [303]:
# Vocab.shape

In [304]:
# a.equals(b)

In [305]:
# Vocab.index

In [306]:
# VocabTFIDFAll.index

In [307]:
# Vocab.merge(VocabTFIDFAll)['query']

In [308]:
# VocabTFIDFAll.isnull().sum()

In [309]:
# Vocab.isnull().sum()

In [310]:
# b

In [311]:
# a[a != b]

In [312]:
# VocabTFIDFAll.merge(Vocab, on='query')

In [313]:
df1 = pd.DataFrame({'team' : ['A', 'B', 'A'], 
                    'points' : [1, 2, 1],
                    'ID' : [1, 2, 3]}) 
df1

Unnamed: 0,team,points,ID
0,A,1,1
1,B,2,2
2,A,1,3


In [314]:
df2 = pd.DataFrame({'team' : ['A', 'D', 'C'],
                    'age' : [3,4,5],
                    'ID' : [1, 2, 3]})

df2

Unnamed: 0,team,age,ID
0,A,3,1
1,D,4,2
2,C,5,3


In [315]:
df1.merge(df2, on='ID')

Unnamed: 0,team_x,points,ID,team_y,age
0,A,1,1,A,3
1,B,2,2,D,4
2,A,1,3,C,5


In [316]:
# VocabTFIDFAll.isnull().values.any()

# TF-IDF Stereotype

In the following block of code we take an 80% sample of all stereotype sessions found in SWC, and then calculate the TF-IDF values for each all queries with each query in every session seen as an individual document. 


In [317]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allQueries = list(set(allQueries))
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
# allQueries = list(set(allQueries))
allSessions = allSessions[allSessions['class'] == 1]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()


text = ''
for query in queries:
    text += query + " "
    
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessions['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(querywords)
vectors = vector.transform(allQueries)

listTFIDF = []

with tqdm(total=len(allQueries)) as pbar:
    for m in vectors:
        if(m.sum() != 0):
            listTFIDF.append(m.sum() / m.count_nonzero())
        else:
            listTFIDF.append(-1)
        pbar.update()

        
VocabTFIDF = pd.DataFrame(data=listTFIDF, columns = ['tfidfS']).fillna(-1)
# VocabTFIDF['query'] = allQueries
# Vocab = pd.merge(Vocab, VocabTFIDF, on='query')
# Vocab = Vocab.merge(VocabTFIDF, on = 'query')
VocabTFIDF['qID'] = qID
Vocab = pd.merge(Vocab, VocabTFIDF, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 14837.42it/s]


In [318]:
queries

['why is idaho called the gem state',
 'a famos mathution',
 'motion waves',
 'Victorea stillwells Big moments',
 'Information about tigers',
 'science',
 'fact on tigers',
 'meow meow meow meow meow meow meow meow meow meow meow',
 'google docs',
 'Videos of Victoria stillwell',
 'Deathly Hallows part 2',
 'test search!',
 'South Africa facts kids',
 'What is the weeknds most famous song',
 'cast',
 "when's the last time you let your heart decide?",
 'The leopard project done by Andrew kitties',
 'Guatemala fun facts for kids',
 'facrs',
 'difference between dogs and wolves',
 'What is the new son by son and son?',
 'When is the Fast and ferious 9 coming in theaters',
 'Spirited away',
 'interesting fact about space',
 'Caculator',
 'Ada Lovelace biography for kids',
 'Where was mother Teresa born',
 'the weeney',
 'how do humans use water',
 'equal setam',
 'what is the state bird of idaho is',
 'Glg',
 'math',
 'Bethany Hamilton 2nd kid name',
 'facts about birds',
 'Bethany Hamilto

In [319]:
VocabTFIDF.head(30)

Unnamed: 0,tfidfS,qID
0,0.706882,6352
1,-1.0,8305
2,-1.0,6814
3,-1.0,7688
4,-1.0,6221
5,-1.0,6645
6,-1.0,9056
7,-1.0,7349
8,0.707107,5008
9,0.707107,6503


In [320]:
Vocab.head(2)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,top250NonSterRatCon,top50SterCount,top50SterRatAnt,top50SterAntCon,top50NonSterCount,top50NonSterRatAnt,top50NonSterAntCon,qID,tfidfAll,tfidfS
0,0.5,US civil war causes,0.5,0.0,10.89,0.75,6.1,0.5,0.25,0.25,...,0.75,0,0.0,1.0,0,0.0,1.0,6352,0.491051,0.706882
1,0.0,scooter brands,1.0,6.68,7.72,1.0,7.2,0.5,0.0,0.5,...,0.5,0,0.0,1.0,0,0.0,1.0,8305,0.704467,-1.0


In [321]:
VocabTFIDF.head(2)

Unnamed: 0,tfidfS,qID
0,0.706882,6352
1,-1.0,8305


In [322]:
VocabTFIDF.shape

(4746, 2)

In [323]:
Vocab.shape

(4746, 25)

In [324]:
#-- E

In [325]:
# pd.merge(Vocab, VocabTFIDFAll, left_index=True, right_index=True)

In [326]:
(Vocab.columns).is_unique

True

In [327]:
Vocab.shape

(4746, 25)

# TF-IDF Non-Stereotype

In the following block of code we take an 80% sample of all non-stereotype sessions found in SWC, and then calculate the TF-IDF values for each all queries with each query in every session seen as an individual document. 

In [328]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allQueries = list(set(allQueries))
# allSessionsQ = allSessions.loc[allSessions['type']=='Q']
# allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
# sID = allSessionsQ['sID'].unique()
# corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
# allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
# queries = allSessionsQ['query'].tolist()

allSessions = pickle.load( open( "../Data/DataSets/SQS/castsventrecSQS.p", "rb" ) )
# allQueries = list(set(allQueries))
allSessions = allSessions[allSessions['class'] == 0]
sID = allSessions['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessions = allSessions[allSessions['sID'].isin(corpus)]
queries = allSessions['query'].tolist()


text = ''
for query in queries:
    text += query + " "
    
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessions['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(queries)
vectors = vector.transform(allQueries)

listTFIDF = []

with tqdm(total=len(allQueries)) as pbar:
    for m in vectors:
        if(m.sum() != 0):
            listTFIDF.append(m.sum() / m.count_nonzero())
        else:
            listTFIDF.append(-1)
        pbar.update()
    
VocabTFIDFNA = pd.DataFrame(data=listTFIDF, columns = ['tfidfNS']).fillna(-1)
# VocabTFIDFNA['query'] = allQueries
# Vocab = pd.merge(Vocab, VocabTFIDFNA, left_index=True, right_index=True)
# Vocab = Vocab.merge(VocabTFIDFNA, on = 'query')
VocabTFIDFNA['qID'] = qID
Vocab = pd.merge(Vocab, VocabTFIDFNA, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 12573.05it/s]


In [329]:
Vocab.shape

(4746, 26)

In [330]:
len(queries)

3037

In [331]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon', 'top50NonSterCount',
       'top50NonSterRatAnt', 'top50NonSterAntCon', 'qID', 'tfidfAll', 'tfidfS',
       'tfidfNS'],
      dtype='object')

# Stopwords

In the following block of code we load up a list of stopwords (not those found in NLTK) and then count the number of stopwords found in each query.

In [332]:
stopWords = []
st = WordNetLemmatizer()

with open('DataSets/stopwords.txt') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        if (row):
            stopWords.append(st.lemmatize(row[0]))
        else:
            pass

        
stopCount = []
stopAverage= []
st = WordNetLemmatizer()

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        count = 0
        for word in query.split(' '):
            word = word.lower().strip()
            word = re.sub(r'[^\w\s]','',word)
            word = st.lemmatize(word)
            if word in stopWords:
                count +=1
            else:
                pass
        stopCount.append(count)
        stopAverage.append(count/len(query.split(' ')))
        pbar.update()
    
VocabStop = pd.DataFrame(data=stopCount, columns = ['stopCount'])
# VocabStop['query'] = allQueries
# Vocab = pd.merge(Vocab, VocabStop, left_index=True, right_index=True)
# Vocab = Vocab.merge(VocabStop, on = 'query')
VocabStop['qID'] = qID
Vocab = pd.merge(Vocab, VocabStop, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 26716.82it/s]


In [333]:
VocabStop

Unnamed: 0,stopCount,qID
0,2,6352
1,0,8305
2,0,6814
3,0,7688
4,0,6221
...,...,...
4741,1,5975
4742,3,5233
4743,1,7864
4744,1,5316


In [334]:
Vocab.shape

(4746, 27)

In [335]:
Vocab.head()

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,top50SterRatAnt,top50SterAntCon,top50NonSterCount,top50NonSterRatAnt,top50NonSterAntCon,qID,tfidfAll,tfidfS,tfidfNS,stopCount
0,0.5,US civil war causes,0.5,0.0,10.89,0.75,6.1,0.5,0.25,0.25,...,0.0,1.0,0,0.0,1.0,6352,0.491051,0.706882,0.488738,2
1,0.0,scooter brands,1.0,6.68,7.72,1.0,7.2,0.5,0.0,0.5,...,0.0,1.0,0,0.0,1.0,8305,0.704467,-1.0,0.704259,0
2,0.0,scooter brands reliable,1.0,6.68,9.32,1.0,7.906667,0.666667,0.333333,0.333333,...,0.0,1.0,0,0.0,1.0,6814,0.574193,-1.0,0.5744,0
3,0.0,scooter,1.0,6.68,6.68,1.0,6.68,1.0,0.0,1.0,...,0.0,1.0,0,0.0,1.0,7688,1.0,-1.0,1.0,0
4,0.0,scooter cheap,1.0,6.68,7.11,1.0,6.895,1.0,0.5,0.5,...,0.0,1.0,0,0.0,1.0,6221,0.706889,-1.0,0.706948,0


In [336]:
# ----- start verify ----

In [337]:
if "of" in stopWords:
    print('in')
else:
    print("not in")

in


In [338]:
len(stopWords)

659

In [339]:
VocabStop['stopCount']

0       2
1       0
2       0
3       0
4       0
       ..
4741    1
4742    3
4743    1
4744    1
4745    0
Name: stopCount, Length: 4746, dtype: int64

In [340]:
# ---- end -----

## VERIFIED 

In [341]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon', 'top50NonSterCount',
       'top50NonSterRatAnt', 'top50NonSterAntCon', 'qID', 'tfidfAll', 'tfidfS',
       'tfidfNS', 'stopCount'],
      dtype='object')

# Net Vocab

In the following block of code we count the occurence of individual net vocabulary found in each query.

In [342]:
www = []
com = []
net = []
org = []
gov = []
edu = []
http = []

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:

        if "www." in query:
            www.append(1)
        else:
            www.append(0)

        if ".com" in query:
            com.append(1)
        else:
            com.append(0)

        if ".net" in query:
            net.append(1)
        else:
            net.append(0)

        if ".org" in query:
            org.append(1)
        else:
            org.append(0)

        if ".edu" in query:
            edu.append(1)
        else:
            edu.append(0)

        if ".gov" in query:
            gov.append(1)
        else:
            gov.append(0)

        if "http" in query:
            http.append(1)
        else:
            http.append(0)
        
        pbar.update()
        
VocabNet = pd.DataFrame(data=com, columns = ['com'])
VocabNet['net'] = net
VocabNet['org'] = org
VocabNet['edu'] = edu
VocabNet['gov'] = gov
VocabNet['http'] = http
# VocabNet['query'] = allQueries
# Vocab = Vocab.merge(VocabNet, on = 'query')
VocabNet['qID'] = qID
Vocab = pd.merge(Vocab, VocabNet, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 519363.57it/s]


In [343]:
VocabNet.head()

Unnamed: 0,com,net,org,edu,gov,http,qID
0,0,0,0,0,0,0,6352
1,0,0,0,0,0,0,8305
2,0,0,0,0,0,0,6814
3,0,0,0,0,0,0,7688
4,0,0,0,0,0,0,6221


In [344]:
# ---------------- start verify --------------------------------

In [345]:
len(VocabNet)

4746

In [346]:
Vocab.head(3)

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,tfidfAll,tfidfS,tfidfNS,stopCount,com,net,org,edu,gov,http
0,0.5,US civil war causes,0.5,0.0,10.89,0.75,6.1,0.5,0.25,0.25,...,0.491051,0.706882,0.488738,2,0,0,0,0,0,0
1,0.0,scooter brands,1.0,6.68,7.72,1.0,7.2,0.5,0.0,0.5,...,0.704467,-1.0,0.704259,0,0,0,0,0,0,0
2,0.0,scooter brands reliable,1.0,6.68,9.32,1.0,7.906667,0.666667,0.333333,0.333333,...,0.574193,-1.0,0.5744,0,0,0,0,0,0,0


In [347]:
Vocab.shape

(4746, 33)

#### Check the "domain name" in the qeury

In [348]:
# Vocab['com'].value_counts()

In [349]:
# Vocab['net'].value_counts()

In [350]:
# Vocab['org'].value_counts()

In [351]:
# Vocab['gov'].value_counts()

In [352]:
# Vocab['http'].value_counts()

In [353]:
query

'kid'

In [354]:
# Vocab.loc[Vocab['http'] == 1, 'query']

In [355]:
# Vocab.loc[Vocab['org'] == 1, 'query']

In [356]:
# Vocab.loc[Vocab['com'] == 1, 'query']

In [357]:
# ---- end verify -----------

# Search Operators

In the following block of code we count the occurence of individual search operators found in each query.

In [358]:
AND = []
OR = []
quotes = []

with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:

        if "AND" in query:
            AND.append(1)
        else:
            AND.append(0)

        if "OR" in query:
            OR.append(1)
        else:
            OR.append(0)

        if "\"" in query:
            quotes.append(1) 
        else:
            quotes.append(0)
        
        pbar.update()
            
VocabOP = pd.DataFrame(data=AND, columns = ['AND'])
VocabOP['OR'] = OR
VocabOP['quotes'] = quotes
# VocabOP['query'] = allQueries
# Vocab = Vocab.merge(VocabOP, on = 'query')
VocabOP['qID'] = qID
Vocab = pd.merge(Vocab, VocabOP, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 521868.89it/s]


In [359]:
Vocab.shape

(4746, 36)

In [360]:
# ---- begin verify -----

In [361]:
# Vocab.loc[Vocab['OR'] == 1, 'query'] # ---? 

In [362]:
# Vocab['quotes'].value_counts()

In [363]:
# Vocab.loc[Vocab['quotes'] == 1, 'query']

In [364]:
# Comments:

# No queries with 'AND' oparator 

# ------------ End verify --------


# Interogatives 

In the following block of code we determine if a query contains an interogative.

In [365]:
inter = []
VocabInter = pd.DataFrame(data=vocab, columns = ['coreVocab'])

x = len(allQueries)

with tqdm(total=len(allQueries)) as pbar:
    for num in range(x):
        query = allQueries[num]

        if re.match(r"who( |'re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"what( |'re|re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"when( |'re|re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"where( |'re|re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"why( |'re|re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"how( |'re|re|'s|s)", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"is ", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"are ", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"can ", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"could ", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"should ", query, flags=re.IGNORECASE):
            inter.append(1)

        elif re.match(r"would ", query, flags=re.IGNORECASE):
            inter.append(1)

        else:
            inter.append(0)
    
        pbar.update() 
        
VocabInter = pd.DataFrame(data=inter, columns = ['inter'])
# VocabInter['query'] = allQueries
# Vocab = Vocab.merge(VocabInter, on = 'query')
VocabInter['qID'] = qID
Vocab = pd.merge(Vocab, VocabInter, on='qID')

100%|██████████| 4746/4746 [00:00<00:00, 104060.59it/s]


In [366]:
Vocab.head()

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,com,net,org,edu,gov,http,AND,OR,quotes,inter
0,0.5,US civil war causes,0.5,0.0,10.89,0.75,6.1,0.5,0.25,0.25,...,0,0,0,0,0,0,0,0,0,0
1,0.0,scooter brands,1.0,6.68,7.72,1.0,7.2,0.5,0.0,0.5,...,0,0,0,0,0,0,0,0,0,0
2,0.0,scooter brands reliable,1.0,6.68,9.32,1.0,7.906667,0.666667,0.333333,0.333333,...,0,0,0,0,0,0,0,0,0,0
3,0.0,scooter,1.0,6.68,6.68,1.0,6.68,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,scooter cheap,1.0,6.68,7.11,1.0,6.895,1.0,0.5,0.5,...,0,0,0,0,0,0,0,0,0,0


In [367]:
Vocab.shape

(4746, 37)

In [368]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon', 'top50NonSterCount',
       'top50NonSterRatAnt', 'top50NonSterAntCon', 'qID', 'tfidfAll', 'tfidfS',
       'tfidfNS', 'stopCount', 'com', 'net', 'org', 'edu', 'gov', 'http',
       'AND', 'OR', 'quotes', 'inter'],
      dtype='object')

In [369]:
# --- begin veirfy --------

In [370]:
Vocab["inter"].value_counts()

0    4323
1     423
Name: inter, dtype: int64

In [371]:
Vocab.loc[Vocab['inter'] == 1, 'query']

31                         how to quit smoking 
46                  what to take on a road trip
100                    where is dulles airport?
192                      how to get a pay raise
211      are developmental milestones universal
                         ...                   
4729               what is the height of a trex
4733       who was the first cumputer programer
4735           When  Did Nellie bly get Marie s
4738    How many Star Wars movies will be made?
4742          When did Desmond doss get married
Name: query, Length: 423, dtype: object

In [372]:
#----- end verif ---------


# VERIFIED

# Return Feature Set

Due to the length of this notebook, we have been merging data frames as we go. Therefore, at this point we only have to return the overall feature set.

In [373]:
# We drop the pID
Vocab__ = Vocab.drop(['qID'], axis=1)
Vocab__.shape

(4746, 36)

In [380]:
pickle.dump(Vocab__, open( "Pickles/4746VocabFeat.p", "wb" ) )


In [375]:
Vocab

Unnamed: 0,coreVocab,query,nonCoreVocab,minAoA,maxAoA,ratioAoA,queryComplexity,SVEN,ratioAbs,ratioConc,...,com,net,org,edu,gov,http,AND,OR,quotes,inter
0,0.50,US civil war causes,0.50,0.00,10.89,0.75,6.100000,0.500000,0.250000,0.250000,...,0,0,0,0,0,0,0,0,0,0
1,0.00,scooter brands,1.00,6.68,7.72,1.00,7.200000,0.500000,0.000000,0.500000,...,0,0,0,0,0,0,0,0,0,0
2,0.00,scooter brands reliable,1.00,6.68,9.32,1.00,7.906667,0.666667,0.333333,0.333333,...,0,0,0,0,0,0,0,0,0,0
3,0.00,scooter,1.00,6.68,6.68,1.00,6.680000,1.000000,0.000000,1.000000,...,0,0,0,0,0,0,0,0,0,0
4,0.00,scooter cheap,1.00,6.68,7.11,1.00,6.895000,1.000000,0.500000,0.500000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,1.00,House of dreams,0.00,3.16,4.88,1.00,4.196667,0.333333,0.333333,0.000000,...,0,0,0,0,0,0,0,0,0,0
4742,0.50,When did Desmond doss get married,0.50,0.00,5.16,0.50,2.095000,0.333333,0.500000,0.000000,...,0,0,0,0,0,0,0,0,0,1
4743,0.00,H,1.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
4744,0.75,find fact about dog,0.25,2.80,6.47,1.00,5.030000,0.500000,0.750000,0.250000,...,0,0,0,0,0,0,0,0,0,0


In [376]:
Vocab.columns

Index(['coreVocab', 'query', 'nonCoreVocab', 'minAoA', 'maxAoA', 'ratioAoA',
       'queryComplexity', 'SVEN', 'ratioAbs', 'ratioConc', 'top250SterCount',
       'top250SterRatAnt', 'top250SterRatCon', 'top250NonSterCount',
       'top250NonSterRatAnt', 'top250NonSterRatCon', 'top50SterCount',
       'top50SterRatAnt', 'top50SterAntCon', 'top50NonSterCount',
       'top50NonSterRatAnt', 'top50NonSterAntCon', 'qID', 'tfidfAll', 'tfidfS',
       'tfidfNS', 'stopCount', 'com', 'net', 'org', 'edu', 'gov', 'http',
       'AND', 'OR', 'quotes', 'inter'],
      dtype='object')

In [377]:
Vocab__.shape

(4746, 36)

In [378]:
print('done')

done


**Note:** Vocab dataframe contain a 'qID' column which is the 'sID' from the original dataset. It was added for the sake of merging in order to avoide duplicates. Before saving the 'VocabFeat.p' final dataframe, we drop the 'qID' and save it to save it do 'Vocab__'