This notebook extracts all search-related features from sessions. Features are aggregated the same way as in the other feature extraction notebooks: each aggregated feature value is mapped to its session ID.

# Load Libraries

In [1]:
import pickle

import pandas as pd
import numpy as np

from Levenshtein import distance as levenshtein_distance

# Load Data Sets

Load both data sets, process the SWC so it can be handled on a session by session basis. The same is done for SQS.

In [2]:
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# Context managers close the file handles as soon as the data has been read.
with open("../Data/DataSets/SWC/SWC.p", "rb") as swcFile:
    allSessions = pickle.load(swcFile)

# One entry per sID group, in the key order produced by groupby, so later cells
# can iterate session by session.
sessions = allSessions.groupby('sID').apply(pd.Series.tolist).tolist()

with open("../Data/DataSets/SQS/SQS.p", "rb") as sqsFile:
    allSessionsSQS = pickle.load(sqsFile)


# Number of Queries

The following block of code counts number of queries per session.

In [3]:
# Count the 'Q' rows of every session; the result is indexed by sID and seeds
# the feature table with its first column.
isQueryRow = allSessions['type'] == 'Q'
queryCounts = allSessions.loc[isQueryRow].groupby('sID')['type'].count()
searchFeat = queryCounts.to_frame(name='numQueries')

# Number of Clicks

The following block of code counts number of clicks per session.

In [4]:
# Count the 'C' rows of every session and attach them to the feature table.
isClickRow = allSessions['type'] == 'C'
numC = allSessions.loc[isClickRow].groupby('sID')['type'].count().to_frame(name='numClicks')

# Left join on the sID index: sessions missing from numC get NaN.
searchFeat = searchFeat.join(numC)

# Number of Clicks Per Query

The following block of code determines average number of clicks per query per session.

In [5]:
# Clicks per query = clicks / queries (the previous version had the ratio inverted).
# Sessions whose numClicks is NaN (no click rows joined) stay NaN.
searchFeat['numClicksPerQuery'] = searchFeat['numClicks'] / searchFeat['numQueries']

# Average Click Position

In [6]:
# Keep only click rows and make the 'click' column numeric.
allSessionsC = allSessions[allSessions['type']=='C']
allSessionsC = allSessionsC.astype({'click': 'int32'})

# Select the column BEFORE aggregating: averaging the whole frame first wastes work
# and raises on non-numeric columns in recent pandas versions.
avgC = allSessionsC.groupby('sID')['click'].mean().to_frame(name='meanClickPosition')

searchFeat = searchFeat.join(avgC)

# Distance Between Queries

In the following block of code we measure the Levenshtein distance between consecutive queries in a session.




In [7]:
# For every row, record the Levenshtein distance between a query and the previous
# query of the same session. The first query of a session gets 0 and non-query
# rows get the placeholder -1 (they are filtered out before aggregating).
distances = []
for session in sessions:
    sessionDistances = []
    # None (not "") marks "no previous query yet", so a genuinely empty query
    # string does not reset the chain.
    previousQuery = None
    for row in session:
        if row[5] != 'Q':
            sessionDistances.append(-1)
            continue
        if previousQuery is None:
            sessionDistances.append(0)
        else:
            sessionDistances.append(levenshtein_distance(previousQuery, row[1]))
        previousQuery = row[1]
    distances.append(sessionDistances)

# Flatten back to one value per row; relies on allSessions rows being in the same
# order as the concatenated sessions.
distancesFlat = [item for sublist in distances for item in sublist]
allSessions['queryDistance'] = distancesFlat

# Mean distance over the query rows of each session.
queryDistance = pd.DataFrame(data = allSessions[allSessions['type']=='Q'].groupby('sID')['queryDistance'].mean())
searchFeat = searchFeat.join(queryDistance)

# Time Between Queries

In the following block of code we measure the time between each query.

In [8]:
# For every row, record the time elapsed since the previous query of the same
# session. The first query of a session gets 0.0 and non-query rows get the
# placeholder -1.
timeQueries = []
for session in sessions:
    sessionGaps = []
    previousTime = None  # no query seen yet in this session
    for row in session:
        if row[5] != 'Q':
            sessionGaps.append(-1)
            continue
        if previousTime is None:
            sessionGaps.append(0.0)
        else:
            sessionGaps.append(float(row[2]) - float(previousTime))
        previousTime = row[2]
    timeQueries.append(sessionGaps)

# Flatten back to one value per row; relies on allSessions rows being in the same
# order as the concatenated sessions.
timeQueriesFlat = [item for sublist in timeQueries for item in sublist]
allSessions['timeQueries'] = timeQueriesFlat

# Aggregate over QUERY rows: click rows only hold the -1 placeholder, so averaging
# them (as the previous version did) always produced -1.
timeQueries = pd.DataFrame(data = allSessions[allSessions['type']=='Q'].groupby('sID')['timeQueries'].mean())
searchFeat = searchFeat.join(timeQueries)

# Number of Unique Queries and If All Queries Are The Same

The following block of code counts the number of unique queries and determines whether all queries in a session are the same by comparing the size of the set of queries with the total number of queries. If a session contains more than one query but only one unique query, every query in that session is identical and the session is marked as such.

In [9]:
# Per session: number of distinct query strings, plus a flag that is
#   1  when there are several queries and they are all identical,
#  -1  when there is exactly one query,
#   0  otherwise.
uniqueQueries = []
allSameQueries = []

for session in sessions:
    queryTexts = [row[1] for row in session if row[5] == 'Q']
    distinctCount = len(set(queryTexts))

    if distinctCount != 1:
        sameFlag = 0
    elif len(queryTexts) > 1:
        sameFlag = 1
    else:
        sameFlag = -1

    uniqueQueries.append(distinctCount)
    allSameQueries.append(sameFlag)

# Assigned positionally: one value per session, in the order of `sessions`.
searchFeat['uniqueQueries'] = uniqueQueries
searchFeat['allSameQueries'] = allSameQueries

# Number of Repeat Queries

In the following block of code we count the number of repeated queries, not including the first instance of the query.

In [10]:
# Flag every row: 1 when a query row repeats the text of an earlier query row in
# the same session, 0 otherwise (including all non-query rows).
repeatQueries = []

for session in sessions:
    queryRows = [row for row in session if row[5] == 'Q']
    flags = []

    for row in session:
        if row[5] != 'Q':
            flags.append(0)
            continue

        # Scan the session's queries in order; the first row with the same text
        # decides: if it is this very row it is a first occurrence, else a repeat.
        for candidate in queryRows:
            if row == candidate:
                flags.append(0)
                break
            if row[1] == candidate[1]:
                flags.append(1)
                break

    repeatQueries.append(flags)

repeatQueriesFlat = [flag for flags in repeatQueries for flag in flags]
allSessions['repeatQueries'] = repeatQueriesFlat

# Number of repeated queries per session.
queryRowsOnly = allSessions[allSessions['type']=='Q']
repeatQueries = pd.DataFrame(data = queryRowsOnly.groupby('sID')['repeatQueries'].sum())
searchFeat = searchFeat.join(repeatQueries)

# Number of Unique Clicks and If All Clicks Are The Same

The following block of code counts the number of unique clicks and determines whether all clicks in a session are the same by comparing the size of the set of clicks with the total number of clicks. If a session contains more than one click but only one unique click, every click in that session is identical and the session is marked as such.

In [11]:
# Per session: number of distinct clicked values (row field 4), plus a flag that is
#   1  when there are several clicks and they are all identical,
#  -1  when there is exactly one click,
#   0  otherwise.
uniqueClicks = []
allSameClicks = []

for session in sessions:
    clickedItems = [row[4] for row in session if row[5] == 'C']
    distinctCount = len(set(clickedItems))

    if distinctCount != 1:
        sameFlag = 0
    elif len(clickedItems) > 1:
        sameFlag = 1
    else:
        sameFlag = -1

    uniqueClicks.append(distinctCount)
    allSameClicks.append(sameFlag)

# Assigned positionally: one value per session, in the order of `sessions`.
searchFeat['uniqueClicks'] = uniqueClicks
searchFeat['allSameClicks'] = allSameClicks

# Number of Repeat Clicks

In the following block of code we count the number of repeated clicks, not including the first instance of the click.

In [12]:
# Flag every row: 1 when a click row repeats the clicked value (field 4) of an
# earlier click row in the same session, 0 otherwise (including non-click rows).
repeatClicks = []

for session in sessions:
    clickRows = [row for row in session if row[5] == 'C']
    flags = []

    for entry in session:
        if entry[5] != 'C':
            flags.append(0)
            continue

        # The first click row with the same value decides: if it is this very row
        # it is a first occurrence, otherwise it is a repeat.
        for candidate in clickRows:
            if entry == candidate:
                flags.append(0)
                break
            if entry[4] == candidate[4]:
                flags.append(1)
                break

    repeatClicks.append(flags)

repeatClicks = [flag for flags in repeatClicks for flag in flags]
allSessions['repeatClicks'] = repeatClicks

# Number of repeated clicks per session.
clickRowsOnly = allSessions[allSessions['type']=='C']
repeatClicks = pd.DataFrame(data = clickRowsOnly.groupby('sID')['repeatClicks'].sum())
searchFeat = searchFeat.join(repeatClicks)

# Time Between Clicks

The following block of code is intended to calculate the time between clicks. However, due to the nature of the AOL query logs (which give the clicks following a query the same timestamp as the query itself), the code instead calculates the time between groups of clicks separated by queries.

In [13]:
# Mean time gap between click groups, one value per session.
# Every row produces one entry in `distance`; -1 marks entries that carry no
# measurement and is stripped out before averaging.
timeClicks = []
for session in sessions:
    distance = []
    currentTime = -1   # -1 means no click has been seen yet in this session
    check = False      # True once a gap has been recorded for the current run of clicks
    for query in session:
        if query[5] == 'C':
            if currentTime ==- 1:
                # First click of the session: remember its time, record a gap of 0.0.
                # Note that `check` stays False here, so the next click still records a gap.
                currentTime = query[2]
                distance.append(0.0)
            elif check == True:
                # A gap was already recorded for this run of clicks: skip this row.
                distance.append(-1)
            else:
                # First measured click since the last non-click row: record the
                # time elapsed since the previously remembered click.
                distance.append(float(query[2])-float(currentTime))
                currentTime = query[2]
                check = True
        else:
            # Non-click row: no measurement, and it ends the current run of clicks.
            distance.append(-1)
            check = False
    distance = np.array(distance)
    # Drop the -1 placeholders so only real measurements are averaged.
    distance = distance[distance != -1]
    
    # NOTE(review): a session without clicks leaves an empty array here;
    # presumably mean() then yields NaN - confirm this is the intended value.
    timeClicks.append(distance.mean())

# Assigned positionally: assumes one entry per searchFeat row, in the order of
# `sessions` - TODO confirm.
searchFeat['timeClicks'] = timeClicks

# Distance Between Query and Click URL

The following block of code measures the character-based Levenshtein distance between the search query and each clicked URL.

In [14]:
# For every click row, record the Levenshtein distance between the clicked value
# (field 4) and the query text (field 1); non-click rows get the placeholder -1.
clickDistance = []
for session in sessions:
    sessionDistances = [
        levenshtein_distance(row[4], row[1]) if row[5] == 'C' else -1
        for row in session
    ]
    clickDistance.append(sessionDistances)

# Flatten back to one value per row; relies on allSessions rows being in the same
# order as the concatenated sessions.
clickDistanceFlat = [item for sublist in clickDistance for item in sublist]
allSessions['clickDistance'] = clickDistanceFlat

# Average the distances per session. The previous version used count(), which just
# reproduced the number of clicks instead of a distance.
clickDistance = pd.DataFrame(data = allSessions[allSessions['type']=='C'].groupby('sID')['clickDistance'].mean())
searchFeat = searchFeat.join(clickDistance)

# Extract Features for SQS

Extract all of the previously described features from the SQS data set.

In [15]:
# Every SQS row receives the same constant value for each search feature; the
# columns are added in the same order as the SWC feature table.
sqsFeatureDefaults = {
    'numQueries': 1,
    'numClicks': -1,
    'numClicksPerQuery': -1,
    'meanClickPosition': -1,
    'queryDistance': -1,
    'timeQueries': -1,
    'uniqueQueries': 1,
    'allSameQueries': 0,
    'repeatQueries': -1,
    'uniqueClicks': -1,
    'allSameClicks': -1,
    'repeatClicks': -1,
    'timeClicks': -1,
    'clickDistance': -1,
}

# Add the constant columns, then index by session ID like the SWC feature table.
allSessionsSQS = allSessionsSQS.assign(**sqsFeatureDefaults).set_index('sID')

# Return Feature Set

Returns dataframes of search features extracted from both SWC and SQS.

In [16]:
# Persist both feature tables. The context managers guarantee the files are
# flushed and closed even if pickling fails part-way.
with open("Pickles/SearchFeatSQS.p", "wb") as sqsOut:
    pickle.dump(allSessionsSQS, sqsOut)

with open("Pickles/SearchFeatSWC.p", "wb") as swcOut:
    pickle.dump(searchFeat, swcOut)