# Notebook to Write Qualtrics Surveys

This notebook writes the Qualtrics survey forms and then reads the results.

In [None]:
import pandas as pd
import numpy as np
import search_engine as ss
import re
import csv
import json

In [None]:
# Load up the qsf!
# The template is a JSON document, so decode it as UTF-8 explicitly instead of
# relying on the platform's default text encoding.
with open('Query_Template.qsf', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# make function to write survey file

def new_survey_file(choices_dict:dict,file_name:str,template_name:str='Query_Template.qsf')->None:
    '''Writes a survey file with the given choices.

    Every multiple-choice (MC) question in the template is rewritten to ask about
    one topic from ``choices_dict`` and to offer that topic's quotes as choices.
    Topics are consumed from the end of the dict's key order, so the first MC
    question in the template receives the last topic. Every rank-order (RO)
    question only has its text replaced with a fixed prompt.

    Input:
        choices_dict: dict, maps each query string to its list of search result choices
        file_name: str, name of the file to save
        template_name: str, name of the query template

    Output:
        None, a file is saved

    Raises:
        IndexError: if the template holds more MC questions than there are topics
    '''
    # open the template (JSON, so read it as UTF-8 regardless of platform default)
    with open(template_name, 'r', encoding='utf-8') as file:
        data = json.load(file)
    remaining_topics = list(choices_dict.keys())

    # iterate through the survey to get to the questions
    for element in data['SurveyElements']:
        payload = element.get('Payload')
        # elements without a dict payload, or without a question in it, are left untouched
        if not isinstance(payload, dict) or 'QuestionType' not in payload:
            continue

        question_type = payload['QuestionType']
        if question_type == 'MC':
            # MC: the respondent marks which choices are relevant to the query
            if not remaining_topics:
                raise IndexError('template has more MC questions than choices_dict has topics')
            topic = remaining_topics.pop()
            prompt = f'Select whether or not the following quotes are similar to the query "{topic}"'
            # choices are keyed "1", "2", ... in the order they were supplied
            numbered_choices = {
                str(position): {'Display': quote}
                for position, quote in enumerate(choices_dict[topic], start=1)
            }
            payload['QuestionDescription'] = prompt
            element['SecondaryAttribute'] = prompt
            payload['QuestionText'] = prompt
            payload['Choices'] = numbered_choices
            payload['ChoiceOrder'] = list(numbered_choices.keys())
        elif question_type == 'RO':
            # RO: the respondent ranks the selected results; the prompt does not depend on the topic
            prompt = 'Rank these quotes on how funny they are!'
            element['SecondaryAttribute'] = prompt
            payload['QuestionText'] = prompt
            payload['QuestionDescription'] = prompt

    # save
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file)

### Load up examples

Now that I have the function written, I need to load up the search engine.

In [None]:
# load up that search
complete = pd.read_csv('https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv')
# replace the header read from the file with the names used in this notebook
complete.columns = [
    'index', 'character', 'quote', 'scene', 'location',
    'view', 'episode', 'date', 'series', 'file',
]
# collapse every run of whitespace inside a quote into a single space
complete['quote'] = complete['quote'].map(lambda raw_quote: " ".join(raw_quote.split()))
# rows without a speaker get the literal placeholder 'NA'
complete['character'] = complete['character'].fillna('NA')

# create instance of the search engine
test_engine = ss.search_engine(name='BM25 Engine', full_data=complete)
# load data in bulk: a {row label: quote text} mapping
test_engine.bulk_load(complete['quote'].to_dict())

We added 144211 documents. The engine now has 144211 documents.


In [None]:
# sample results
# Runs one query for 'candle' against the engine and keeps the return value in `results`.
# NOTE(review): 20 and True are passed positionally to bw_search — presumably a result
# limit and an option flag; confirm against the search_engine module.
results=test_engine.bw_search('candle',20,True)

In [None]:
# set keywords
# the ten search terms used for the first survey
keywords_1 = [
    'candle',
    'dream',
    'doctor',
    'bajor',
    'prophets',
    'inequality',
    'struggle',
    'engineer',
    'prune',
    'humanity',
]

In [None]:
# accumulators: one entry per search hit, across all keywords
indices, choices, search_term = [], [], []
bm25, bm25_prev, bm25_next = [], [], []  # new
# keyword -> display strings for that keyword (the input shape new_survey_file expects)
file_input = {}

for term in keywords_1:
    # do the search
    results = test_engine.bw_search(term, 20, True)
    search_results = test_engine.old_pretty_print([hit[0] for hit in results])
    # per-keyword values: hit ids, joined display text, and the keyword repeated per hit
    t_ind = [hit[0] for hit in results]
    t_choices = [' '.join(parts) for parts in search_results]
    t_search = [term] * len(t_ind)
    # append to the running lists
    indices.extend(t_ind)
    choices.extend(t_choices)
    search_term.extend(t_search)
    bm25.extend([hit[1] for hit in results])
    bm25_prev.extend([hit[2] for hit in results])
    bm25_next.extend([hit[3] for hit in results])
    # update the dict
    file_input[term] = t_choices

# don't need to save now
# new_survey_file(choices_dict=file_input,file_name='Evaluation/day_1.qsf',template_name='Query_Template.qsf')
pd.DataFrame(
    {
        'indices': indices,
        'choices': choices,
        'search': search_term,
        'bm25': bm25,
        'prev_bm25': bm25_prev,
        'next_bm25': bm25_next,
    }
).to_csv('Evaluation/day_1_data.csv')

In [None]:
# preview data frame
# rebuilt from the accumulator lists so the cell's last expression is displayed
pd.DataFrame(
    {
        'indices': indices,
        'choices': choices,
        'search': search_term,
        'bm25': bm25,
        'prev_bm25': bm25_prev,
        'next_bm25': bm25_next,
    }
)

Unnamed: 0,indices,choices,search,bm25,prev_bm25,next_bm25
0,44542,JOSEPH: He stole a candle. O'BRIEN: One candle...,candle,14.147200,13.037027,0.0000
1,44541,O'BRIEN: What did he do to deserve this? JOSEP...,candle,13.037027,0.000000,14.1472
2,112905,TROI: Many things improve with age... maybe yo...,candle,11.491176,0.000000,0.0000
3,88378,"BEVERLY: Computer, secure door. BEVERLY: I lit...",candle,11.378741,0.000000,0.0000
4,88194,QUINT: There's a lot of things she didn't tell...,candle,11.054260,0.000000,0.0000
...,...,...,...,...,...,...
172,108760,"Q: After our last encounter, I was asked to le...",humanity,6.681643,0.000000,0.0000
173,79522,"""Q"" : You show promise, my good fellow. PICARD...",humanity,6.638868,0.000000,0.0000
174,119412,GEORDI: I'll be honest with you. We don't know...,humanity,6.354117,0.000000,0.0000
175,140566,TROI: Feelings aren't positive or negative Dat...,humanity,5.952888,0.000000,0.0000


In [None]:
# set all of the keywords
# Ten lists of 15 search terms each, one list per survey day.
# Note: this rebinds keywords_1 (previously 10 terms) to a 15-term list, so any cell that
# writes Evaluation/day_1_data.csv after this point overwrites the earlier 10-term file.
# Several terms are multi-word phrases, and some contain an apostrophe or a hyphen
# ("bat'leth", 'oo-mox'), which matters to any code that parses them out of quoted text.
keywords_1 =  ['candle','dream','doctor','bajor','prophets','inequality','struggle','engineer','prune','humanity',
               'story', 'religion', 'child', 'molly','counselor']
keywords_2 =  ['archaeology','chateau','trial','fiction','romance','wedding','father','kahless','chocolate','future',
               'bridge','saucer','coffee','romulans','colony']
keywords_3 =  ['raktajino','commit no errors','tanagra','borg','cardassia','poetry','spot','assimilate','unusual','barclay',
               'acquisition','rules','vedek','warp crystal',"bat'leth"]
keywords_4 =  ['blood wine','gagh','cook','celebrate','impossible','earl grey','jake','geordi','riker','poker',
               'phaser','occupation','conscious minds','tailor','neutral zone']
keywords_5 =  ['men','women','miles','worf','bashir','commander','captain','kira','troi','guinan',
               'shields','unification','mister spock','tribble','android']
keywords_6 =  ['unraveled','friendship','hailing frequencies','universe','traveler','wesley','good tea','good book','make it so','flute',
               'neutrino levels','noonian soong','breen','evidence','vacation']
keywords_7 =  ['memory','merry man','obedient','honor','san francisco','new orleans','liberation','imagination','maquis','tribunal',
               'trill','lore','risa','detective','ezri']
keywords_8 =  ['not picard','ice cream','morn','weyoun','dukat','civilization','replicators','truth','lunch','past',
               'empath','betray','warp bubble','jadzia','pregnant']
keywords_9 =  ['vineyard','lwaxana','bucket','justice is justice','espionage','same lie','mother','klingon','baseball','moriarty',
               'vulcan','moon','lower decks','the academy','full impulse']
keywords_10 =  ['enterprise','ferengi','nagus','death','root beer','alpha quadrant','wormhole','tasha','holosuite','holmes',
                'the defiant','ensign ro','dabo girl','secret agent','oo-mox']

In [None]:
# create a separate data file (and, when enabled, survey file) for each survey day

keywords = {
    1: keywords_1,
    2: keywords_2,
    3: keywords_3,
    4: keywords_4,
    5: keywords_5,
    6: keywords_6,
    7: keywords_7,
    8: keywords_8,
    9: keywords_9,
    10: keywords_10,
}

for x, t_keys in keywords.items():
    # day number as text, used in the output file name
    counter = str(x)

    # fresh accumulators for this day: one entry per search hit
    indices, choices, search_term = [], [], []
    bm25, bm25_prev, bm25_next = [], [], []  # new
    file_input = {}

    for term in t_keys:
        # do the search
        results = test_engine.bw_search(term, 20, True)
        search_results = test_engine.old_pretty_print([hit[0] for hit in results])
        # per-keyword values: hit ids, joined display text, and the keyword repeated per hit
        t_ind = [hit[0] for hit in results]
        t_choices = [' '.join(parts) for parts in search_results]
        #t_choices = [test_engine.pretty_print(x[0]) for x in results]
        t_search = [term] * len(t_ind)
        # append to the running lists
        indices.extend(t_ind)
        choices.extend(t_choices)
        search_term.extend(t_search)
        bm25.extend([hit[1] for hit in results])
        bm25_prev.extend([hit[2] for hit in results])
        bm25_next.extend([hit[3] for hit in results])
        # update the dict
        file_input[term] = t_choices

    #new_survey_file(choices_dict=file_input,file_name=f'Evaluation/day_{counter}.qsf',template_name='Query_Template.qsf')
    pd.DataFrame(
        {
            'indices': indices,
            'choices': choices,
            'search': search_term,
            'bm25': bm25,
            'prev_bm25': bm25_prev,
            'next_bm25': bm25_next,
        }
    ).to_csv(f'Evaluation/day_{counter}_data.csv')

# Upload Results

Now that the surveys have been filled out, we can read in the data.


In [None]:
def _parse_ranking_header(header:str,known_queries:set)->tuple:
    '''Split one ranking-column header into its (query, quote) pair.

    The query is the first single-quoted span in the header and the quote is the
    text starting two characters after the first hyphen that follows the query.
    Two cases that the naive "first quoted span / first hyphen" rule gets wrong
    are handled:

    * a query containing an apostrophe (e.g. "bat'leth"): when the first quoted
      span is not a known query, the header is searched for a known query wrapped
      in single quotes and that one is used instead;
    * a query containing a hyphen (e.g. 'oo-mox'): the separating hyphen is looked
      for after the query, not from the start of the header.

    Input:
        header: str, the question text stored for one ranking column
        known_queries: set, the queries present in the questions file

    Output:
        tuple, (query, quote)

    Raises:
        IndexError: if the header contains no single-quoted span
        ValueError: if the header contains no hyphen
    '''
    first_span = re.search(r"'(.*?)'", header)
    if first_span is None:
        raise IndexError(f'no single-quoted query found in header: {header!r}')
    query, query_end = first_span.group(1), first_span.end()

    if query not in known_queries:
        # earliest occurrence wins; on a tie the longest known query wins
        located = [
            (header.find(f"'{known}'"), -len(known), known)
            for known in known_queries
            if isinstance(known, str) and f"'{known}'" in header
        ]
        if located:
            start, negative_length, query = min(located)
            query_end = start - negative_length + 2  # skip both quote marks

    hyphen_at = header.find('-', query_end)
    if hyphen_at == -1:
        # nothing after the query: fall back to the first hyphen anywhere
        hyphen_at = header.index('-')
    return query, header[hyphen_at + 2:]


def get_results_df(results_path:str='Evaluation/Results/day2_skyeler_only.tsv',questions_path:str='Evaluation/day_2_data.csv'):
    '''Read in Qualtrics Survey results.

    The ranking columns of the results file (those with an underscore in their
    name) are matched back to the rows of the questions file on the (query, quote)
    pair parsed from each column's question text.

    The results file must contain exactly three rows below its header; per ranking
    column they are taken as the question text, an id and the ranking given.
    NOTE(review): presumably two descriptive export rows plus a single response —
    confirm before using this with more than one respondent.

    Input:
        results_path: str, path to the qualtrics survey results; a path ending in
            '.tsv' is read as tab-separated, anything else as comma-separated
        questions_path: str, the path to the original questions csv

    Output:
        pd.DataFrame, the questions (indices, quote, query, bm25, prevbm25,
        nextbm25) plus a 'ranking' column; rows with no ranking get -1
    '''

    # start by reading files
    separator = '\t' if str(results_path).lower().endswith('.tsv') else ','
    results = pd.read_csv(results_path, sep=separator)
    questions_ids = pd.read_csv(questions_path,index_col=0)
    # format columns
    questions_ids.columns = ['indices', 'quote', 'query','bm25','prevbm25','nextbm25']
    # drop unneeded columns
    results = results.drop(columns=['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId', 'RecipientLastName', 'RecipientFirstName', 'RecipientEmail', 'ExternalReference', 'LocationLatitude', 'LocationLongitude', 'DistributionChannel', 'UserLanguage'])

    # columns with needed data
    ranking_cols = [col for col in results.columns if '_' in col]

    # one row per ranking column; the three file rows become three named columns
    col_results = results[ranking_cols].T
    col_results.columns = ['question','id','ranking']
    col_results = col_results.reset_index()

    known_queries = set(questions_ids['query'])
    parsed = [_parse_ranking_header(header, known_queries) for header in col_results['question']]
    col_results['query'] = [query for query, _ in parsed]
    col_results['quote'] = [quote for _, quote in parsed]

    result = questions_ids.merge(col_results[['query','quote','ranking']],on=['query','quote'],how='left')
    # if not ranked (meaning irrelevant), give score of -1; keep the column as generic
    # objects so the numeric sentinel can sit next to the rankings read from the file
    result['ranking'] = result['ranking'].astype(object).fillna(-1)
    return result

In [None]:
# get results
# one frame per survey day, built in day order 1..10; day 10's results file
# follows a different naming pattern from the first nine
(week1, week2, week3, week4, week5,
 week6, week7, week8, week9, week10) = [
    get_results_df(
        results_path=f'Evaluation/Results/{results_name}',
        questions_path=f'Evaluation/day_{day}_data.csv',
    )
    for day, results_name in enumerate(
        [
            'day1_skyeler_only.csv',
            'day2_skyeler_only.csv',
            'day3_skyeler_only.csv',
            'day4_skyeler_only.csv',
            'day5_skyeler_only.csv',
            'day6_skyeler_only.csv',
            'day7_skyeler_only.csv',
            'day8_skyeler_only.csv',
            'day9_skyeler_only.csv',
            'skyeler_day10.csv',
        ],
        start=1,
    )
]

In [None]:
# combine data
# stack the per-day frames, latest day first; each frame keeps its own row labels
combined_eval_data = pd.concat(
    [
        week10, week9, week8, week7, week6,
        week5, week4, week3, week2, week1,
    ]
)
# save data
# combined_eval_data.to_csv('skyeler_ranking_data.csv')

Unnamed: 0,indices,quote,query,bm25,prevbm25,nextbm25,ranking
0,113689,TIMICIN: No. We've... we've said our good-byes...,lwaxana,12.436612,0.000000,0.00000,-1
1,113558,LWAXANA: I agree. How about letting everybody ...,lwaxana,12.436612,0.000000,0.00000,-1
2,113532,LWAXANA: I'm not sure. A minute. An hour. \n T...,lwaxana,12.436612,0.000000,0.00000,-1
3,55180,"ODO: Marry me, Lwaxana... let me into your lig...",lwaxana,12.436612,8.857818,0.00000,-1
4,55060,ODO: I'm sorry if I made you feel unwelcome......,lwaxana,12.436612,0.000000,0.00000,-1
...,...,...,...,...,...,...,...
257,112533,TROI: Sometimes... even when a victim has deal...,counselor,8.975723,0.000000,4.63639,-1
258,109313,WORF: But what do the Pakleds want? \n RIKER: ...,counselor,8.975723,0.000000,0.00000,-1
259,108851,BORG: We have analyzed your defensive capabili...,counselor,8.975723,0.000000,0.00000,-1
260,105620,TROI: Where's Lieutenant Monroe? \n O'BRIEN: C...,counselor,8.975723,0.000000,0.00000,-1
