# The Librarian, The Computer, The Android, and Big Data

by Nichole Nomura and Quinn Dombrowski

This Jupyter notebook contains the code used to search the *Star Trek* book corpus as described in the paper.

## Import libraries

In [None]:
#Used for navigating directories
import os
#Used for identifying words
from nltk import tokenize
from nltk.tokenize import word_tokenize
#Used for identifying sentences
from nltk.tokenize import sent_tokenize
#Used for regular expressions
import re

## Navigate to directory with the corpus

In [None]:
#Define directory with the corpus
directory = '/Users/qad/Documents/dhtrek'
#Change to the directory, so the bare file names in `files` can be opened directly
os.chdir(directory)
#List all files in the directory; os.listdir() returns names in arbitrary order,
#so sort them to keep the row order of every output file reproducible between runs
files = sorted(os.listdir(directory))

## Counting question words

In [None]:
#Question words to look for, in the order their rows are written out
question_words = ('why', 'who', 'what', 'when', 'where', 'how')
#Create output file
with open('/Users/qad/Documents/dhtrek-computer-questionwords.tsv', 'w') as out:
    #Header row: source file, matched question word, lower-cased sentence
    out.write('file\tword\tsentence\n')
    #Iterate through files
    for file in files:
        #Skip anything that is not a .txt file
        if not file.endswith('.txt'):
            continue
        #Open the input file
        with open(file, 'r') as infile:
            #Print the file name to show progress
            print(file)
            #Read the whole book into memory
            text = infile.read()
        #Walk through the book one sentence at a time
        for sentence in tokenize.sent_tokenize(text):
            #Only sentences containing 'Computer' (capitalized) are of interest
            if 'Computer' not in sentence:
                continue
            #Lower-case the sentence and flatten line breaks so each row stays on one line
            flattened = sentence.lower().replace('\n', ' ')
            #Split the sentence into words
            words = word_tokenize(flattened)
            #Write one row per question word that occurs as a token in the sentence
            for question_word in question_words:
                if question_word in words:
                    out.write(file + '\t' + question_word + '\t' + flattened + '\n')

## Finding the commands

In [None]:
#Pattern for an address interrupted by quote attribution
#(e.g. Computer,” she said, “locate ...); captures the first word of the resumed quote
attributed_pattern = re.compile(r'Computer,”[A-Za-z ]*, “(\w+)')
#Fallback pattern: captures the word directly following 'Computer, '
direct_pattern = re.compile(r'Computer, (\w+)')
#Opens the output file; the encoding is explicit because the text (and the
#pattern above) contains curly quotes, so decoding must not depend on the locale
with open('/Users/qad/Documents/dhtrek-computer-verbs.tsv', 'w', encoding='utf-8') as out:
    #Writes the header row
    out.write('file' + '\t' + 'word' + '\t' + 'sentence' + '\n')
    #For each file
    for file in files:
        #Skip anything that is not a text file
        if not file.endswith('.txt'):
            continue
        #Open and read the text file
        with open(file, 'r', encoding='utf-8') as inputfile:
            text = inputfile.read()
        #Split the file into sentences and check each one
        for sentence in tokenize.sent_tokenize(text):
            #Only sentences that include 'Computer' are of interest
            if 'Computer' not in sentence:
                continue
            #Replace new lines so the sentence fits on one row
            sentence = sentence.replace('\n', ' ')
            #Try the quote-attribution pattern first, then fall back to the plain one
            #(match objects are always truthy, so `or` picks the first successful search)
            compquote = attributed_pattern.search(sentence) or direct_pattern.search(sentence)
            #If either pattern matched...
            if compquote is not None:
                #Write out the file, the lower-cased word, and the sentence
                out.write(file + '\t' + compquote.group(1).lower() + '\t' + sentence + '\n')


## Contextual window

In [None]:
#Pattern for an address interrupted by quote attribution
#(e.g. Computer,” she said, “locate ...); captures the first word of the resumed quote
attributed_pattern = re.compile(r'Computer,”[A-Za-z ]*, “(\w+)')
#Fallback pattern: captures the word directly following 'Computer, '
direct_pattern = re.compile(r'Computer, (\w+)')
#Open the output file; the encoding is explicit because the text (and the
#pattern above) contains curly quotes, so decoding must not depend on the locale
with open('/Users/qad/Documents/dhtrek-computer-verbs-context.tsv', 'w', encoding='utf-8') as out:
    #Write the header row
    out.write('file' + '\t' + 'word' + '\t' + 'sentence' + '\t' + 'context' + '\n')
    #For each file
    for file in files:
        #Only use text files
        if not file.endswith('.txt'):
            continue
        #Open and read the text file
        with open(file, 'r', encoding='utf-8') as inputfile:
            text = inputfile.read()
        #Split it into sentences
        sentences = tokenize.sent_tokenize(text)
        #For each sentence, getting the index of the sentence
        for i, sentence in enumerate(sentences):
            #Only sentences that include 'Computer' are of interest
            if 'Computer' not in sentence:
                continue
            #Strip newlines before searching, so a line break after 'Computer,'
            #does not hide a match (same treatment as in the verbs-only search)
            sentence = sentence.replace('\n', ' ')
            #Try the quote-attribution pattern first, then fall back to the plain one
            compquote = attributed_pattern.search(sentence) or direct_pattern.search(sentence)
            #Nothing to write if no word follows the address
            if compquote is None:
                continue
            #Get 3 sentences before and 3 after; the slice end is exclusive,
            #so i+4 is needed to include sentence i+3
            contexts = sentences[max(i-3, 0):i+4]
            #Combine it all together and strip newlines
            output = ' '.join(contexts).replace('\n', ' ')
            #Write all four columns named in the header: file, word, sentence, context
            out.write(file + '\t' + compquote.group(1).lower() + '\t' + sentence + '\t' + output + '\n')


## Commands given to Data

In [None]:
#Pattern for an address interrupted by quote attribution
#(e.g. Data,” she said, “locate ...); captures the first word of the resumed quote
attributed_pattern = re.compile(r'Data,”[A-Za-z ]*, “(\w+)')
#Fallback pattern: captures the word directly following 'Data, '
direct_pattern = re.compile(r'Data, (\w+)')
#Open the output file; the encoding is explicit because the text (and the
#pattern above) contains curly quotes, so decoding must not depend on the locale
with open('/Users/qad/Documents/dhtrek-data-verbs-context.tsv', 'w', encoding='utf-8') as out:
    #Write the header row
    out.write('file' + '\t' + 'word' + '\t' + 'sentence' + '\t' + 'context' + '\n')
    #For each file
    for file in files:
        #Skip anything that is not a text file
        if not file.endswith('.txt'):
            continue
        #Open and read the file
        with open(file, 'r', encoding='utf-8') as inputfile:
            text = inputfile.read()
        #Split the file into sentences
        sentences = tokenize.sent_tokenize(text)
        #For each sentence, keeping track of its index
        for i, sentence in enumerate(sentences):
            #Only sentences that include 'Data' are of interest
            if 'Data' not in sentence:
                continue
            #Replace newlines before searching, so a line break after 'Data,'
            #does not hide a match
            sentence = sentence.replace('\n', ' ')
            #Try the quote-attribution pattern first, then fall back to the plain one
            dataquote = attributed_pattern.search(sentence) or direct_pattern.search(sentence)
            #Nothing to write if no word follows the address
            if dataquote is None:
                continue
            #Get 3 sentences before and 3 after; the slice end is exclusive,
            #so i+4 is needed to include sentence i+3
            contexts = sentences[max(i-3, 0):i+4]
            #Join the context sentences and replace newlines
            output = ' '.join(contexts).replace('\n', ' ')
            #Write all four columns named in the header: file, word, sentence, context
            out.write(file + '\t' + dataquote.group(1).lower() + '\t' + sentence + '\t' + output + '\n')


## The computer

In [None]:
#Open output file; the encoding is explicit so reading and writing the corpus
#text does not depend on the locale
with open('/Users/qad/Documents/dhtrek-the-computer-context.tsv', 'w', encoding='utf-8') as out:
    #Write header row
    out.write('file' + '\t' + 'sentences' + '\n')
    #For each file
    for file in files:
        #Skip anything that is not a text file
        if not file.endswith('.txt'):
            continue
        #Open and read the file
        with open(file, 'r', encoding='utf-8') as inputfile:
            text = inputfile.read()
        #Split it into sentences
        sentences = tokenize.sent_tokenize(text)
        #For each sentence, keeping track of the index
        for i, sentence in enumerate(sentences):
            #Case-insensitive check for 'the computer'
            if 'the computer' not in sentence.lower():
                continue
            #Get 3 sentences before and 3 after (in their original casing);
            #the slice end is exclusive, so i+4 is needed to include sentence i+3
            contexts = sentences[max(i-3, 0):i+4]
            #Combine the context and replace the newlines
            output = ' '.join(contexts).replace('\n', ' ')
            #Write it to the output file
            out.write(file + '\t' + output + '\n')

## Computer, Librarian, Archive

In [None]:
#Open the output file
with open('/Users/qad/Documents/dhtrek-the-computer-librarian-archive.tsv', 'w') as out:
    #Header row: file name plus one y/n column per search phrase
    out.write('file\tthecomputer\tlibrarian\tarchive\n')
    #For each file...
    for file in files:
        #Skip anything that is not a text file
        if not file.endswith('.txt'):
            continue
        #Open and read the file
        with open(file, 'r') as inputfile:
            text = inputfile.read()
        #Split the book into sentences and lower-case each one
        lowered = [sentence.lower() for sentence in tokenize.sent_tokenize(text)]
        #'y' if any sentence includes 'the computer', otherwise 'n'
        compstatus = 'y' if any('the computer' in s for s in lowered) else 'n'
        #'y' if any sentence includes 'librarian', otherwise 'n'
        librarianstatus = 'y' if any('librarian' in s for s in lowered) else 'n'
        #'y' if any sentence includes 'the archive', otherwise 'n'
        archivestatus = 'y' if any('the archive' in s for s in lowered) else 'n'
        #Write out the result for that book
        out.write('\t'.join([file, compstatus, librarianstatus, archivestatus]) + '\n')