# Frequency Counts for Thematic Lexemes

## Packages

In [19]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from tabulate import tabulate

# English stop words from the NLTK stopwords corpus, stored as a set
# so the membership tests done later in the notebook are fast
stopWords = set(stopwords.words("english"))
# one shared Porter stemmer instance, reused by the counting function below
stemmer = PorterStemmer()

## Reading in Files

In [2]:
# # checking path
# print(os.listdir())

In [3]:
# input files, given relative to the current working directory:
# the scene text to analyse ...
textPath = "Tension_OpeningScene.txt"
# ... and the whitespace-separated list of thematic lexemes to look for
lexemesPath = "thematiclexemes.txt"

In [4]:
# loading both input files
with open(lexemesPath) as lexemeFile:
    # splitting on any whitespace gives one list entry per lexeme
    rawLexemes = lexemeFile.read()
    lexemes = rawLexemes.split()
with open(textPath) as textFile:
    # the scene is kept as a single string
    text = textFile.read()

## Frequency Counts

In [15]:
def lexemeCounter(lexemeList, text):
    '''
    counts how often each lexeme occurs in a text, matching on stems

    the text is tokenized with word_tokenize and every token is reduced to its
    stem by the module-level stemmer; a lexeme's count is the number of tokens
    whose stem equals the stem of the lexeme

    lexemeList: iterable of lexeme strings to look for
    text: string of text to search

    returns a dictionary where each key is a lexeme from lexemeList (in the
    order given) and each value is its frequency count, 0 if it never occurs
    '''
    # local import so the function does not depend on a file-level import
    from collections import Counter

    # tokenizing the text and tallying the stem of every token in one pass,
    # instead of rescanning the whole token list once per lexeme
    stemCounts = Counter(stemmer.stem(token) for token in word_tokenize(text))

    # looking up each lexeme's stem in the tally; this iterates over the
    # lexemeList argument (the previous version read the global lexemes list
    # and ignored the argument). a Counter returns 0 for stems it never saw
    return {lexeme: stemCounts[stemmer.stem(lexeme)] for lexeme in lexemeList}

In [16]:
# column titles for the printed table
headers = ["Lexeme", "Count"]
# frequency count for every thematic lexeme in the scene text
cntDict = lexemeCounter(lexemes, text)
# one table row per (lexeme, count) pair
print(tabulate(cntDict.items(), headers=headers))

Lexeme         Count
-----------  -------
edge               1
drugs              3
hold               1
roar               1
bats               3
diving             1
voice              1
animals            1
hell               1
bastard            1
narcotics          1
grass              1
mescaline          1
acid               1
cocaine            1
uppers             1
downers            1
screamers          1
laughers           1
ether              5
amyls              1
frenzy             1
depths             1
binge              1
snort              1
stupor             1
consumption        1
blood              1
brain              1
screaming          1
yelling            1
muttered           1
moaning            1
grappling          1
drag               1
swooping           1
screeching         1
lightheaded        1
terrible           1
huge               2
goddamn            2
poor               2
ill                1
dangerous          1
raw                1
helpless     

## Removing Stop Words

In [20]:
# splitting the scene text into tokens
tokenizedText = word_tokenize(text)

# keeping only tokens whose lowercased form is not a stop word,
# then rejoining the survivors with single spaces
newText = " ".join(
    token for token in tokenizedText if token.lower() not in stopWords
)

# writing the filtered text out
with open('SceneContent.txt', 'w') as outFile:
    outFile.write(newText)