### Load the metadata for the corpus

I really only use the author name.

In [4]:
import glob, codecs, re

# Metadata lookups built from the pipe-delimited catalog file:
#   catalog:        file name -> [author, title]
#   author_catalog: author    -> list of [title, file name] pairs
catalog = {}
author_catalog = {}

# `with` closes the handle even if a row raises part-way through.
with codecs.open('SCIFI_pg_catalog_012615_FINAL.csv', 'r', encoding='utf-8') as f:
    for line in f.read().split('\n'):
        cols = line.strip().split('|')

        # Columns 1 (author), 2 (title) and 5 (file path) are read below, so a
        # row needs at least six fields.  Blank or short rows are skipped
        # instead of raising IndexError.
        if len(cols) < 6:
            continue

        author = cols[1]
        title = cols[2]

        # Keep only the last path component; a value without '/' is unchanged.
        file_name = cols[5].split('/')[-1]

        catalog[file_name] = [author, title]
        author_catalog.setdefault(author, []).append([title, file_name])

print('len(catalog) %d' % len(catalog))
print('len(author_catalog) %d' % len(author_catalog))

len(catalog) 1161
len(author_catalog) 482


### Load spacy

In [5]:
import spacy
# Load the spaCy model registered under the name 'en'; `nlp` is the callable
# that later cells apply to raw text to get tokens, tags, lemmas and sentences.
nlp = spacy.load('en')

# Record which spaCy version produced the results in this notebook.
print spacy.__version__

1.9.0


### Split the corpus into sentences

In [6]:
import glob, codecs, re

# Write one line per sentence in the form: file_name|author|title|sentence
# Both the output file and every input file are opened with `with`, so the
# handles are released even when parsing a document fails.
with codecs.open('SCIFI_sentences.csv', 'w', encoding='utf-8') as f:

    # glob returns paths in no particular order; sorting makes the order of
    # sentences in the output file reproducible from run to run.
    for n, path_to_file in enumerate(sorted(glob.glob('scifi/*.txt'))):

        if n % 100 == 0:
            print('processed %d' % n)

        file_name = path_to_file.split('/')[-1]
        # catalog maps file name -> [author, title]
        author, title = catalog[file_name]

        with codecs.open(path_to_file, 'r', encoding='utf-8') as text_file:
            # Collapse every whitespace run (including newlines) to one space,
            # so no sentence can span more than one output line.
            text = re.sub(r'\s+', ' ', text_file.read())

        doc = nlp(text)
        for s in doc.sents:
            f.write(file_name + '|' + author + '|' + title + '|' + s.text.strip() + '\n')

print('Done!')

processed 0
processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
Done!


### Gather up each author's sentences

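Note that for *Astounding Stories*, the author is always "Various", so every story from those issues is pooled under a single author name. Because the cliche detector counts how many distinct authors use an ngram, it treats all of those writers as one author and doesn't quite work correctly for them . . .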

In [7]:
import codecs, re

# author -> list of that author's sentences, in the order they appear in
# SCIFI_sentences.csv.  A sentence's position in its list is the index that
# later cells store to refer back to it.
author_text = {}

with codecs.open('SCIFI_sentences.csv', 'r', encoding='utf-8') as f:
    for line in f.read().split('\n'):

        # Split on the first three pipes only: the fourth field is free text
        # and may itself contain '|', which a plain split('|') would cut short.
        cols = line.strip().split('|', 3)

        # Blank or malformed rows are skipped instead of raising IndexError.
        if len(cols) < 4:
            continue

        file_name, author, title, text = cols

        author_text.setdefault(author, []).append(text)

print('Done!')

Done!


In [8]:
# Scratch check: slicing off the last four characters drops the "ness" suffix,
# leaving the stem ("velvetness" -> "velvet").
a = 'velvetness'
print a[:-4]

velvet


### Get the "author frequency" for the ngrams in the corpus

In [9]:
import re
from collections import defaultdict, Counter

def fix_key(k):
    """Normalise one lemma before it becomes half of an ngram key.

    Currently a pass-through: the key is returned exactly as given.  It is
    kept as a hook so that a normalisation rule can be switched on in one
    place.

    k -- the lower-cased lemma of a single token.
    Returns the (unchanged) key.
    """
    # Disabled suffix-stripping experiment, kept for reference:
    #   if len(k) > 3 and k.endswith('y'):
    #       k = k[:-1]
    #   elif k.endswith('ness'):
    #       k = k[:-4]
    return k

# ----------------------------------------------------------------

# ngram -> number of distinct authors who use it at least once
ngram_author_counts = defaultdict(int)

# ngram -> one [author, sentence index] entry per occurrence; the index is the
# sentence's position in author_text[author]
ngram_author_sentence_xref = defaultdict(list)

# Tokens that never belong in an ngram: "Chapter II" is not useful, and spacy
# thinks "*" is a noun.  This might evolve into a secondary set of stopwords.
unwanted_tokens = ('chapter', '*')

# First letters of the part-of-speech tags accepted for either word of a pair.
wanted_tag_initials = ('N', 'J')

for author in author_text:

    # Distinct ngrams seen for this author.  A set makes sure each ngram adds
    # one to the author count no matter how often the author repeats it.
    author_ngrams = set()

    for text_n, text in enumerate(author_text[author]):

        doc = nlp(text)

        # Walk every pair of adjacent tokens.
        for i in range(len(doc) - 1):
            first, second = doc[i], doc[i + 1]

            # startswith() gives the same answer as tag_[0] for non-empty tags
            # and simply rejects an empty tag instead of raising IndexError.
            if not (first.tag_.startswith(wanted_tag_initials)
                    and second.tag_.startswith(wanted_tag_initials)):
                continue

            if first.text.lower() in unwanted_tokens or second.text.lower() in unwanted_tokens:
                continue

            # Word order is preserved: "black velvet" and "velvet black" are
            # different keys.
            ngram = ' '.join([fix_key(first.lemma_.lower()),
                              fix_key(second.lemma_.lower())])

            author_ngrams.add(ngram)
            ngram_author_sentence_xref[ngram].append([author, text_n])

    for ngram in author_ngrams:
        ngram_author_counts[ngram] += 1

print('len(ngram_author_counts) %d' % len(ngram_author_counts))
print('Done!')

len(ngram_author_counts) 588671
Done!


In [10]:
#for k, v in ngram_author_counts.iteritems():
#    if 'velvet' in k and v > 1:
#        print k, v
        
# Start the listing on a fresh line.
print('')

# An ngram is listed when it contains "velvet" together with any of these.
companion_words = ['black', 'sky', 'shadow', 'dark', 'black', 'backdrop', 'night']

for ngram, n_authors in ngram_author_counts.iteritems():
    if 'velvet' not in ngram:
        continue
    # Substring match, so "velvety blackness" qualifies through "black".
    if any(word in ngram for word in companion_words):
        print('%s %d' % (ngram, n_authors))


velvet blackness 2
velvet sky 6
velvet shadow 4
velvety darkness 3
velvet backdrop 3
black velvet 33
velvet black 7
velvet night 5
velvety blackness 2
dusky velvet 1
velvet dark 4
velvety night 1


In [11]:
import pickle

# Cache the two ngram tables on disk so the slow counting pass does not have
# to be repeated.  The files are opened in binary mode, which is what pickle
# expects, and `with` closes them even if dump() raises.  The default pickle
# protocol is kept so the file contents are the same as before.
with open('TEMP_ngram_author_counts.pickle', 'wb') as f:
    pickle.dump(ngram_author_counts, f)

with open('TEMP_ngram_author_sentence_xref.pickle', 'wb') as f:
    pickle.dump(ngram_author_sentence_xref, f)

print('Done!')

Done!


In [12]:
import pickle

# Restore the cached ngram tables.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself.
# Files are opened in binary mode, which is what pickle expects; on POSIX this
# reads a pickle that was written in text mode byte-for-byte the same.
with open('TEMP_ngram_author_counts.pickle', 'rb') as f:
    ngram_author_counts = pickle.load(f)

with open('TEMP_ngram_author_sentence_xref.pickle', 'rb') as f:
    ngram_author_sentence_xref = pickle.load(f)

print('Done!')

Done!


### Word frequencies . . . 

In [13]:
import re
from collections import defaultdict, Counter

# word -> number of times it occurs anywhere in the corpus.  A "word" is a
# maximal run of a-z characters in the lower-cased sentence.
word_counts = defaultdict(int)

for sentences in author_text.itervalues():
    for sentence in sentences:
        for token in re.split('[^a-z]', sentence.lower()):
            # Splitting on every non-letter leaves empty strings between
            # adjacent separators; those are not words.
            if token:
                word_counts[token] += 1

# Each counted token added exactly one to its word, so the grand total is the
# sum of the per-word counts.
total_words = sum(word_counts.itervalues())

# word -> share of all tokens in the corpus
word_frequencies = dict(
    (word, float(count) / float(total_words))
    for word, count in word_counts.iteritems()
)

print('len(word_frequencies) %d' % len(word_frequencies))

len(word_frequencies) 105028


### Find the cliches

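Here, I define a cliche as an ngram which is used by more than 30 authors, and which is composed of low-frequency words.

I take the top 200 such ngrams, ranked by the corpus frequency of their rarer word (rarest first). Ngrams that pair "velvet" with a word such as "black", "sky" or "night" are always included, regardless of these limits.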

In [14]:
from collections import defaultdict, Counter

# Tunables for the selection below.
AUTHOR_THRESHOLD = 30   # an ngram must be used by MORE than this many authors
MAX_CLICHES = 200       # ordinary ngrams stop being added at this many selections

# Frequent ngrams that are boilerplate or proper names rather than cliches.
EXCLUDED_NGRAMS = ('amazing stories', 'end transcriber', 'extensive research',
                   'minor spelling', 'science fiction', 'typographical error',
                   'los angeles')

# A "velvet" ngram that also contains one of these words is always selected,
# whatever its author count and however many ngrams are already selected.
VELVET_COMPANIONS = ('black', 'sky', 'shadow', 'dark', 'backdrop', 'night')

# Each entry is [frequency of the rarer of the two words, ngram, author count].
weighted_cliches = []

# The list is fully sorted below and every ngram is unique, so the order in
# which the ngrams are visited here does not affect the result.
for ngram, n_authors in ngram_author_counts.iteritems():

    words = ngram.split(' ')

    # Word frequencies only cover runs of a-z characters, so an ngram whose
    # lemmas are missing from that table cannot be weighted and is skipped.
    if words[0] not in word_frequencies or words[1] not in word_frequencies:
        continue

    low_frequency = min(word_frequencies[words[0]], word_frequencies[words[1]])
    weighted_cliches.append([low_frequency, ngram, n_authors])

# Rarest word first; ties fall back to the ngram text.
weighted_cliches.sort()

selected_cliches = []

for low_frequency, ngram, n_authors in weighted_cliches:

    if 'velvet' in ngram:
        if any(word in ngram for word in VELVET_COMPANIONS):
            selected_cliches.append(ngram)
    elif (n_authors > AUTHOR_THRESHOLD
          and len(selected_cliches) < MAX_CLICHES
          and ngram not in EXCLUDED_NGRAMS):
        selected_cliches.append(ngram)

print('len(selected_cliches) %d' % len(selected_cliches))

len(selected_cliches) 200


In [15]:
# Show the selected ngrams in alphabetical order; the list itself is not reordered.
print sorted(selected_cliches)

[u'administration building', u'aged man', u'alpha centauri', u'anything unusual', u'artificial gravity', u'atom bomb', u'atomic bomb', u'atomic energy', u'atomic power', u'average man', u'awful lot', u'bald head', u'beady eye', u'bearded man', u'black velvet', u'blank wall', u'blond hair', u'brave man', u'breast pocket', u'broad daylight', u'capital city', u'carbon dioxide', u'certain amount', u'civil war', u'clenched fist', u'cloudless sky', u'coat pocket', u'cold sweat', u'comfortable chair', u'conference room', u'conscious mind', u'considerable distance', u'considerable time', u'control panel', u'crew member', u'damned thing', u'desk drawer', u'desperate effort', u'dining room', u'dusky velvet', u'early dawn', u'eastern horizon', u'elderly man', u'electric current', u'evening meal', u'excited voice', u'fairy tale', u'fantastic universe', u'fat man', u'few inch', u'few month', u'few pace', u'few yard', u'fifth avenue', u'first glimpse', u'first impression', u'first opportunity', u'fi

### Find sentences which contain at least one cliche.

In [16]:
import codecs

# Write one "cliche|sentence" line for every recorded occurrence of each
# selected ngram.  `with` closes the file even if a lookup fails part-way.
with codecs.open('1_selected_sentences.txt', 'w', encoding='utf-8') as f:

    for c in selected_cliches:

        # Each cross-reference entry is [author, index into author_text[author]].
        for author, text_n in ngram_author_sentence_xref[c]:
            sentence = author_text[author][text_n]

            # Leave out sentences containing lower-case "copyright"
            # (presumably licence boilerplate; the match is case-sensitive).
            if 'copyright' not in sentence:
                f.write(c + '|' + sentence + '\n')

print('Done!')

Done!


In [17]:
# Shell step: sort the output and drop repeated lines (an ngram that occurs more
# than once in a sentence is written once per occurrence), saving to a new file.
!sort 1_selected_sentences.txt | uniq > 1_selected_sentences.UNIQ.txt