# Analysis of Jane Austen Character Co-occurrences
This notebook experiments with the concept of fingerprint matrices, a technique for generating an image representing character co-occurrences in novels.

In [None]:
import numpy as np
from urllib import request
from PIL import Image, ImageOps
from nltk.tokenize import sent_tokenize

This function creates a fingerprint matrix for a single character.

In [None]:
def fingerprintmatrix(text, character):
    """Render and save the fingerprint matrix of a single character.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence becomes one pixel, laid out row by row in reading order.
    Sentences containing ``character`` (case-sensitive substring match) are
    light blue; every other pixel, including the padding at the end of the
    grid, is white. The grid is resized to 800x400 and passed to
    ``add_border`` with the file name ``<character>.png``.

    Parameters
    ----------
    text : str
        Full text to analyse.
    character : str
        Name to look for inside each sentence; also used as the file stem.

    Returns
    -------
    None
        The image is written to disk as a side effect.
    """
    background = (255, 255, 255)
    highlight = (186, 225, 255)
    block = 100  # the pixel count is padded up to a multiple of this

    sentences = sent_tokenize(text)
    n_sentences = len(sentences)

    # Ceiling division: pad only as far as the next multiple of `block`.
    # (`block - n % block` would add a whole blank block when the count is
    # already an exact multiple.) Keep at least one block so that an empty
    # text still yields a valid, all-white image.
    n_blocks = max(1, -(-n_sentences // block))
    pixel_array = np.full((n_blocks * block, 3), background, dtype=np.uint8)

    # One boolean per sentence; the slice is a view, so the assignment
    # writes straight into pixel_array and leaves the padding untouched.
    mentions = np.fromiter(
        (character in sentence for sentence in sentences),
        dtype=bool,
        count=n_sentences,
    )
    pixel_array[:n_sentences][mentions] = highlight

    # Always lay the grid out in landscape: the shorter side is the height.
    h, w = sorted((block, n_blocks))

    data = pixel_array.reshape([h, w, 3])
    img = Image.fromarray(data)
    img = img.resize((800, 400))
    filename = character + ".png"
    add_border(img, filename)

This function accepts a string containing the novel's text and the names of two characters to search for in it. It then produces an image (fingerprint matrix) in which each pixel indicates whether the characters appear in the corresponding sentence. Sentences that mention both characters are drawn in red; when `occurrences_only` is `False`, sentences that mention only one of the two are also drawn, in that character's own colour.

In [None]:
def co_occurrence(text, char1, char2, occurrences_only=True):
    """Render and save the co-occurrence fingerprint matrix of two characters.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence becomes one pixel, laid out row by row in reading order.
    Names are matched as case-sensitive substrings of the sentence.

    Pixel colours:

    * red        - the sentence contains both ``char1`` and ``char2``
    * light blue - only ``char1`` (drawn only if ``occurrences_only`` is false)
    * light pink - only ``char2`` (drawn only if ``occurrences_only`` is false)
    * white      - everything else, including the padding at the end

    The grid is resized to 800x400 and passed to ``add_border`` with the
    file name ``<char1>_<char2>.png``.

    Parameters
    ----------
    text : str
        Full text to analyse.
    char1, char2 : str
        Names to look for inside each sentence.
    occurrences_only : bool, optional
        When true (the default) only sentences containing both names are
        coloured; when false, single mentions are coloured as well.

    Returns
    -------
    None
        The image is written to disk as a side effect.
    """
    background = (255, 255, 255)
    first_colour = (186, 225, 255)
    second_colour = (255, 179, 186)
    both_colour = (255, 0, 0)
    block = 100  # the pixel count is padded up to a multiple of this

    sentences = sent_tokenize(text)
    n_sentences = len(sentences)

    # Ceiling division: pad only as far as the next multiple of `block`.
    # (`block - n % block` would add a whole blank block when the count is
    # already an exact multiple.) Keep at least one block so that an empty
    # text still yields a valid, all-white image.
    n_blocks = max(1, -(-n_sentences // block))
    pixel_array = np.full((n_blocks * block, 3), background, dtype=np.uint8)

    # Search each sentence once per name instead of once per branch.
    has_first = np.fromiter(
        (char1 in sentence for sentence in sentences),
        dtype=bool,
        count=n_sentences,
    )
    has_second = np.fromiter(
        (char2 in sentence for sentence in sentences),
        dtype=bool,
        count=n_sentences,
    )

    # View onto the non-padding pixels; writes go through to pixel_array.
    used = pixel_array[:n_sentences]
    if not occurrences_only:
        used[has_first] = first_colour
        used[has_second] = second_colour
    # Applied last so that a joint mention always wins over a single one.
    used[has_first & has_second] = both_colour

    # Always lay the grid out in landscape: the shorter side is the height.
    h, w = sorted((block, n_blocks))

    data = pixel_array.reshape([h, w, 3])
    img = Image.fromarray(data)
    filename = char1 + "_" + char2 + ".png"
    img = img.resize((800, 400))
    add_border(img, filename)


This function adds a border around an image.

In [None]:
def add_border(img,filename):
    """Surround ``img`` with a 1-pixel border and save the result to ``filename``."""
    ImageOps.expand(img, border=1).save(filename)

## Character Co-occurrence in Jane Austen's Emma

In [None]:
# Read the whole novel into memory (all Gutenberg texts are UTF-8 encoded).
# The context manager closes the file even if reading raises.
with open('austen/emma.txt', 'r', encoding="utf8") as file:
    text = file.read()

In [None]:
# Fingerprint of the title character; the image is saved as "Emma.png".
fingerprintmatrix(text=text, character='Emma')

In [None]:
# Knightley on his own, then Emma and Knightley together
# (single mentions are drawn as well as joint ones).
fingerprintmatrix(text=text, character='Knightley')
co_occurrence(text, 'Emma', 'Knightley', occurrences_only=False)

In [None]:
# Jane on her own, then Emma and Jane together
# (single mentions are drawn as well as joint ones).
fingerprintmatrix(text=text, character='Jane')
co_occurrence(text, 'Emma', 'Jane', occurrences_only=False)

Put characters into a list so you can generate matrices for all of them.

In [None]:
# Each entry is matched as a literal substring of a sentence and is also
# used as the stem of the saved image's file name.
characters = [
    'Emma',
    'Mr. Knightley',
    'Mr. Woodhouse',
    'Harriet Smith',
    'Frank Churchill',
]
for c in characters:
    fingerprintmatrix(text=text, character=c)

In [None]:
# Harriet on her own, then Emma and Harriet together
# (single mentions are drawn as well as joint ones).
fingerprintmatrix(text=text, character='Harriet')
co_occurrence(text, 'Emma', 'Harriet', occurrences_only=False)

## Character Co-occurrence in Jane Austen's Pride and Prejudice

In [None]:
# Read the whole novel into memory (all Gutenberg texts are UTF-8 encoded).
# The context manager closes the file even if reading raises.
with open('austen/prideandprejudice.txt', 'r', encoding="utf8") as file:
    text = file.read()

In [None]:
# One fingerprint per protagonist, then only the sentences that mention
# both of them (occurrences_only keeps its default of True).
fingerprintmatrix(text=text, character='Elizabeth')
fingerprintmatrix(text=text, character='Darcy')
co_occurrence(text, char1='Elizabeth', char2='Darcy')