In [1]:
import pandas as pd
from textblob import TextBlob
import csv
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import os
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

Create the novel variables

In [10]:
n = 76 #novel number
t = 'Huck_Finn '#novel name
s = 100 #how many rows to skip

* Get the novels from Gutenberg.
* Strip them of unwanted information.
* Tokenize the text into sentences.

In [11]:
novel = strip_headers(load_etext(n))
novel = novel.replace('\n', ' ')
novel= TextBlob(novel)
novel_sentences = novel.sentences
novel_title = t

Write the sentences to a csv file. 
-There is a bug I haven't figure out yet that requires me to write to csv twice to avoid errors.

In [12]:
for l in range(2):
    novelWriter = csv.writer(open('data/novel_'+novel_title+'.csv', 'w'), delimiter=',')
    for sentence in novel_sentences:
        novelWriter.writerow([sentence])

Read in the csv file to pandas

In [13]:
df_novel = pd.read_csv('data/novel_'+novel_title+'.csv', skiprows = s, header=None)

In [14]:
df_novel.head()

Unnamed: 0,0
0,The Wreck We turned in and Slept Turning ove...
1,"""Buck"" ""It made Her look Spidery"" ""They got ..."
2,"""I am the Late Dauphin"" Tail Piece On the Ra..."
3,"""Was you in my Room?"""
4,Jawing In Trouble Indignation How to Find T...


Create the wrd_length and total_char columns.

In [15]:
wrd_length = []
total_char = []
def wrd_char_counts(sentence):
    total_chars = 0
    wrd_counts = []
    for word in sentence:
        char_count = len(word)
        wrd_counts.append(char_count)
        total_chars += char_count
    total_char.append(total_chars)
    wrd_length.append(wrd_counts)

In [16]:
for l in df_novel[0]:
    sent = TextBlob(l)
    wrd_char_counts(sent.words)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0: ordinal not in range(128)

In [None]:
df_novel['wrd_length'] = wrd_length
df_novel['total_char'] = total_char

In [None]:
df_novel.head()

Create syllable count column

In [None]:
def CountSyllables(word, isName=True):
    vowels = "aeiouy"
    #single syllables in words like bread and lead, but split in names like Breanne and Adreann
    specials = ["ia","ea"] if isName else ["ia"]
    specials_except_end = ["ie","ya","es","ed"]  #seperate syllables unless ending the word
    currentWord = word.lower()
    numVowels = 0
    lastWasVowel = False
    last_letter = ""

    for letter in currentWord:
        if letter in vowels:
            #don't count diphthongs unless special cases
            combo = last_letter+letter
            if lastWasVowel and combo not in specials and combo not in specials_except_end:
                lastWasVowel = True
            else:
                numVowels += 1
                lastWasVowel = True
        else:
            lastWasVowel = False

        last_letter = letter

    #remove es & ed which are usually silent
    if len(currentWord) > 2 and currentWord[-2:] in specials_except_end:
        numVowels -= 1

    #remove silent single e, but not ee since it counted it before and we should be correct
    elif len(currentWord) > 2 and currentWord[-1:] == "e" and currentWord[-2:] != "ee" and currentWord != 'the':
        numVowels -= 1

    return numVowels

In [None]:
syl = []
for l in df_novel[0]:
    sent = TextBlob(l)
    syl_single = []
    for x in sent.words:
        m = CountSyllables(x)
        syl_single.append(m)
    syl.append(syl_single)  

In [None]:
syl_count_arr = []
for n in syl:
    n = np.array(n)
    syl_count_arr.append(n)

In [None]:
df_novel['syl_count'] = syl_count_arr

In [None]:
df_novel.head()

In [None]:
#If novel has a lot of numbers for chapter headings.
#d = df_novel[df_novel['total_char']<=2]

Create syllable sum column

In [None]:
syl_sum = []
for l in range(0,len(df_novel)):
    syl_sum.append(df_novel['syl_count'][l].sum())

In [None]:
df_novel['syl_sum'] = syl_sum

Create sentiment column

In [None]:
def detect_sentiment(text):
    return TextBlob(text.decode('utf-8')).sentiment.polarity

In [None]:
df_novel['sentiment'] = df_novel[0].apply(detect_sentiment)

Write all columns to csv file

In [None]:
df_novel.to_csv('data/novel_'+novel_title+'.csv', index=False)

In [None]:
#test
df_test = pd.read_csv('data/novel_'+novel_title+'.csv')

In [None]:
df_test.head()

Create cluster dataframe - remove columns that can't be used

In [None]:
df_cluster = df_novel.drop('wrd_length', 1)

In [None]:
df_cluster = df_cluster.drop('syl_count', 1)

In [None]:
df_cluster.head()

Create 20 clusters on scaled data

In [None]:
X = df_cluster.drop(0, axis=1)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
km = KMeans(n_clusters=20, random_state=1)
km.fit(X_scaled)

In [None]:
df_cluster['cluster'] = km.labels_

In [None]:
df_cluster.groupby('cluster').mean()

In [None]:
centers = df_cluster.groupby('cluster').mean()

Create Scatter plot 

In [None]:
colors = np.array(['#0000ff', '#ff00ff', '#39b54a', '#ff0000', '#ffff00', '#000080', '#ff99ff', '#88d392', '#bf0000', '#b4ff33', '#0000bf', '#800080','#1d5b25', '#4d226d', '#2b6855', '#128ab2', '#6666ff', '#a381bd', '#333333','#a0d0e0'])

In [None]:
# scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue)
plt.scatter(df_cluster.syl_sum, df_cluster.sentiment, c=colors[df_cluster.cluster], s=50)

# cluster centers, marked by "+"
plt.scatter(centers.syl_sum, centers.sentiment, linewidths=3, marker='+', s=300, c='black')

# add labels
plt.xlabel('syl_sum')
plt.ylabel('sentiment')

Create 3 clusters

In [None]:
df_cluster_3 = df_cluster

In [None]:
df_cluster_3.head()

In [None]:
X = df_cluster_3.drop(0, axis=1)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
km = KMeans(n_clusters=3, random_state=1)
km.fit(X_scaled)

In [None]:
df_cluster_3['cluster'] = km.labels_

In [None]:
df_cluster_3.groupby('cluster').mean()

Create 3 Clusters with no syl_sum

In [None]:
df_cluster_no_syl = df_cluster

In [None]:
df_cluster_no_syl = df_cluster.drop('syl_sum', 1)

In [None]:
X = df_cluster_no_syl.drop(0, axis=1)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
km = KMeans(n_clusters=3, random_state=1)
km.fit(X_scaled)

In [None]:
df_cluster_no_syl['cluster'] = km.labels_

In [None]:
df_cluster_no_syl.groupby('cluster').mean()

Create 5 Clusters

In [None]:
df_cluster_5 = df_cluster

In [None]:
df_cluster_5.head()

In [None]:
X = df_cluster_5.drop(0, axis=1)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
km = KMeans(n_clusters=5, random_state=1)
km.fit(X_scaled)

In [None]:
df_cluster_5['cluster'] = km.labels_

In [None]:
df_cluster_5.groupby('cluster').mean()

Find ideal amount of clusters for novel

In [None]:
from sklearn import metrics
metrics.silhouette_score(X_scaled, km.labels_)

In [None]:
k_range = range(2,150)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))

In [None]:
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silihouette Coefficient')
plt.grid(True)

Find Sentiment Pattern - need to work on this code to make it more universal.

20 pieces

In [None]:
df_novel['total_char'].sum()

In [None]:
ratio = (df_novel['total_char'].sum()/20) - 50

In [None]:
t = 0
x = 0
ratio = (df_novel['total_char'].sum()/20) - 100
def find_this(t, x):
    w = 0    
    sent_stop = []
    while w <= ratio:
        w = df_novel['total_char'][t:x].sum()
        sent_stop.append(x)
        x += 1
    return max(sent_stop)     

In [None]:
t = 0
x = 0
start_point = []
stop_point = []
for n in range(1, 21):
    s = find_this(t, x)
    print "df_novel['total_char'][%s:%s]" %(t, s)
    start_point.append(t)
    stop_point.append(s)
    t = s
    x = s + 1      

In [None]:
twenty_piece_char = []
st = 0
for l in start_point: 
    strt = start_point[st]
    stp = stop_point[st]
    print strt
    print stp
    mn = df_novel['sentiment'][strt:stp].mean()
    twenty_piece_char.append(mn)
    print mn
    st +=1   

In [None]:
n = np.array(n)
twenty_piece_char = np.array(twenty_piece_char)
print twenty_piece_char

In [None]:
plt.bar(range(20), twenty_piece_char)
plt.ylabel('sentiment')
plt.show()

Create 3 piece sentiment pattern

In [None]:
t = 0
x = 0
ratio = (df_novel['total_char'].sum()/3) -00
def find_this(t, x):
    w = 0    
    sent_stop = []
    while w < ratio:
        w = df_novel['total_char'][t:x].sum()
        sent_stop.append(x)
        x += 1
    return max(sent_stop)  

In [None]:
t = 0
x = 0
start_point = []
stop_point = []
for n in range(1, 21):
    s = find_this(t, x)
    print "df_novel['total_char'][%s:%s]" %(t, s)
    start_point.append(t)
    stop_point.append(s)
    t = s
    x = s + 1  

In [None]:
three_piece_char = []
st = 0
for l in start_point: 
    strt = start_point[st]
    stp = stop_point[st]
    print strt
    print stp
    mn = df_novel['sentiment'][strt:stp].mean()
    three_piece_char.append(mn)
    print mn
    st +=1

In [None]:
n = np.array(n)
three_piece_char = np.array(three_piece_char)
print three_piece_char

In [None]:
plt.bar(range(3), three_piece_char)
plt.ylabel('sentiment')
plt.show()

Create sentiment pattern based on ideal cluster numbers