In [3625]:
import pandas as pd
import nltk
import numpy as np
import string
from sklearn.metrics  import confusion_matrix,f1_score

### reading files

In [3626]:
with open('../data/edgar_allan_poe.txt','r') as f:
    lines_edgar = f.read().splitlines()
    lines_edgar 

with open('../data/robert_frost.txt','r') as f:
    lines_frost = f.read().splitlines()

In [3627]:
lines_edgar[:5]

["LO! Death hath rear'd himself a throne",
 'In a strange city, all alone,',
 'Far down within the dim west',
 'Where the good, and the bad, and the worst, and the best,',
 'Have gone to their eternal rest.']

In [3628]:
lines_frost[:5]

['Two roads diverged in a yellow wood,',
 'And sorry I could not travel both',
 'And be one traveler, long I stood',
 'And looked down one as far as I could',
 'To where it bent in the undergrowth; ']

In [3629]:
len(lines_edgar)

797

In [3630]:
len(lines_frost)

1581

In [3631]:
df = pd.concat([pd.DataFrame({'lines':lines_edgar,'label':'poe'}),pd.DataFrame({'lines':lines_frost,'label':'frost'})])

In [3632]:
df.lines = df.lines.apply(lambda x:x.lower())
df.lines = df.lines.apply(lambda x:x.strip())


In [3633]:
df = df[df.lines!='']
df

Unnamed: 0,lines,label
0,lo! death hath rear'd himself a throne,poe
1,"in a strange city, all alone,",poe
2,far down within the dim west,poe
3,"where the good, and the bad, and the worst, an...",poe
4,have gone to their eternal rest.,poe
...,...,...
1575,to say which buds are leaf and which are bloom.,frost
1577,a feather-hammer gives a double knock.,frost
1578,this eden day is done at two o'clock.,frost
1579,an hour of winter day might seem too short,frost


In [3634]:
df.reset_index(inplace=True,drop=True)
df

Unnamed: 0,lines,label
0,lo! death hath rear'd himself a throne,poe
1,"in a strange city, all alone,",poe
2,far down within the dim west,poe
3,"where the good, and the bad, and the worst, an...",poe
4,have gone to their eternal rest.,poe
...,...,...
2149,to say which buds are leaf and which are bloom.,frost
2150,a feather-hammer gives a double knock.,frost
2151,this eden day is done at two o'clock.,frost
2152,an hour of winter day might seem too short,frost


### train test split

In [3635]:
df_train = df.sample(frac=0.75)
df_test = df.drop(df_train.index)
df_test = df_test.sample(frac=1)

In [3636]:
df_train

Unnamed: 0,lines,label
198,for the resurrection of deep-buried faith,poe
1853,"still more unfettered,",frost
184,be nothing which thou art not.,poe
1800,and in it a piano loudly playing.,frost
1328,"then he came at me with one hand outstretched,",frost
...,...,...
264,from their throats.,poe
1034,"up where the trees grow short, the mosses tall,",frost
1863,the cosmic motes,frost
1895,but being mounted bareback on the earth?,frost


In [3637]:
df_test

Unnamed: 0,lines,label
1473,"she made a sudden movement toward her bodice,",frost
1945,to live together and then pull apart.,frost
951,of arthur amy's having once been up,frost
532,"(though once we had journeyed down here),",poe
201,"lying down to die, have suddenly arisen",poe
...,...,...
878,but though repeatedly he strove,frost
514,in the ghoul-haunted woodland of weir.,poe
534,nor the ghoul-haunted woodland of weir.,poe
329,could hope to utter. and i! my spells are broken.,poe


### creating the markov model for each class

In [3638]:
df_train.reset_index(inplace=True, drop=True)

#### STEP 1 - tokenize each sentence of the poem

In [3639]:
df_train['lines_tokenized'] = df_train.lines.apply(nltk.word_tokenize)

In [3640]:
df_train.lines_tokenized = df_train.lines_tokenized.apply(lambda x:[i for i in x if i not in string.punctuation])

In [3641]:
df_train.head()

Unnamed: 0,lines,label,lines_tokenized
0,for the resurrection of deep-buried faith,poe,"[for, the, resurrection, of, deep-buried, faith]"
1,"still more unfettered,",frost,"[still, more, unfettered]"
2,be nothing which thou art not.,poe,"[be, nothing, which, thou, art, not]"
3,and in it a piano loudly playing.,frost,"[and, in, it, a, piano, loudly, playing]"
4,"then he came at me with one hand outstretched,",frost,"[then, he, came, at, me, with, one, hand, outs..."


#### STEP 2 - create token to integer mapping for each unique token

In [3642]:
token_to_num_mapping = {}
count=0
for i,row in df_train.iterrows():
    for j in row['lines_tokenized']:
        if j not in token_to_num_mapping:
            token_to_num_mapping[j] = count
            count+=1
        

In [3643]:
len(token_to_num_mapping)

2608

In [3644]:
def token_num(x):
    x_num = []
    for i in x:
        x_num.append(token_to_num_mapping.get(i,len(token_to_num_mapping)+1))
    return x_num

In [3645]:
df_train['lines_tokenized_num'] = df_train.lines_tokenized.apply(token_num)

In [3646]:
df_train.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num
0,for the resurrection of deep-buried faith,poe,"[for, the, resurrection, of, deep-buried, faith]","[0, 1, 2, 3, 4, 5]"
1,"still more unfettered,",frost,"[still, more, unfettered]","[6, 7, 8]"
2,be nothing which thou art not.,poe,"[be, nothing, which, thou, art, not]","[9, 10, 11, 12, 13, 14]"
3,and in it a piano loudly playing.,frost,"[and, in, it, a, piano, loudly, playing]","[15, 16, 17, 18, 19, 20, 21]"
4,"then he came at me with one hand outstretched,",frost,"[then, he, came, at, me, with, one, hand, outs...","[22, 23, 24, 25, 26, 27, 28, 29, 30]"


#### separating each model data because each model will have it's own set of data

In [3647]:
df_train_poe = df_train[df_train.label=='poe']
df_train_frost = df_train[df_train.label=='frost']

In [3648]:
df_train_poe.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num
0,for the resurrection of deep-buried faith,poe,"[for, the, resurrection, of, deep-buried, faith]","[0, 1, 2, 3, 4, 5]"
2,be nothing which thou art not.,poe,"[be, nothing, which, thou, art, not]","[9, 10, 11, 12, 13, 14]"
5,"and when, amid no earthly moans,",poe,"[and, when, amid, no, earthly, moans]","[15, 31, 32, 33, 34, 35]"
6,"which were seven,)",poe,"[which, were, seven]","[11, 36, 37]"
8,"the truest- the most fervently devoted,",poe,"[the, truest-, the, most, fervently, devoted]","[1, 43, 1, 44, 45, 46]"


In [3649]:
df_train_frost.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num
1,"still more unfettered,",frost,"[still, more, unfettered]","[6, 7, 8]"
3,and in it a piano loudly playing.,frost,"[and, in, it, a, piano, loudly, playing]","[15, 16, 17, 18, 19, 20, 21]"
4,"then he came at me with one hand outstretched,",frost,"[then, he, came, at, me, with, one, hand, outs...","[22, 23, 24, 25, 26, 27, 28, 29, 30]"
7,"because it was grassy and wanted wear,",frost,"[because, it, was, grassy, and, wanted, wear]","[38, 17, 39, 40, 15, 41, 42]"
9,from sleeping warm,frost,"[from, sleeping, warm]","[47, 48, 49]"


#### 

### fitting the markov model

#### pi & A -> matrix of transition probalities

In [3650]:
token_to_num_mapping

{'for': 0,
 'the': 1,
 'resurrection': 2,
 'of': 3,
 'deep-buried': 4,
 'faith': 5,
 'still': 6,
 'more': 7,
 'unfettered': 8,
 'be': 9,
 'nothing': 10,
 'which': 11,
 'thou': 12,
 'art': 13,
 'not': 14,
 'and': 15,
 'in': 16,
 'it': 17,
 'a': 18,
 'piano': 19,
 'loudly': 20,
 'playing': 21,
 'then': 22,
 'he': 23,
 'came': 24,
 'at': 25,
 'me': 26,
 'with': 27,
 'one': 28,
 'hand': 29,
 'outstretched': 30,
 'when': 31,
 'amid': 32,
 'no': 33,
 'earthly': 34,
 'moans': 35,
 'were': 36,
 'seven': 37,
 'because': 38,
 'was': 39,
 'grassy': 40,
 'wanted': 41,
 'wear': 42,
 'truest-': 43,
 'most': 44,
 'fervently': 45,
 'devoted': 46,
 'from': 47,
 'sleeping': 48,
 'warm': 49,
 'walls': 50,
 'all': 51,
 'buried': 52,
 'trees': 53,
 'few': 54,
 'out': 55,
 'miraculous': 56,
 'crescent': 57,
 'henceforth': 58,
 'i': 59,
 'hold': 60,
 'thy': 61,
 'flower-enameled': 62,
 'shore': 63,
 'speculation': 64,
 'sovereignty': 65,
 'than': 66,
 'ancient': 67,
 'lore': 68,
 'young': 69,
 'folk': 70,
 '

In [3651]:
# initializing the matrix/vector of transition probablities
poe_matrix = np.zeros((len(token_to_num_mapping),len(token_to_num_mapping)))
frost_matrix = np.zeros((len(token_to_num_mapping),len(token_to_num_mapping)))

poe_vector = np.zeros(len(token_to_num_mapping))
frost_vector = np.zeros(len(token_to_num_mapping))

In [3652]:
for i, row in df_train_poe.iterrows():
    for j in range(len(row['lines_tokenized_num'])-1):
        poe_matrix[row['lines_tokenized_num'][j],row['lines_tokenized_num'][j+1]] +=1
        if j==0:
            poe_vector[row['lines_tokenized_num'][j]]+=1

for i, row in df_train_frost.iterrows():
    for j in range(len(row['lines_tokenized_num'])-1):
        frost_matrix[row['lines_tokenized_num'][j],row['lines_tokenized_num'][j+1]] +=1
        if j==0:
            frost_vector[row['lines_tokenized_num'][j]]+=1

In [3653]:
count_i_poe = poe_matrix.sum(axis=1,keepdims=True)
count_i_frost = frost_matrix.sum(axis=1,keepdims=True)
poe_matrix= np.log((poe_matrix+1)/(count_i_poe+len(token_to_num_mapping)))
frost_matrix= np.log((frost_matrix+1)/(count_i_frost+len(token_to_num_mapping)))

In [3654]:
poe_vector = np.log((poe_vector+1)/(len(df_train_poe)+len(token_to_num_mapping)))
frost_vector = np.log((frost_vector+1)/(len(df_train_frost)+len(token_to_num_mapping)))

In [3655]:
frost_matrix

array([[-7.89394514, -5.49604987, -7.89394514, ..., -7.89394514,
        -7.89394514, -7.89394514],
       [-8.00269416, -8.00269416, -8.00269416, ..., -7.30954698,
        -8.00269416, -8.00269416],
       [-7.86633892, -7.86633892, -7.86633892, ..., -7.86633892,
        -7.86633892, -7.86633892],
       ...,
       [-7.86672229, -7.86672229, -7.86672229, ..., -7.86672229,
        -7.1735751 , -7.86672229],
       [-7.86633892, -7.86633892, -7.86633892, ..., -7.86633892,
        -7.86633892, -7.86633892],
       [-7.86672229, -7.86672229, -7.86672229, ..., -7.86672229,
        -7.86672229, -7.86672229]])

In [3656]:
poe_matrix

array([[-7.87169266, -6.4853983 , -7.87169266, ..., -7.87169266,
        -7.87169266, -7.87169266],
       [-7.95822719, -7.95822719, -6.8596149 , ..., -7.95822719,
        -7.95822719, -7.95822719],
       [-7.8671055 , -7.8671055 , -7.8671055 , ..., -7.8671055 ,
        -7.8671055 , -7.8671055 ],
       ...,
       [-7.86633892, -7.86633892, -7.86633892, ..., -7.86633892,
        -7.86633892, -7.86633892],
       [-7.86633892, -7.86633892, -7.86633892, ..., -7.86633892,
        -7.86633892, -7.86633892],
       [-7.86633892, -7.86633892, -7.86633892, ..., -7.86633892,
        -7.86633892, -7.86633892]])

In [3657]:
frost_vector

array([-6.41863679, -4.13285881, -8.21039626, ..., -8.21039626,
       -8.21039626, -8.21039626])

In [3658]:
poe_vector

array([-6.44667175, -4.59037376, -8.05610966, ..., -8.05610966,
       -8.05610966, -8.05610966])

#### priori -> the initial state

important if imbalanced class

In [3659]:
# each class count at start - priori

prior_poe, prior_frost = len(df_train_poe)/len(df_train),len(df_train_frost)/len(df_train)
log_prior_poe, log_prior_frost = np.log(prior_poe),np.log(prior_frost)

In [3660]:
prior_poe, prior_frost

(0.3372524752475248, 0.6627475247524752)

In [3661]:
log_prior_poe, log_prior_frost

(-1.0869234444177964, -0.411361168633292)

In [3662]:
A_poe = poe_matrix
Pi_poe = poe_vector

In [3663]:
A_frost = frost_matrix
Pi_frost = frost_vector

In [3664]:
def predict(input_):
    prob_poe = log_prior_poe
    prob_frost = log_prior_frost
    
    for i in range(len(input_)-1):
        try:
            if i==0:
                prob_poe+=Pi_poe[input_[i]]
                prob_frost+=Pi_frost[input_[i]]
            else:
                prob_poe+=A_poe[input_[i],input_[i+1]]
                prob_frost+=A_frost[input_[i],input_[i+1]]
        except:
            pass
    
    if prob_frost>prob_poe:
        return 'frost'
    else:
        return 'poe'

### train set prediction

In [3665]:
df_train.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num
0,for the resurrection of deep-buried faith,poe,"[for, the, resurrection, of, deep-buried, faith]","[0, 1, 2, 3, 4, 5]"
1,"still more unfettered,",frost,"[still, more, unfettered]","[6, 7, 8]"
2,be nothing which thou art not.,poe,"[be, nothing, which, thou, art, not]","[9, 10, 11, 12, 13, 14]"
3,and in it a piano loudly playing.,frost,"[and, in, it, a, piano, loudly, playing]","[15, 16, 17, 18, 19, 20, 21]"
4,"then he came at me with one hand outstretched,",frost,"[then, he, came, at, me, with, one, hand, outs...","[22, 23, 24, 25, 26, 27, 28, 29, 30]"


In [3666]:
df_train['pred_label'] = df_train.lines_tokenized_num.apply(predict)

### test set prediction

In [3667]:
df_test['lines_tokenized'] = df_test.lines.apply(nltk.word_tokenize)
df_test.lines_tokenized = df_test.lines_tokenized.apply(lambda x:[i for i in x if i not in string.punctuation])
df_test['lines_tokenized_num'] = df_test.lines_tokenized.apply(token_num)

In [3668]:
df_test['pred_label'] = df_test.lines_tokenized_num.apply(predict)

#### Accuracy measure - using f1 score

Using f1 score - as the classes are imbalanced

In [3669]:
def encode_category(cat):
    if cat=='frost':
        return 1
    if cat=='poe':
        return 0

#### accuracy train set

In [3670]:
df_train.label = df_train.label.apply(encode_category)
df_train.pred_label = df_train.pred_label.apply(encode_category)

In [3671]:
df_train.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num,pred_label
0,for the resurrection of deep-buried faith,0,"[for, the, resurrection, of, deep-buried, faith]","[0, 1, 2, 3, 4, 5]",0
1,"still more unfettered,",1,"[still, more, unfettered]","[6, 7, 8]",1
2,be nothing which thou art not.,0,"[be, nothing, which, thou, art, not]","[9, 10, 11, 12, 13, 14]",0
3,and in it a piano loudly playing.,1,"[and, in, it, a, piano, loudly, playing]","[15, 16, 17, 18, 19, 20, 21]",1
4,"then he came at me with one hand outstretched,",1,"[then, he, came, at, me, with, one, hand, outs...","[22, 23, 24, 25, 26, 27, 28, 29, 30]",1


In [3672]:
f1_score_train = f1_score(df_train.label,df_train.pred_label)
f1_score_train

0.9907493061979649

In [3673]:
confusion_matrix(df_train.label,df_train.pred_label)

array([[ 525,   20],
       [   0, 1071]])

#### accuracy test set

In [3674]:
df_test.label = df_test.label.apply(encode_category)
df_test.pred_label = df_test.pred_label.apply(encode_category)

In [3675]:
df_test.head()

Unnamed: 0,lines,label,lines_tokenized,lines_tokenized_num,pred_label
1473,"she made a sudden movement toward her bodice,",1,"[she, made, a, sudden, movement, toward, her, ...","[93, 374, 18, 2609, 2609, 905, 107, 2609]",1
1945,to live together and then pull apart.,1,"[to, live, together, and, then, pull, apart]","[74, 228, 354, 15, 22, 2609, 2609]",1
951,of arthur amy's having once been up,1,"[of, arthur, amy, 's, having, once, been, up]","[3, 185, 186, 150, 1699, 144, 565, 211]",1
532,"(though once we had journeyed down here),",0,"[though, once, we, had, journeyed, down, here]","[232, 144, 78, 351, 830, 113, 229]",0
201,"lying down to die, have suddenly arisen",0,"[lying, down, to, die, have, suddenly, arisen]","[1348, 113, 74, 1349, 263, 1350, 1351]",0


In [3676]:
f1_score_test = f1_score(df_test.label,df_test.pred_label)
f1_score_test

0.8880503144654088

In [3677]:
confusion_matrix(df_test.label,df_test.pred_label)

array([[ 96,  77],
       [ 12, 353]])

#### this turns out to be a bad classifier for the class which has lower number in  imbalanced class 

#### Basic steps done:

* files of each poets read 
* each line is considered 1 data row with label
* train and test set split
* now we will create as many markov model as number of classes
* first tokenize each row of training set
* convert tokens into integers -> unique word mapped to an index
* now as required by any markov model ,we need 
    * matrix of transition probablities - Matrix `A` and Vector `Pi`
    * Current State - That is basically here initial state if is imbalanced classes , so priori
* instead of `probality+multiplication` , we used `log probablity+summation`