In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
df = pd.read_csv('poetry_society_data.csv')
df.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [3]:
df2 = df[~df['content'].str.contains('Copyright', case=False)]
df2 = df2[~df2['content'].str.contains('published by', case=False)]
df2 = df2[~df2['content'].str.contains('published in', case=False)]
df2 = df2[~df2['content'].str.contains('selected works', case=False)]
df2 = df2[~df2['content'].str.contains('reprint', case=False)]
df2 = df2[~df2['content'].str.contains('1904')]
df2 = df2[~df2['content'].str.contains('1919')]
df2 = df2[~df2['content'].str.contains('10-19')]
df2[df2['content'].str.contains('_o', case=False)]

Unnamed: 0,author,content,poem name,age,type
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore


In [4]:
df2.to_csv('poetry_society_data_cleaned01.csv')

In [5]:
myvectorizer = CountVectorizer()
X1 = myvectorizer.fit_transform(df2['content'])

In [6]:
myvectorizer.get_feature_names()

['1538',
 '1917',
 '1921',
 '1922',
 '1928',
 '_o',
 'abandon',
 'abandonment',
 'abashed',
 'abasht',
 'abate',
 'abating',
 'abed',
 'abhor',
 'abhorring',
 'abided',
 'abides',
 'abject',
 'abjure',
 'abler',
 'aboard',
 'aboue',
 'abound',
 'about',
 'above',
 'abreast',
 'abristle',
 'abroad',
 'abruptlytook',
 'absence',
 'absent',
 'absorb',
 'absorbed',
 'abstain',
 'abstract',
 'absurdly',
 'abundance',
 'abundant',
 'abus',
 'abusd',
 'abuse',
 'abused',
 'abydos',
 'abye',
 'abysm',
 'ac',
 'accent',
 'accents',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'accessory',
 'accidents',
 'accomplished',
 'accord',
 'according',
 'accordingly',
 'account',
 'accounted',
 'accounts',
 'accurst',
 'accus',
 'accuse',
 'accustomed',
 'ache',
 'acherontes',
 'achilles',
 'acid',
 'acidalian',
 'acquainted',
 'acquit',
 'acre',
 'acres',
 'across',
 'act',
 'action',
 'actions',
 'actives',
 'acute',
 'ad',
 'adam',
 'adamant',
 'adams',
 'add',
 'added',
 'adder',
 'adders',
 '

In [8]:
print(type(X1))
X1.shape

<class 'scipy.sparse.csr.csr_matrix'>


(474, 11276)

In [7]:
print(X1.nnz) # Get the count of explicitly-stored values (nonzeros)
100 * X1.nnz / (X1.shape[0]*X1.shape[1]) # Percentage of cells with nonzero values

55922


1.0462832826674928

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
model = MultinomialNB().fit(X1,df2['age'])

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X1, df2['age'],
random_state=10)

In [12]:
pModel = MultinomialNB()
pModel.fit(Xtrain,ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
y_model = pModel.predict(Xtest)

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_model)

0.9327731092436975

In [21]:
y_model.shape

(119,)

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(X1)

In [30]:
contents_tfidf = tfidf_transformer.transform(X1)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(contents_tfidf, df2['age'],
random_state=10)

In [35]:
newModel = MultinomialNB()
newModel.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
y2_model = newModel.predict(X_test)
accuracy_score(y_test, y2_model)

0.7142857142857143

In [2]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
 ]

In [3]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [4]:
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [7]:
type(X)

scipy.sparse.csr.csr_matrix

In [9]:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 2))
X2 = vectorizer2.fit_transform(corpus)
vectorizer2.get_feature_names()

['and',
 'and this',
 'document',
 'document is',
 'first',
 'first document',
 'is',
 'is the',
 'is this',
 'one',
 'second',
 'second document',
 'the',
 'the first',
 'the second',
 'the third',
 'third',
 'third one',
 'this',
 'this document',
 'this is',
 'this the']

In [11]:
X2.toarray()

array([[0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1]],
      dtype=int64)

In [138]:
vectorizer.inverse_transform(X)

[array(['this', 'is', 'the', 'first', 'document'], dtype='<U8'),
 array(['this', 'is', 'the', 'document', 'second'], dtype='<U8'),
 array(['this', 'is', 'the', 'and', 'third', 'one'], dtype='<U8'),
 array(['this', 'is', 'the', 'first', 'document'], dtype='<U8')]

In [145]:
vectorizer3 = CountVectorizer()

In [147]:
X3 = vectorizer.transform(corpus)

In [141]:
X3.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [148]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'