# Poem Genres Categorization

In [1]:
## Libraries used
import pandas as pd
import numpy as np
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the poems dataset from 'Poems.csv' into a pandas DataFrame,
# using the first CSV column as the row index.
data = pd.read_csv('Poems.csv',index_col = 0)

In [3]:
# Preview the first rows to check the columns (poem text and genre label)
data.head()

Unnamed: 0,content,type
0,"Why didst thou promise such a beauteous day,\r...",Nature
1,"The welcome Sun from sea Freake is returned,\r...",Nature
2,"I met a courtier riding on the plain,\r\nWell-...",Nature
3,"Walking the fields a wantcatcher I spied,\r\nT...",Nature
4,"Fishing, if I a fisher may protest,\r\nOf plea...",Nature


In [4]:
# Summary of the frame: row count, column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 748 entries, 0 to 747
Data columns (total 2 columns):
content    748 non-null object
type       748 non-null object
dtypes: object(2)
memory usage: 17.5+ KB


In [5]:
# Shuffle the rows reproducibly (fixed seed) and renumber them 0..n-1.
# drop=True discards the old index directly instead of first materialising it
# as a column and deleting it afterwards, so this works whatever the old
# index happens to be named.
data = data.sample(frac=1, random_state=10).reset_index(drop=True)

In [6]:
# Target (dependent) variable: the genre label of each poem
y = data['type']
# Distinct genre labels present in the dataset
y.unique()

array(['Nature', 'sad', 'Love', 'peace'], dtype=object)

In [7]:
# Data preprocessing: for every poem, replace non-letters with spaces,
# lowercase, tokenize on whitespace, drop English stop words and stem the rest.
ps = PorterStemmer()  # a single stemmer instance is reused for the whole corpus
# Build the stop-word set once; rebuilding it for every word (as before) made
# the loop needlessly slow.
stop_words = set(stopwords.words('english'))

container = []
# Iterate over the column itself rather than a hard-coded range(748), so the
# cell keeps working if the dataset size changes.
for text in data['content']:
    poem = re.sub('[^a-zA-Z]', ' ', text)  # converting all the irrelevant characters into space
    poem = poem.lower().split()  # lowercase, then tokenize the string into words
    poem = [ps.stem(word) for word in poem if word not in stop_words]  # removing stop words and stemming the remaining words
    container.append(" ".join(poem))

In [8]:
# Hold out 20% of the poems for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    container,
    y,
    test_size=0.2,
    random_state=16,
)

In [9]:
# Bag-of-words vectorizer with default settings
cv = CountVectorizer()

In [10]:
# converting all the tokens into features for modeling:
# the vectorizer is fitted on the training texts only and then applied,
# without re-fitting, to the test texts
count_train = cv.fit_transform(X_train)
count_test = cv.transform(X_test)

## Classifier 1 - Naive Bayes

In [11]:
# defining naive bayes classifier (constructed with its default settings)
nb_classifier = MultinomialNB()

In [12]:
# Train on the training counts, then predict genres for the held-out poems
pred = nb_classifier.fit(count_train, y_train).predict(count_test)

# Overall accuracy on the test set
score = metrics.accuracy_score(y_test, pred)
print(score)

# Confusion matrix with the classes listed in a fixed genre order
cm = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['Love', 'Nature', 'peace', 'sad'],
)
print(cm)

0.68
[[56 11  0  1]
 [12 20  0  6]
 [ 5  0  3  8]
 [ 4  0  1 23]]


### Looking at the model score, we can say that our model is 68% accurate at predicting the genres, but we may be able to increase this by adjusting the value of alpha.

In [13]:
# Sweep the smoothing parameter alpha over [0, 1) in steps of 0.01 and
# print the test-set accuracy obtained for each value.

def train_and_predict(alpha):
    """Fit a MultinomialNB with the given ``alpha`` and return its test accuracy.

    The model is trained on ``count_train``/``y_train`` and scored on
    ``count_test``/``y_test`` taken from the notebook namespace.
    """
    # NOTE(review): the score comes from the test split, so choosing alpha
    # from these numbers gives an optimistic accuracy estimate.
    model = MultinomialNB(alpha=alpha)
    model.fit(count_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(count_test))

alphas = np.arange(0,1,0.01)
for alpha in alphas:
    accuracy = train_and_predict(alpha)
    print('Alpha: ', alpha)
    print('Score: ', accuracy)
    print()


Alpha:  0.0
Score:  0.6733333333333333

Alpha:  0.01
Score:  0.7333333333333333

Alpha:  0.02
Score:  0.7333333333333333

Alpha:  0.03
Score:  0.7333333333333333

Alpha:  0.04
Score:  0.7333333333333333

Alpha:  0.05
Score:  0.7333333333333333

Alpha:  0.06
Score:  0.7333333333333333

Alpha:  0.07
Score:  0.74

Alpha:  0.08
Score:  0.74

Alpha:  0.09
Score:  0.74

Alpha:  0.1
Score:  0.7333333333333333

Alpha:  0.11
Score:  0.7266666666666667

Alpha:  0.12
Score:  0.7333333333333333

Alpha:  0.13
Score:  0.7333333333333333

Alpha:  0.14
Score:  0.7333333333333333

Alpha:  0.15
Score:  0.7333333333333333

Alpha:  0.16
Score:  0.7266666666666667

Alpha:  0.17
Score:  0.7266666666666667

Alpha:  0.18
Score:  0.7266666666666667

Alpha:  0.19
Score:  0.7266666666666667

Alpha:  0.2
Score:  0.7266666666666667

Alpha:  0.21
Score:  0.7266666666666667

Alpha:  0.22
Score:  0.7333333333333333

Alpha:  0.23
Score:  0.7333333333333333

Alpha:  0.24
Score:  0.74

Alpha:  0.25
Score:  0.74

Alpha: 

  'setting alpha = %.1e' % _ALPHA_MIN)


 0.7066666666666667

Alpha:  0.42
Score:  0.7066666666666667

Alpha:  0.43
Score:  0.7066666666666667

Alpha:  0.44
Score:  0.7066666666666667

Alpha:  0.45
Score:  0.7066666666666667

Alpha:  0.46
Score:  0.7066666666666667

Alpha:  0.47000000000000003
Score:  0.7066666666666667

Alpha:  0.48
Score:  0.7066666666666667

Alpha:  0.49
Score:  0.7066666666666667

Alpha:  0.5
Score:  0.7066666666666667

Alpha:  0.51
Score:  0.7066666666666667

Alpha:  0.52
Score:  0.7066666666666667

Alpha:  0.53
Score:  0.7066666666666667

Alpha:  0.54
Score:  0.7

Alpha:  0.55
Score:  0.7

Alpha:  0.56
Score:  0.7

Alpha:  0.5700000000000001
Score:  0.7

Alpha:  0.58
Score:  0.7

Alpha:  0.59
Score:  0.7

Alpha:  0.6
Score:  0.7

Alpha:  0.61
Score:  0.7

Alpha:  0.62
Score:  0.7

Alpha:  0.63
Score:  0.7

Alpha:  0.64
Score:  0.7

Alpha:  0.65
Score:  0.7

Alpha:  0.66
Score:  0.7

Alpha:  0.67
Score:  0.7

Alpha:  0.68
Score:  0.7

Alpha:  0.6900000000000001
Score:  0.7

Alpha:  0.7000000000000001
Sco

## By observing the scores over the various alphas, we can conclude that choosing alpha = 0.07 makes our model 74% accurate at predicting the genres of the poems, which is a good improvement.

### Now I am going to try a different modeling approach, using cross-validation and hyper-parameter tuning, in the hope that the model performs better.

In [14]:
# Defining independent variables set: bag-of-words counts for the full corpus.
# A separate vectorizer is used here so that `cv`, which was fitted on the
# training split and produced count_train/count_test, is not silently
# re-fitted on all poems (its vocabulary would otherwise no longer match the
# models trained on count_train).
X1 = CountVectorizer().fit_transform(container)

In [15]:
# defining the value of alpha as a parameter-grid: candidates 0, 0.01, ..., 0.99
# NOTE(review): the grid includes alpha = 0, which is presumably what triggers
# the "setting alpha = ..." warnings when the search is fitted -- confirm.
alpha = np.arange(0,1,0.01)
param_grid = {'alpha':alpha}

In [16]:
# defining a fresh naive bayes classifier to be tuned by the grid search
nb_classifier = MultinomialNB()

In [17]:
# defining grid search object with naive bayes classifier and 10-fold cross validation;
# every alpha in param_grid is evaluated
nb_classifier_cv = GridSearchCV(nb_classifier, param_grid, cv = 10)

In [18]:
# fitting the model with cross validation on the full feature matrix X1 and labels y
nb_classifier_cv.fit(X1,y)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([0.  , 0.01, ..., 0.98, 0.99])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
# Best parameter setting found by the grid search and the corresponding score
print(nb_classifier_cv.best_params_)
print(nb_classifier_cv.best_score_)

{'alpha': 0.35000000000000003}
0.6911764705882353


## With this approach the cross-validated accuracy of our model comes out to be approximately 69% (roughly 70%), while the previous method gave 74%, so our average model accuracy is about 72%.

## Classifier 2 - Random Forest

In [20]:
# defining the classifier: a random forest with n_estimators = 300 and a
# fixed random_state so the result is reproducible
rf_classifier = RandomForestClassifier(n_estimators = 300, random_state=42)

In [21]:
# fitting the classifier on the training set (same bag-of-words counts used for Naive Bayes)
rf_classifier.fit(count_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [22]:
# predicting the genres of the poems in the test set
pred1 = rf_classifier.predict(count_test)

In [23]:
# Overall accuracy of the random forest on the test set
score = metrics.accuracy_score(y_test, pred1)
print(score)

# Confusion matrix with the classes listed in a fixed genre order
cm = metrics.confusion_matrix(
    y_test,
    pred1,
    labels=['Love', 'Nature', 'peace', 'sad'],
)
print(cm)

0.64
[[57 11  0  0]
 [14 20  0  4]
 [ 9  0  3  4]
 [11  1  0 16]]


## The accuracy of the Random Forest model is 64%, which is quite low compared to the Naive Bayes model.