In [4]:
#import dependencies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [12]:
#read in CSV file
file = pd.read_csv('NLP_HAPA_brief.csv')
file.head()

Unnamed: 0,class,text
0,coping planning,work out early in the morning
1,risk perception,A bad digestive systems
2,Benefits,ability to enjoy life to the fullest
3,risk perception,aching joints
4,coping planning,Add a salad


In [15]:
#Preprocess text
nltk.download('stopwords')
corpus = []
for i in range(0, 1087):
   #search for string objects between A and Z
    review = re.sub('[^a-zA-Z]', ' ', file['text'][i])
    review = review.lower()
    review = review.split()
    #porterstemmer allows us to classify words based on a abbreviated stem of a word
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paxto\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [22]:
#creat a bag-of words model and reduces redundancy 
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = file.iloc[:,0].values


array(['coping planning', 'risk perception', 'Benefits', ...,
       'coping planning', 'coping planning', 'Benefits'], dtype=object)

In [23]:
#split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 21)


In [24]:
#Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [25]:
#Predicting the NB Test set results
y_pred_NB = classifier.predict(X_test)
cm_NB = confusion_matrix(y_test, y_pred_NB) 
cm_NB

array([[ 60,   0,  67],
       [  3,  62,  12],
       [  4,   1, 118]], dtype=int64)

In [40]:
#Classification report for Naive Bayes
from sklearn.metrics import classification_report

print(f'********************Naive Bayes Model Fit*********************')
print(classification_report(y_test, y_pred_NB)) 
print(f'Training Score: {classifier.score(X_train, y_train)}')
print(f'Testing Score: {classifier.score(X_test, y_test)}')

********************Naive Bayes Model Fit*********************
                 precision    recall  f1-score   support

       Benefits       0.90      0.47      0.62       127
coping planning       0.98      0.81      0.89        77
risk perception       0.60      0.96      0.74       123

       accuracy                           0.73       327
      macro avg       0.83      0.75      0.75       327
   weighted avg       0.80      0.73      0.73       327

Training Score: 0.9855263157894737
Testing Score: 0.8470948012232415


In [35]:
#Fitting Decision Tree model
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 21)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=21)

In [36]:
#Predicting the decision tree results and confusion matrix
y_pred_DT = classifier.predict(X_test)
cm_DT = confusion_matrix(y_test, y_pred_DT) 
cm_DT

array([[104,   4,  19],
       [ 16,  61,   0],
       [ 16,   1, 106]], dtype=int64)

In [41]:
#Classification report for 
print(f'*****************Decision Model Model*****************')
print(classification_report(y_test, y_pred_DT)) 
print(f'Training Score: {classifier.score(X_train, y_train)}')
print(f'Testing Score: {classifier.score(X_test, y_test)}')

*****************Decision Model Model*****************
                 precision    recall  f1-score   support

       Benefits       0.76      0.82      0.79       127
coping planning       0.92      0.79      0.85        77
risk perception       0.85      0.86      0.85       123

       accuracy                           0.83       327
      macro avg       0.85      0.82      0.83       327
   weighted avg       0.83      0.83      0.83       327

Training Score: 0.9855263157894737
Testing Score: 0.8470948012232415


In [39]:
#Fitting a random forest model
classifier = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 21)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=300, random_state=21)

In [42]:
#Predicting the Random Forest results and confusion matrix
y_pred_RF = classifier.predict(X_test)
cm_RF = confusion_matrix(y_test, y_pred_RF) 
cm_RF

array([[101,   3,  23],
       [  1,  62,  14],
       [  8,   1, 114]], dtype=int64)

In [43]:
print(f'*****************Random Forest Model*****************')
print(classification_report(y_test, y_pred_RF)) 
print(f'Training Score: {classifier.score(X_train, y_train)}')
print(f'Testing Score: {classifier.score(X_test, y_test)}')

*****************Random Forest Model*****************
                 precision    recall  f1-score   support

       Benefits       0.92      0.80      0.85       127
coping planning       0.94      0.81      0.87        77
risk perception       0.75      0.93      0.83       123

       accuracy                           0.85       327
      macro avg       0.87      0.84      0.85       327
   weighted avg       0.86      0.85      0.85       327

Training Score: 0.9855263157894737
Testing Score: 0.8470948012232415
