## Data Modeling and Evaluation Steps (Model 2)

At this moment, we are going to create and implement the *Naive Bayes Classifer*.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

#### Step 1: Retake the *X* and *y* variables from our trainning set

In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the trainning dataset
# train_medication_reviews = pd.read_csv('/content/drive/Othercomputers/My MacBook Pro/Sentiment-Analysis-of-Medication-Reviews-Project/medication_reviews_dataset_to_train.csv', sep=',')
train_medication_reviews = pd.read_csv('/Users/rafaelaqueiroz/Sentiment-Analysis-of-Medication-Reviews-Project/medication_reviews_dataset_to_train.csv', sep=',')
train_medication_reviews

Unnamed: 0,drugName,condition,rating,date,usefulCount,year,review_word_lemm,polarity,rating_classification
0,Valsartan,Left Ventricular Dysfunction,9.0,2012-05-20,27,2012,"['no', 'side', 'effect', 'take', 'combination'...",0.000000,2
1,Guanfacine,ADHD,8.0,2010-04-27,192,2010,"['son', 'halfway', 'fourth', 'week', 'intuniv'...",0.188021,2
2,Lybrel,Birth Control,5.0,2009-12-14,17,2009,"['used', 'take', 'another', 'oral', 'contracep...",0.113636,1
3,Ortho Evra,Birth Control,8.0,2015-11-03,10,2015,"['first', 'time', 'using', 'form', 'birth', 'c...",0.262500,2
4,Buprenorphine / naloxone,Opiate Dependence,9.0,2016-11-27,37,2016,"['suboxone', 'completely', 'turned', 'life', '...",0.163333,2
...,...,...,...,...,...,...,...,...,...
112324,Carbamazepine,Trigeminal Neuralgia,1.0,2016-01-31,10,2016,"['mg', 'seems', 'work', 'every', 'nd', 'day', ...",0.000000,0
112325,Tekturna,High Blood Pressure,7.0,2010-02-07,18,2010,"['tekturna', 'day', 'effect', 'immediate', 'al...",-0.087500,2
112326,Campral,Alcohol Dependence,10.0,2015-05-31,125,2015,"['wrote', 'first', 'report', 'midoctober', 'no...",0.261905,2
112327,Thyroid desiccated,Underactive Thyroid,10.0,2015-09-19,79,2015,"['ive', 'thyroid', 'medication', 'year', 'spen...",0.201313,2


In [3]:
# As we already know from our previous notebook (notebook 3), our independent variable (X) is going to be the "review_word_lemm" variable
X_train = train_medication_reviews.review_word_lemm
X_train

0         ['no', 'side', 'effect', 'take', 'combination'...
1         ['son', 'halfway', 'fourth', 'week', 'intuniv'...
2         ['used', 'take', 'another', 'oral', 'contracep...
3         ['first', 'time', 'using', 'form', 'birth', 'c...
4         ['suboxone', 'completely', 'turned', 'life', '...
                                ...                        
112324    ['mg', 'seems', 'work', 'every', 'nd', 'day', ...
112325    ['tekturna', 'day', 'effect', 'immediate', 'al...
112326    ['wrote', 'first', 'report', 'midoctober', 'no...
112327    ['ive', 'thyroid', 'medication', 'year', 'spen...
112328    ['ive', 'chronic', 'constipation', 'adult', 'l...
Name: review_word_lemm, Length: 112329, dtype: object

In [6]:
X_train.shape

(112329,)

In [7]:
type(X_train)

pandas.core.series.Series

In [4]:
# As we know, our target or dependent variable (y) is going to be the 'rating_classification' variable
y_train = train_medication_reviews.rating_classification
y_train

0         2
1         2
2         1
3         2
4         2
         ..
112324    0
112325    2
112326    2
112327    2
112328    2
Name: rating_classification, Length: 112329, dtype: int64

In [9]:
y_train.shape

(112329,)

In [10]:
type(y_train)

pandas.core.series.Series

#### Step 2: Create the *X* and *y* variables from our testing set

Note: As this is the testing set, we are not going to apply any cleaning or processint to it. However, as we have done label encoding to the *rating* column, we would need to label encoding this column at this set as well since the model cannot predict the sentiment of the reviews from rating 1 to 10 as we are representing those numbers differently, such as: 0 - negative reviews, 1 - "neutral" reviews, and 2 - positive reviews.

In [5]:
# Load the test dataset
# test_drug_reviews_df = pd.read_csv('/content/drive/MyDrive/Data-Science-Other-Materials/Data-Scientist-Bootcamp/Sentiment-Analysis-of-Drug-Reviews/drugsComTest_raw.tsv', delimiter='\t')
test_drug_reviews_df = pd.read_csv('/Users/rafaelaqueiroz/Sentiment-Analysis-of-Medication-Reviews-Project/drugsComTest_raw.tsv', delimiter='\t')
test_drug_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [6]:
# Import library to label encode the rating column
from sklearn.preprocessing import LabelEncoder

# Define the bin edges and labels (0 = 'negative', 1 = 'neutral', 2 = 'positive')
bin_edges = [0, 4, 6, 10]  # Ratings 1-4 are negative, 5-6 are neutral, 7-10 are positive
bin_labels = ['negative', 'neutral', 'positive'] # The rating_classification column should now only have 0, 1, or 2 values

# Use cut to bin the "rating" column and create the new column called "rating_classification"
test_drug_reviews_df['rating_classification'] = pd.cut(test_drug_reviews_df['rating'], bins=bin_edges, labels=bin_labels)

# Instantiate LabelEncoder and fit_transform the new column
le = LabelEncoder()
test_drug_reviews_df['rating_classification'] = le.fit_transform(test_drug_reviews_df['rating_classification'])
test_drug_reviews_df.head(5)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,rating_classification
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22,2
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17,2
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3,2
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,2
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4,2


In [13]:
type(test_drug_reviews_df)

pandas.core.frame.DataFrame

In [7]:
test_drug_reviews_df.shape

(53766, 8)

In [8]:
X_test = test_drug_reviews_df['review']
X_test

0        "I&#039;ve tried a few antidepressants over th...
1        "My son has Crohn&#039;s disease and has done ...
2                            "Quick reduction of symptoms"
3        "Contrave combines drugs that were used for al...
4        "I have been on this birth control for one cyc...
                               ...                        
53761    "I have taken Tamoxifen for 5 years. Side effe...
53762    "I&#039;ve been taking Lexapro (escitaploprgra...
53763    "I&#039;m married, 34 years old and I have no ...
53764    "I was prescribed Nucynta for severe neck/shou...
53765                                        "It works!!!"
Name: review, Length: 53766, dtype: object

In [16]:
type(X_test)

pandas.core.series.Series

In [17]:
X_test.shape

(53766,)

In [9]:
y_test = test_drug_reviews_df.rating_classification
y_test

0        2
1        2
2        2
3        2
4        2
        ..
53761    2
53762    2
53763    2
53764    0
53765    2
Name: rating_classification, Length: 53766, dtype: int64

In [19]:
type(y_test)

pandas.core.series.Series

In [20]:
y_test.shape

(53766,)

#### Step 3.2: Create a pipeline with the *Naive Bayes Classifier*

In [10]:
# Import libraries to create the pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # For BoW and TFIDF
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB # We use Multinomial because we have 3 classifications
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [25]:
# Create a function to make the pipeline

def pipeline_nbc():
    # X is the feature matrix
    X_train = train_medication_reviews.review_word_lemm
    # y is the label vector
    y_train = train_medication_reviews.rating_classification
    
    pipeline_nbc = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB(fit_prior=False, class_prior=[0.3, 0.3, 0.4])) # Initialize the Multinomial Naive Bayes
])

    # Train the model on the training set
    pipeline_nbc.fit(X_train, y_train)

    # Predict the labels for the test set
    X_test = test_drug_reviews_df['review']
    y_test = test_drug_reviews_df.rating_classification
    y_pred = pipeline_nbc.predict(X_test)

    # Calculate the evaluation metrics
    print(classification_report(y_test, y_pred))
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    print(confusion_mat)

pipeline_nbc()

              precision    recall  f1-score   support

           0       0.67      0.47      0.55     13497
           1       0.15      0.22      0.18      4829
           2       0.77      0.81      0.79     35440

    accuracy                           0.67     53766
   macro avg       0.53      0.50      0.51     53766
weighted avg       0.69      0.67      0.68     53766

[[ 6351  1406  5740]
 [ 1020  1046  2763]
 [ 2069  4553 28818]]


These results show that:

- The precision for class 0 is 0.67, which means that when the classifier predicts a sample belongs to class 0, it is correct 67% of the time. The precision for class 1 is 0.15, which means that when the classifier predicts a sample belongs to class 1, it is correct only 15% of the time. The precision for class 2 is 0.77, which means that when the classifier predicts a sample belongs to class 2, it is correct 77% of the time.

- The recall for class 0 is 0.47, which means that the classifier only correctly identified 47% of the samples that actually belong to class 0. The recall for class 1 is 0.22, which means that the classifier correctly identified 22% of the samples that actually belong to class 1. The recall for class 2 is 0.81, which means that the classifier correctly identified 81% of the samples that actually belong to class 2.

- The F1 score for class 0 is 0.55, for class 1 is 0.18, and for class 2 is 0.79.

- Support: Support refers to the number of samples in each class. In your case, there are 13,497 samples in class 0, 4,829 samples in class 1, and 35,440 samples in class 2.

- Accuracy: Accuracy measures the proportion of samples that the classifier correctly classified. The accuracy is 0.67, which means that the classifier correctly classified 67% of the samples.

Let's try different parameters with *GridSearchCV( )* to see if we can improve these results.

In [27]:
# Let's try different parameters

# Define the pipeline
pipeline_mnb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB())
])

# Define the parameter grid for the GridSearchCV
parameters = {
    'vect__max_df': [0.5, 0.75, 1.0],
    'vect__max_features': [None, 5000, 10000],
    'tfidf__use_idf': [True, False],
    'classifier__alpha': [0.1, 0.5, 1.0],
    'classifier__fit_prior': [True, False],
    'classifier__class_prior': [[0.2, 0.4, 0.4], [0.3, 0.3, 0.4], [0.4, 0.3, 0.3]]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(pipeline_mnb, parameters, cv=5, scoring='f1_weighted', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'classifier__alpha': 0.5, 'classifier__class_prior': [0.3, 0.3, 0.4], 'classifier__fit_prior': True, 'tfidf__use_idf': False, 'vect__max_df': 0.5, 'vect__max_features': 10000}
Best score: 0.7293727960153751


In [11]:
# Create new pipeline with the parameters updated

# X is the feature matrix
X_train = train_medication_reviews.review_word_lemm
# y is the label vector
y_train = train_medication_reviews.rating_classification

# Define the pipeline with the best parameters
pipeline_mnbc = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, max_features=10000)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('classifier', MultinomialNB(alpha=0.5, fit_prior=True, class_prior=[0.3, 0.3, 0.4]))
])

# Train the model on the training set
pipeline_mnbc.fit(X_train, y_train)

# Predict the labels for the testing set
X_test = test_drug_reviews_df['review']
y_test = test_drug_reviews_df.rating_classification
y_pred = pipeline_mnbc.predict(X_test)

# Calculate the evaluation metrics for the testing set
print(classification_report(y_test, y_pred))
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
print(confusion_mat)

              precision    recall  f1-score   support

           0       0.71      0.51      0.59     13497
           1       0.21      0.40      0.28      4829
           2       0.83      0.81      0.82     35440

    accuracy                           0.70     53766
   macro avg       0.58      0.58      0.56     53766
weighted avg       0.74      0.70      0.72     53766

[[ 6920  2704  3873]
 [  882  1935  2012]
 [ 1982  4598 28860]]


In [12]:
# Predict the labels for the training set
X_train = train_medication_reviews.review_word_lemm
y_train = train_medication_reviews.rating_classification
y_pred_train = pipeline_mnbc.predict(X_train)
print(y_pred_train)

[2 2 1 ... 2 2 2]


In [13]:
# Calculate and print the evaluation metrics for the training set
print(classification_report(y_train, y_pred_train))
confusion_mat = confusion_matrix(y_train, y_pred_train)
print(confusion_mat)

              precision    recall  f1-score   support

           0       0.69      0.61      0.65     27839
           1       0.28      0.30      0.29      9993
           2       0.83      0.86      0.85     74497

    accuracy                           0.75    112329
   macro avg       0.60      0.59      0.60    112329
weighted avg       0.75      0.75      0.75    112329

[[17045  2618  8176]
 [ 2255  3040  4698]
 [ 5259  5282 63956]]


In [23]:
# Import joblib or picke
import joblib

# Save the trained model to a file
joblib.dump(pipeline_mnbc, 'mnbc_model.joblib')

['mnbc_model.joblib']

In [24]:
print(pipeline_mnbc)

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5, max_features=10000)),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('classifier',
                 MultinomialNB(alpha=0.5, class_prior=[0.3, 0.3, 0.4]))])
