# Feature Engineering (Part 2)

This notebook is the continuation of the *feature engineering* part 1 started in the previous file.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

import nltk
nltk.download(['punkt', 'wordnet'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer            

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rafaelaqueiroz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rafaelaqueiroz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Load the dataset
# medication_reviews_dataset_to_train = pd.read_csv('/content/drive/Othercomputers/My MacBook Pro/Sentiment-Analysis-of-Medication-Reviews-Project/medication_reviews_dataset_to_train.csv', sep=',')
medication_reviews_dataset_to_train = pd.read_csv('/Users/rafaelaqueiroz/Sentiment-Analysis-of-Medication-Reviews-Project/medication_reviews_dataset_to_train.csv', sep=',')
medication_reviews_dataset_to_train.head()

Unnamed: 0,drugName,condition,rating,date,usefulCount,year,review_word_lemm,polarity,rating_classification
0,Valsartan,Left Ventricular Dysfunction,9.0,2012-05-20,27,2012,"['no', 'side', 'effect', 'take', 'combination'...",0.0,2
1,Guanfacine,ADHD,8.0,2010-04-27,192,2010,"['son', 'halfway', 'fourth', 'week', 'intuniv'...",0.188021,2
2,Lybrel,Birth Control,5.0,2009-12-14,17,2009,"['used', 'take', 'another', 'oral', 'contracep...",0.113636,1
3,Ortho Evra,Birth Control,8.0,2015-11-03,10,2015,"['first', 'time', 'using', 'form', 'birth', 'c...",0.2625,2
4,Buprenorphine / naloxone,Opiate Dependence,9.0,2016-11-27,37,2016,"['suboxone', 'completely', 'turned', 'life', '...",0.163333,2


#### 4.1 Creating the *X* and *y* variables from our trainning set

In [5]:
# Our independent variable (X) is going to be the "review_word_lemm" variable
X = medication_reviews_dataset_to_train.review_word_lemm
X

0         ['no', 'side', 'effect', 'take', 'combination'...
1         ['son', 'halfway', 'fourth', 'week', 'intuniv'...
2         ['used', 'take', 'another', 'oral', 'contracep...
3         ['first', 'time', 'using', 'form', 'birth', 'c...
4         ['suboxone', 'completely', 'turned', 'life', '...
                                ...                        
112324    ['mg', 'seems', 'work', 'every', 'nd', 'day', ...
112325    ['tekturna', 'day', 'effect', 'immediate', 'al...
112326    ['wrote', 'first', 'report', 'midoctober', 'no...
112327    ['ive', 'thyroid', 'medication', 'year', 'spen...
112328    ['ive', 'chronic', 'constipation', 'adult', 'l...
Name: review_word_lemm, Length: 112329, dtype: object

In [6]:
X.shape

(112329,)

In [7]:
type(X)

pandas.core.series.Series

In [8]:
# Our target or dependent variable (y) is going to be the 'rating_classification' variable
y = medication_reviews_dataset_to_train.rating_classification
y

0         2
1         2
2         1
3         2
4         2
         ..
112324    0
112325    2
112326    2
112327    2
112328    2
Name: rating_classification, Length: 112329, dtype: int64

In [9]:
y.shape

(112329,)

In [10]:
type(y)

pandas.core.series.Series

### Step 5: Applying NLP learning algorithms

At this moment, we are going to apply different learning algorithms and techniques from NLP in the dataset, such as:

1. *CountVectorizer* to convert the words into vector numeric representations,
2. *Bag of Words* (BoW) to count the frequency of words, 
3. *TF-IDF* to get the count of more unique (uncommon) words present in the reviews in comparison with the total words that occur in the whole dataset (considering, in this case, all the words used in the reviews), 
4. *Word2Vec* to understand the context of the words used in the reviews.

#### 5.1 CountVectorizer

In [11]:
# Import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # For BoW and TFIDF
cv = CountVectorizer()

# Define a function to vectorize the reviews
def create_df_word_matrix(text, vectorizer):
    doc_term_matrix = vectorizer.fit_transform(text)
    return pd.DataFrame(doc_term_matrix.toarray(), columns = vectorizer.get_feature_names_out())

In [12]:
create_df_word_matrix(X, cv)

Unnamed: 0,aa,aaa,aaaaaammazing,aaaaand,aaaaarg,aaaand,aaahh,aaand,aaccidentpain,aadd,...,zytiga,zytigaprednisone,zytram,zyvox,zzquill,zzzquil,zzzquill,zzzzap,zzzzz,zzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


With these results, we can see that the *CountVectorizer* object converted the words into a vector with a value of 0 or 1. Each word is now a feature that constitutes a column in our dataframe. In total, we have 66660 columns in which a word is represented by it, and 112329 rows or observations (each row represents one review).

In [13]:
reviews_to_be_vectorized = medication_reviews_dataset_to_train['review_word_lemm'].apply(lambda x: "".join(x))
reviews_to_be_vectorized

0         ['no', 'side', 'effect', 'take', 'combination'...
1         ['son', 'halfway', 'fourth', 'week', 'intuniv'...
2         ['used', 'take', 'another', 'oral', 'contracep...
3         ['first', 'time', 'using', 'form', 'birth', 'c...
4         ['suboxone', 'completely', 'turned', 'life', '...
                                ...                        
112324    ['mg', 'seems', 'work', 'every', 'nd', 'day', ...
112325    ['tekturna', 'day', 'effect', 'immediate', 'al...
112326    ['wrote', 'first', 'report', 'midoctober', 'no...
112327    ['ive', 'thyroid', 'medication', 'year', 'spen...
112328    ['ive', 'chronic', 'constipation', 'adult', 'l...
Name: review_word_lemm, Length: 112329, dtype: object

In [14]:
type(reviews_to_be_vectorized)

pandas.core.series.Series

In [15]:
X_reviews_to_be_vectorized = cv.fit_transform(reviews_to_be_vectorized)
type(X_reviews_to_be_vectorized)

scipy.sparse._csr.csr_matrix

In [16]:
X_reviews_to_be_vectorized.toarray() # It converts a sparse array in a dense array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
X_reviews_to_be_vectorized.shape

(112329, 66660)

#### 5.2 Bag of Words (BoW)

In [18]:
# # Call up the function that was defined earlier to view BoW as a DataFrame
# create_df_word_matrix(reviews_to_be_vectorized, CountVectorizer())

In [19]:
# Create a df with the count of the words used in the reviews
create_df_word_matrix(reviews_to_be_vectorized, cv)

Unnamed: 0,aa,aaa,aaaaaammazing,aaaaand,aaaaarg,aaaand,aaahh,aaand,aaccidentpain,aadd,...,zytiga,zytigaprednisone,zytram,zyvox,zzquill,zzzquil,zzzquill,zzzzap,zzzzz,zzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


BoW calculates the frequency of the words, so 1 means that we have the word present in the review, and 0 we don't.

#### 5.3 TF-IDF

In [20]:
# Fit a basic TFIDF Vectorizer and view the results
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(reviews_to_be_vectorized)
type(X_tfidf)

scipy.sparse._csr.csr_matrix

In [None]:
X_tfidf.toarray()

In [None]:
X_tfidf.shape

(112329, 66660)

In [None]:
# # Create a df to view TF-IDF result as a DataFrame
# create_df_word_matrix(reviews_to_be_vectorized, TfidfVectorizer(binary=False)) # Weights the rare words in comparison to the most ones used in the document

In [None]:
# Create a df with the weights of the words with TFIDF 
create_df_word_matrix(reviews_to_be_vectorized, tfidf_vect)

TF-IDF works by giving a higher weight to rare or more uncommon words present in the dataset (considering the total of words used, in our case, in the reviews).