In [1]:
###### Import fundamentals
import numpy as np
import pandas as pd
import seaborn as sns
import re

# Import nltk and download punkt, wordnet
import nltk

# Import word_tokenize and stopwords from nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from nltk.tag import pos_tag


# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import cross_val_score


# I will keep the resulting plots
%matplotlib inline

# Enable Jupyter Notebook's intellisense
%config IPCompleter.greedy=True

# We want to see whole content (non-truncated)
pd.set_option('display.max_colwidth', -1)

In [2]:
# Load the tweets
tweets = pd.read_csv("Dataset.csv")

# Print the first five rows
display(tweets.head())

Unnamed: 0,Tweets,Label
0,@lynn93630469 Support my little sister in her school by buying a laptop for her Online Class and a stable wifi connection in our home. She badly needs it because she only uses an android phone and a data connection. 😊 Whenever she has something to encode she borrows laptop from our cousin's.,Neutral
1,"Yan, tama yan. Dapat lang na nasa #1 &amp; #2 ang tags natin, aba! Pambawi sa puyat ko, hahaha. Alas-kwatro na ako nakatulog kanina dahil sa SBEN19 MAPA tapos gumising ng 7 AM para sa online class 😬 STREAM SBEN19MAPA @SB19Official #SBEN19MAPAOutNow #SBNineteenAtKalayaan2021",Neutral
2,Kabi-kbila na ang utang ko dahil sa online class n to🥺🥺 panload pa lng di ko n afford🥺 Need some help🙏🙏🥺,Negative
3,Goodmorning🌞 Online class is real😂,Neutral
4,umay sa globe fiber. goodluck pag may online class na talaga.,Negative


In [3]:
#Open the text file containing the Filipino Stopwords based from https://github.com/stopwords-iso/stopwords-tl

file = open("StopWords/flstopwords.txt", "r", encoding="utf8")
flstopwords = file.read().split("\n")
file.close()

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [4]:
#Clean data

"""
Created on Wed Aug  5 15:39:20 2020

@author: bhasfe
"""
def process_tweets(tweet):
            
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"can't", "can not", tweet)
    tweet = re.sub(r"n't", " not", tweet)
    tweet = re.sub(r"'ve", " have", tweet)
    tweet = re.sub(r"'ll", " will", tweet)
    tweet = re.sub(r"'re", " are", tweet)
    
    tweet = re.sub(r"'di", "hindi", tweet)
    tweet = re.sub(r"di", "hindi", tweet)
    
    # Remove links
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    #remove numbers
    tweet = re.sub(r'\d','', tweet)
    
    # Remove mentions and hashtag
    tweet = re.sub(r'\@\w+|\#','', tweet)
   
    # clean the words
    clean = word_tokenize(tweet)

    # Remove the English stop words
    clean = [token for token in clean if token not in stopwords.words("english")]
    
    #Remove the Filipino stop words
    clean = [token for token in clean if token not in flstopwords]
    
    # Remove non-alphabetic characters and keep the words contains three or more letters
    clean = [token for token in clean if token.isalpha() and len(token)>2]
    
    clean = ' '.join(clean)
    return clean
    
# Call the function and store the result into a new column
tweets["Processed"] = tweets["Tweets"].str.lower().apply(process_tweets)

display(tweets[["Processed"]].head(15))

Unnamed: 0,Processed
0,support little sister school buying laptop online class stable wifi connection home badly needs uses android phone data connection whenever something encode borrows laptop cousin
1,yan tama yan lang nasa amp tags natin aba pambawi puyat hahaha nakatulog kanina sben mapa tapos gumising online class stream sbenmapa sbenmapaoutnow sbnineteenatkalayaan
2,utang online class panload lng need
3,online class
4,umay globe fiber goodluck pag online class talaga
5,mad last october first semester terpaksa jahindi online class ended class semester last thursday sin hindid deserve
6,online class ayoko mag enroll
7,sinusulit lang yung year online class law school palaging bahay lang
8,today using lot khursus online class watching stream movie lastly playing game wow day
9,know hinhindi tlaga pwede online class nakatulog lecture hahahahahahaha


In [5]:
# Lemmatization & Stemming according to POS tagging
def NormalizeWithPOS(text):

    word_list = word_tokenize(text)
    rev = []
    lemmatizer = WordNetLemmatizer() 
    stemmer = PorterStemmer() 
    for word, tag in pos_tag(word_list):
        if tag.startswith('J'):
            w = lemmatizer.lemmatize(word, pos='a')
        elif tag.startswith('V'):
            w = lemmatizer.lemmatize(word, pos='v')
        elif tag.startswith('N'):
            w = lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('R'):
            w = lemmatizer.lemmatize(word, pos='r')
        else:
            w = word
        #w = stemmer.stem(w)
        rev.append(w)
    tweet = ' '.join(rev)
    return tweet

tweets["Processed"] = tweets["Processed"].apply(NormalizeWithPOS)
display(tweets[["Processed"]].head(15))

Unnamed: 0,Processed
0,support little sister school buy laptop online class stable wifi connection home badly need us android phone data connection whenever something encode borrow laptop cousin
1,yan tama yan lang nasa amp tag natin aba pambawi puyat hahaha nakatulog kanina sben mapa tapos gumising online class stream sbenmapa sbenmapaoutnow sbnineteenatkalayaan
2,utang online class panload lng need
3,online class
4,umay globe fiber goodluck pag online class talaga
5,mad last october first semester terpaksa jahindi online class end class semester last thursday sin hindid deserve
6,online class ayoko mag enroll
7,sinusulit lang yung year online class law school palaging bahay lang
8,today use lot khursus online class watch stream movie lastly play game wow day
9,know hinhindi tlaga pwede online class nakatulog lecture hahahahahahaha


In [6]:
#Get features

file = open("StopWords/tfidf_stops.txt", "r", encoding="utf8")
tfidf_stops = file.read().split("\n")
file.close()

enstopwords = set(stopwords.words('english'))
        
# Initialize a Tf-idf Vectorizer
vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words=enstopwords and flstopwords and tfidf_stops)

# Fit and transform the vectorizer corpus = [str (item) for item in corpus]
tfidf_matrix = vectorizer.fit_transform(str (item) for item in tweets["Processed"])

# Let's see what we have
tfidf_matrix

# Create a DataFrame for tf-idf vectors and display the first five rows
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns= vectorizer.get_feature_names())
display(tfidf_df.head())



Unnamed: 0,aaaaaaa,aabot,aabsent,aadjust,aalaga,aantay,aaral,aaralin,aasawa,aattend,...,ªðÿ,âœš,œðÿ,𝐂𝐡𝐚𝐢𝐧,𝐌𝐚𝐧𝐚𝐠𝐞𝐦𝐞𝐧𝐭,𝐎𝐧𝐥𝐢𝐧𝐞,𝐏𝐀𝐒𝐈𝐀,𝐏𝐮𝐛𝐥𝐢𝐜,𝐒𝐮𝐩𝐩𝐥𝐲,𝐓𝐫𝐚𝐢𝐧𝐢𝐧𝐠
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Select the features and the target
X = tfidf_matrix
y = tweets["Label"]

In [8]:
#split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = .20)

In [9]:
#function for confusion matrix
def confussionMatrix(cl,X_test,y_test):
    # Predict the labels
    y_pred = cl.predict(X_test)
    
    # Print the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix\n")
    print(cm)
    
    # Print the Classification Report
    cr = classification_report(y_test, y_pred)
    print("\n\nClassification Report\n")
    print(cr)
    
    return

Naive Bayes Classifiers

In [10]:
#1 Multinomial Naive Bayes
mnb = MultinomialNB()

#train our algorithm
mnb.fit(X_train, y_train)

#Test the trained classifier
predicted_class = mnb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(mnb,X_test,y_test)

Confusion Matrix

[[677 248   0]
 [238 669   1]
 [ 49 119   0]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.70      0.73      0.72       925
     Neutral       0.65      0.74      0.69       908
    Positive       0.00      0.00      0.00       168

    accuracy                           0.67      2001
   macro avg       0.45      0.49      0.47      2001
weighted avg       0.62      0.67      0.64      2001



In [11]:
#2 Complement Naive Bayes
cnb = ComplementNB()

#train our algorithm
cnb.fit(X_train, y_train)

#Test the trained classifier
predicted_class = cnb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(cnb,X_test,y_test)

Confusion Matrix

[[698 165  62]
 [276 515 117]
 [ 45  73  50]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.68      0.75      0.72       925
     Neutral       0.68      0.57      0.62       908
    Positive       0.22      0.30      0.25       168

    accuracy                           0.63      2001
   macro avg       0.53      0.54      0.53      2001
weighted avg       0.65      0.63      0.63      2001



In [12]:
#3 Bernoulli Naive Bayes classifier
nb = BernoulliNB()

#train our algorithm
nb.fit(X_train, y_train)

#Test the trained classifier
predicted_class = nb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(nb,X_test,y_test)

Confusion Matrix

[[630 264  31]
 [198 662  48]
 [ 40 114  14]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.73      0.68      0.70       925
     Neutral       0.64      0.73      0.68       908
    Positive       0.15      0.08      0.11       168

    accuracy                           0.65      2001
   macro avg       0.50      0.50      0.50      2001
weighted avg       0.64      0.65      0.64      2001



Applying imbalanced- learn

!pip install imbalanced-learn

In [13]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

In [18]:
print(y_train.value_counts())
print(y_resample.value_counts())

Neutral     3733
Negative    3584
Positive    683 
Name: Label, dtype: int64
Neutral     3733
Positive    3733
Negative    3733
Name: Label, dtype: int64


In [19]:
#1 Multinomial Naive Bayes

#retrain our algorithm
mnb.fit(X_resample, y_resample)

#Test the trained classifier
predicted_class = mnb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(mnb,X_test,y_test)

Confusion Matrix

[[670 157  98]
 [255 455 198]
 [ 37  46  85]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.70      0.72      0.71       925
     Neutral       0.69      0.50      0.58       908
    Positive       0.22      0.51      0.31       168

    accuracy                           0.60      2001
   macro avg       0.54      0.58      0.53      2001
weighted avg       0.65      0.60      0.62      2001



In [20]:
#2 Complement Naive Bayes

#retrain our algorithm
cnb.fit(X_resample, y_resample)

#Test the trained classifier
predicted_class = cnb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(cnb,X_test,y_test)

Confusion Matrix

[[679 145 101]
 [268 420 220]
 [ 36  38  94]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.69      0.73      0.71       925
     Neutral       0.70      0.46      0.56       908
    Positive       0.23      0.56      0.32       168

    accuracy                           0.60      2001
   macro avg       0.54      0.59      0.53      2001
weighted avg       0.65      0.60      0.61      2001



In [21]:
#3 Bernoulli Naive Bayes classifier

#retrain our algorithm
nb.fit(X_resample, y_resample)

#Test the trained classifier
predicted_class = nb.predict(X_test)

#call the function confussionMatrix to compute the accuracy of the model
confussionMatrix(nb,X_test,y_test)

Confusion Matrix

[[621 234  70]
 [194 592 122]
 [ 33  74  61]]


Classification Report

              precision    recall  f1-score   support

    Negative       0.73      0.67      0.70       925
     Neutral       0.66      0.65      0.65       908
    Positive       0.24      0.36      0.29       168

    accuracy                           0.64      2001
   macro avg       0.54      0.56      0.55      2001
weighted avg       0.66      0.64      0.65      2001



#predict a tweet

In [23]:
words = "ang saya ng online class"
clean = process_tweets(str.lower(words))
clean = NormalizeWithPOS(clean)
words = vectorizer.transform([clean])

In [24]:
mnb.predict(words)

array(['Positive'], dtype='<U8')

In [25]:
cnb.predict(words)

array(['Positive'], dtype='<U8')

In [26]:
nb.predict(words)

array(['Positive'], dtype='<U8')

In [22]:
import pickle
pickle.dump(mnb, open("MNB_model.pkl", 'wb'))
pickle.dump(cnb, open("CNB_model.pkl", 'wb'))
pickle.dump(nb, open("BNB_model.pkl", 'wb'))
pickle.dump(vectorizer, open("vectorizer.pkl", 'wb'))