In [3]:
###### Import fundamentals
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import matplotlib.pyplot as plt
import seaborn as sns
import re
import TglStemmer

# Import nltk and download punkt, wordnet
import nltk

#import warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import word_tokenize and stopwords from nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer 
from nltk.tag import pos_tag


# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# I will keep the resulting plots
%matplotlib inline

# Enable Jupyter Notebook's intellisense
%config IPCompleter.greedy=True

# We want to see whole content (non-truncated)
#pd.set_option('display.max_colwidth', None)

In [4]:
# Load the tweets
tweets = pd.read_csv("F.csv")

# Print the first five rows
display(tweets.head())

# Print the summary statistics
#print(tweets.describe())

# Drop duplicated rows
tweets.drop_duplicates(inplace=True)

# Print the info
#print(tweets.info())

Unnamed: 0,Tweets,Label
0,@lynn93630469 Support my little sister in her ...,Neutral
1,"Yan, tama yan. Dapat lang na nasa #1 &amp; #2 ...",Neutral
2,Kabi-kbila na ang utang ko dahil sa online cla...,Negative
3,Goodmorning🌞 Online class is real😂,Positive
4,umay sa globe fiber. goodluck pag may online c...,Negative


In [5]:
#Open the text file containing the Filipino Stopwords based from https://github.com/stopwords-iso/stopwords-tl

file = open("StopWords/flstopwords.txt", "r", encoding="utf8")
flstopwords = file.read().split("\n")
file.close()

In [6]:
def process_tweets(tweet):
            
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"can't", "can not", tweet)
    tweet = re.sub(r"n't", " not", tweet)
    tweet = re.sub(r"'ve", " have", tweet)
    tweet = re.sub(r"'ll", " will", tweet)
    tweet = re.sub(r"'re", " are", tweet)
    
    tweet = re.sub(r"'di", "hindi", tweet)
    
    tweet = re.sub(r"di", "hindi", tweet)
    
    # Remove links
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    #remove numbers
    tweet = re.sub(r'\d','', tweet)
    
    # Remove mentions and hashtag
    tweet = re.sub(r'\@\w+|\#','', tweet)
   
    # clean the words
    clean = word_tokenize(tweet)

    # Remove the English stop words
    clean = [token for token in clean if token not in stopwords.words("english")]
    
    #Remove the Filipino stop words
    clean = [token for token in clean if token not in flstopwords]
    
    # Remove non-alphabetic characters and keep the words contains three or more letters
    clean = [token for token in clean if token.isalpha() and len(token)>2]
    
    clean = ' '.join(clean)
    return clean
    
# Call the function and store the result into a new column
tweets["Processed"] = tweets["Tweets"].str.lower().apply(process_tweets)
#tweets["Content"].str.lower().apply(process_tweets)

display(tweets[["Processed"]].head(15))

Unnamed: 0,Processed
0,support little sister school buying laptop onl...
1,yan tama yan lang nasa amp tags natin aba pamb...
2,utang online class panload lng need
3,online class
4,umay globe fiber goodluck pag online class talaga
5,mad last october first semester terpaksa jahin...
6,online class ayoko mag enroll
7,sinusulit lang yung year online class law scho...
8,today using lot khursus online class watching ...
9,know hinhindi tlaga pwede online class nakatul...


In [None]:
def NormalizeWithPOS(text):
    # Lemmatization & Stemming according to POS tagging

    word_list = word_tokenize(text)
    rev = []
    lemmatizer = WordNetLemmatizer() 
    stemmer = PorterStemmer() 
    for word, tag in pos_tag(word_list):
        if tag.startswith('J'):
            w = lemmatizer.lemmatize(word, pos='a')
        elif tag.startswith('V'):
            w = lemmatizer.lemmatize(word, pos='v')
        elif tag.startswith('N'):
            w = lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('R'):
            w = lemmatizer.lemmatize(word, pos='r')
        else:
            w = word
        w = stemmer.stem(w)
        rev.append(w)
    tweet = ' '.join(rev)
    return tweet

In [None]:
tweets["Processed"] = tweets["Processed"].apply(NormalizeWithPOS)

In [None]:
enstopwords = set(stopwords.words('english'))
        
# Initialize a Tf-idf Vectorizer
vectorizer = TfidfVectorizer(idf_id=True,max_df=0.90, min_df=2, stop_words=enstopwords and flstopwords)

# Fit and transform the vectorizer corpus = [str (item) for item in corpus]
tfidf_matrix = vectorizer.fit_transform(str (item) for item in tweets["Processed"])

# Let's see what we have
tfidf_matrix

# Create a DataFrame for tf-idf vectors and display the first five rows
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns= vectorizer.get_feature_names())
display(tfidf_df)

In [None]:
# Select the features and the target
X = tfidf_matrix
#X = tweets["Processed"] 
y = tweets["Label"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = .20)

In [None]:
#The type of X_train_dtm and X_test_dtm is scipy.sparse.csr.csr_matrix
#Before model selection, You need to convert it into numpy sparse format using toarray() method
print('Before conversion:')
print('X_train_dtm: ', type(X_train))
print('X_test_dtm: ', type(X_test))
X_train = X_train.toarray()
X_test  = X_test.toarray()
print('After conversion:')
print('X_train_dtm: ', type(X_train))
print('X_test_dtm: ', type(X_test))

In [None]:
#3 Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB #this works best for text classification
mnb = MultinomialNB()
#train our algorithm
mnb.fit(X_train, y_train)
#Test the trained classifier
predicted_class = mnb.predict(X_test)
print('Accuracy of MNB for this dataset: %3.2f' %  accuracy_score(y_test, predicted_class))

In [None]:
words = "ang ganda ng online class"
words = vectorizer.transform([str (item) for item in words])

In [None]:
print(str (item) for item in words)
mnb.predict(words[0])

In [None]:
import pickle
# Save the model
pickle.dump(mnb, open("model.pkl", 'wb'))
pickle.dump(vectorizer, open("vectorizer.pkl", 'wb'))

In [None]:
from joblib import dump, load
dump(mnb, 'model.joblib') 

In [None]:
mnb = load('model.joblib') 

In [None]:
words = "ang ganda ng online class"
words = vectorizer.transform(str (item) for item in words)

In [None]:
mnb.predict(words)