# Model Testing

In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
import time


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics

from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

import re
import pickle

In [16]:
# Tokenizer class that must be defined before the saved model is used
class StemTokenize:
    """Callable tokenizer: split text into word tokens and Porter-stem each one."""

    def __init__(self):
        # Keep the attribute name `ps`: an instance restored by pickle gets its
        # attributes back without __init__ being run, so __call__ must read
        # the same name the instance was saved with.
        self.ps = PorterStemmer()

    def __call__(self, articles):
        """Return the list of stemmed tokens for the string `articles`."""
        stem = self.ps.stem
        return [stem(token) for token in word_tokenize(articles)]

In [2]:
# Load the evaluation dataset.
# The file is too large to be uploaded to GitHub; it can be downloaded from
# https://www.kaggle.com/datasets/kazanova/sentiment140
df = pd.read_csv(
    "../data/test_twitter.csv",
    encoding="ISO-8859-1",
)

# Content
It contains the following 6 fields:
- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- ids: the id of the tweet (2087)
- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- flag: The query (lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (robotickilldozr)
- text: the text of the tweet (Lyx is cool)

In [3]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
# Remove the columns that are not needed. The labels below are the values of
# the first data row, which ended up as the column names when the file was read.
df = df.drop(
    labels=[
        "1467810369",
        "Mon Apr 06 22:19:45 PDT 2009",
        "NO_QUERY",
        "_TheSpecialOne_",
    ],
    axis="columns",
)

In [5]:
# Preview the first five rows after dropping the unused columns
df.head(n=5)

Unnamed: 0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [6]:
# The first data row became the column labels when the file was read, so the
# two remaining columns are named "0" and the full text of the first tweet.
# Assign the real names by position instead of matching on that tweet text:
# a dict-based rename silently does nothing for a key that differs from the
# label by even one whitespace character.
# Order matters: column 0 is the label, column 1 is the tweet text.
# NOTE(review): reading the file with `header=None, names=[...]` would also
# keep the first tweet as data, but the column-drop cell would then need to
# use the new names as well.
df.columns = ["subreddit", "text"]

In [7]:
# Keep only the rows whose label is 0. This notebook treats label 0 as
# "has depression". The model was trained only on depression and vent data,
# so its predictions on other topics are not expected to be good.
# `.copy()` makes the result an independent DataFrame, so the in-place edits
# in later cells do not operate on a slice of the original frame.
df = df[df["subreddit"] == 0].copy()

In [8]:
# Count missing values in each column
df.isna().sum()

subreddit    0
text         0
dtype: int64

In [9]:
# Count rows that are exact repeats of an earlier row
df.duplicated(keep="first").sum()

9815

In [10]:
# Remove repeated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")

In [11]:
# Confirm that no duplicated rows remain
df.duplicated(keep="first").sum()

0

In [12]:
# Check the (rows, columns) shape of the cleaned data
df.shape

(790184, 2)

In [13]:
# The model's target label is 1, so relabel 0 -> 1.
# `replace` leaves every value other than 0 untouched, so re-running this cell
# is harmless; `map({0: 1})` would turn the already-converted 1s into NaN.
# `assign` returns a new frame instead of writing into a filtered slice.
df = df.assign(subreddit=df["subreddit"].replace({0: 1}))

In [14]:
# Evaluation features (tweet text) and labels
X_eva, y_eva = df["text"], df["subreddit"]

In [17]:
# Load the best saved model to predict on the evaluation data.
# NOTE(review): unpickling can execute arbitrary code; only load model files
# that come from a trusted source.
# NOTE(review): presumably the pickled model references StemTokenize, so that
# class must already be defined in this session before loading — confirm.
filename = '../model/model.sav'
with open(filename, 'rb') as model_file:  # ensures the file handle is closed
    model = pickle.load(model_file)

In [18]:
# Predict a label for every evaluation tweet
y_preds = model.predict(X_eva)

In [20]:
# Show the accuracy score of the predictions on the evaluation set.
# The score is just under one half, possibly because many words in this
# dataset are outside the vocabulary of our model.
# Next step: scan the word frequencies to improve the model.
metrics.accuracy_score(y_true=y_eva, y_pred=y_preds)

0.4925840057505594