# Twitter Sentiment Analyzer: Model Trainer Program

## Libraries

In [1]:
import twitter
import nltk
import pandas as pd
import numpy as np
import pickle
import re as charRemoval
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
import time
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/mmvc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mmvc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing Training Data Set

In [2]:
# Load the labelled tweet corpus into a pandas DataFrame. The CSV is read
# without a header row and only columns 0 (numeric polarity) and 5 (tweet
# text) are kept.
filename = "trainingdataset17k.csv"
# Maximum number of rows taken from each polarity class.
SAMPLES_PER_CLASS = 8500
dataFramTraining = pd.read_csv(
    "./Data/" + filename,
    header=None,
    usecols=[0, 5],
    names=['tweet polarity', 'text'],
    encoding="ISO-8859-1",
)
# Split the rows by polarity code: 0 is labelled negative and 4 positive below.
dataFramTrainingNegative = dataFramTraining.loc[dataFramTraining['tweet polarity'] == 0]
dataFramTrainingPositive = dataFramTraining.loc[dataFramTraining['tweet polarity'] == 4]
# Take the first SAMPLES_PER_CLASS rows of each class.
SplittedTrainSet = [
    dataFramTrainingNegative.iloc[0:SAMPLES_PER_CLASS, :],
    dataFramTrainingPositive.iloc[0:SAMPLES_PER_CLASS, :],
]
# Stack the negative rows followed by the positive rows.
MergedArrays = pd.concat(SplittedTrainSet)
# Boolean masks and the text label that corresponds to each mask.
polarity = [(MergedArrays['tweet polarity'] == 0), (MergedArrays['tweet polarity'] == 4)]
negative_positive = ['negative', 'positive']
# Replace the polarity numbers with text labels. The default is given as a
# string explicitly: np.select's implicit default is the integer 0, which has
# no common dtype with string choices and makes recent NumPy versions raise
# TypeError. Every row here matches one of the two masks, so the default value
# itself never appears in the output.
MergedArrays['label'] = np.select(polarity, negative_positive, default='unknown')
# One dict per tweet with keys 'tweet polarity', 'text' and 'label'.
trainingset = MergedArrays.to_dict('records')

## PreProcessing Tweets

In [3]:
#class to clean the tweets
class TweetsPreProcessing:
    """Normalize raw tweet text into lists of meaningful tokens.

    Cleaning lowercases the text, replaces links with the placeholder
    ``URL`` and user mentions with ``AT_USER``, strips the leading ``#``
    from hashtags, tokenizes with ``word_tokenize`` and finally drops every
    token found in ``self.notimportantWords``.
    """

    # Compiled once at class creation instead of on every tweet. Raw strings
    # keep ``\.`` and ``\s`` as regex escapes; in a normal string literal they
    # are invalid escape sequences that newer Python versions warn about.
    _URL_PATTERN = charRemoval.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _USERNAME_PATTERN = charRemoval.compile(r'@[^\s]+')
    _HASHTAG_PATTERN = charRemoval.compile(r'#([^\s]+)')

    def __init__(self):
        """Build the set of tokens that are discarded during cleaning.

        The set contains NLTK's English stop words, every punctuation
        character and the two placeholders inserted by ``TweetCleaner``.
        """
        self.notimportantWords = set(
            stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL']
        )

    def TweetsProcessing(self, tweetlists):
        """Clean every tweet in ``tweetlists``.

        Args:
            tweetlists: iterable of mappings, each with a ``"text"`` and a
                ``"label"`` entry.

        Returns:
            A list of ``(tokens, label)`` tuples in input order, where
            ``tokens`` is the result of ``TweetCleaner`` on the tweet text.
        """
        return [(self.TweetCleaner(t["text"]), t["label"]) for t in tweetlists]

    def TweetCleaner(self, t):
        """Return the list of important tokens of the tweet text ``t``.

        Args:
            t: raw tweet text.

        Returns:
            The tokens of the cleaned text, in order, excluding every token
            contained in ``self.notimportantWords``.
        """
        t = t.lower()
        # The placeholders are inserted after lowercasing, so they stay upper
        # case and match the entries in notimportantWords.
        t = self._URL_PATTERN.sub('URL', t)
        t = self._USERNAME_PATTERN.sub('AT_USER', t)
        # Keep the hashtag text, drop only the '#'.
        t = self._HASHTAG_PATTERN.sub(r'\1', t)
        return [w for w in word_tokenize(t) if w not in self.notimportantWords]

## NLP using NLTK

In [4]:
def NLTKProcessor1(Training_Data_Pre):
    """Build the vocabulary of the preprocessed training data.

    Training_Data_Pre is an iterable of (tokens, label) pairs. Every token of
    every pair is gathered into one flat list, which is handed to
    nltk.FreqDist; the keys of that distribution are returned.
    """
    all_tokens = [
        token
        for (tokens, _label) in Training_Data_Pre
        for token in tokens
    ]
    frequency_table = nltk.FreqDist(all_tokens)
    return frequency_table.keys()

In [5]:
def NLTKProcessor2(tweet):
    """Turn a tokenized tweet into a feature dictionary.

    For every word in the module-level vocabulary ``buildedvocab`` the result
    holds one entry keyed 'contains(<word>)' whose value is True when the word
    occurs in ``tweet`` and False otherwise.
    """
    # Set membership keeps each per-word lookup cheap.
    tokens_present = set(tweet)
    return {
        'contains(%s)' % word: (word in tokens_present)
        for word in buildedvocab
    }

In [6]:
# Clean every labelled training tweet into a (tokens, label) pair.
TrainSet_PreProccesing = TweetsPreProcessing().TweetsProcessing(tweetlists=trainingset)
# Collect the vocabulary found across all preprocessed tweets.
buildedvocab = NLTKProcessor1(Training_Data_Pre=TrainSet_PreProccesing)
# Build the training features by handing NLTKProcessor2 and the preprocessed
# tweets to NLTK's apply_features.
features_training = nltk.classify.apply_features(
    NLTKProcessor2,
    TrainSet_PreProccesing,
)

In [7]:
%%time
# Fit a Bernoulli Naive Bayes model, wrapped in NLTK's SklearnClassifier,
# on the training features.
print("Training with file: ", filename)
SKLearnClassifier = (
    SklearnClassifier(BernoulliNB())
    .train(features_training)
)
print("Building Training Module Accomplished")

Training with file:  trainingdataset17k.csv
Building Training Module Accomplished
CPU times: user 2min 49s, sys: 9.09 s, total: 2min 58s
Wall time: 2min 58s


## Saving the model for use in another notebook

In [8]:
# Save the trained classifier to disk with pickle.
# The with-block closes the file even if pickle.dump raises, which the
# previous explicit open()/close() pair did not guarantee.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open("TrainModuleClassifierNLP.pickle", "wb") as save_classifier:
    pickle.dump(SKLearnClassifier, save_classifier)