In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [16]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhinavraj/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
# word stemmer
stemmer = LancasterStemmer()

In [27]:
df = pd.read_csv('tokenized_tweet.csv')
data = df.values.tolist()
print(data[0][3])

know listenin habit earlier started freakin part


In [23]:
# turn a list into a set (of unique items) and then a list again (this removes duplicates)
classes = list(set([a[1] for a in data]))
print(classes)

['anger', 'hate', 'happiness', 'relief', 'fun', 'worry', 'boredom', 'sadness', 'surprise', 'empty', 'enthusiasm', 'neutral', 'love']


In [24]:
# capture unique stemmed words in the training corpus
corpus_words = {}
class_words = {}

for c in classes:
    # prepare a list of words within each class
    class_words[c] = []

In [29]:
for dat in data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(str(dat[3])):
        # ignore a some things
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1

            # add the word to our words in class list
            class_words[dat[1]].extend([stemmed_word])

In [36]:
# we now have each stemmed word and the number of occurances of the word in our training corpus (the word's commonality)
# also we have all words in each class
print ((class_words['empty']))

['know', 'listenin', 'habit', 'ear', 'start', 'freakin', 'part', 'middl', 'school', 'elem', 'high', 'schools', 'wil', 'remain', 'op', 'thos', 'nee', 'credit', 'gradu', 'cal', 'brok', 'fuck', 'textd', 'just', 'answ', 'nev', 'learn', 'writ', 'french', 'just', 'bas', 'stuff', 'hav', 'headach', 'going', 'goodnight', 'nit', 'bor', 'oppos', 'almost', 'sleep', 'watch', 'harp', 'island', 'ther', 'vodk', 'involv', 'trudy', 'burnet', 'nor', 'that', 'goe', 'bout', 'aint', 'gam', 'surv', 'anoth', 'gam', 'sery', 'notin', 'lebron', 'guessin', '@', 'tiffanylu', 'i', 'know', 'i', 'was', 'listenin', 'to', 'bad', 'habit', 'ear', 'and', 'i', 'start', 'freakin', 'at', 'his', 'part', '=', '[', '@', 'crey', 'middl', 'school', 'and', 'elem', '.', 'high', 'schools', 'wil', 'remain', 'op', 'for', 'thos', 'who', 'nee', 'credit', 'to', 'gradu', '.', 'cal', 'is', 'brok', '@', 'djmicdamn', 'hey', 'yu', 'lil', 'fuck', 'i', 'textd', 'yu', '@', 'softtouchm', 'just', 'answ', 'you-', 'nev', 'learn', 'how', 'to', 'writ'

In [40]:
def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with relative weight
            score += (1 / corpus_words[stemmer.stem(word.lower())])

            if show_details:
                print ("   match: %s (%s)" % (stemmer.stem(word.lower()), 1 / corpus_words[stemmer.stem(word.lower())]))
    return score

In [50]:
sentence = "good day for us to have lunch?"

# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s  Score: %s \n" % (c, calculate_class_score(sentence, c)))

   match: good (0.00037064492216456633)
   match: day (0.0009174311926605505)
   match: for (0.0008176614881439084)
   match: us (0.001763668430335097)
   match: to (0.00025933609958506224)
   match: hav (0.00017540782318891423)
Class: anger  Score: 0.004304149956078099 

   match: good (0.00037064492216456633)
   match: day (0.0009174311926605505)
   match: for (0.0008176614881439084)
   match: us (0.001763668430335097)
   match: to (0.00025933609958506224)
   match: hav (0.00017540782318891423)
   match: lunch (0.003676470588235294)
Class: hate  Score: 0.007980620544313394 

   match: good (0.00037064492216456633)
   match: day (0.0009174311926605505)
   match: for (0.0008176614881439084)
   match: us (0.001763668430335097)
   match: to (0.00025933609958506224)
   match: hav (0.00017540782318891423)
   match: lunch (0.003676470588235294)
Class: happiness  Score: 0.007980620544313394 

   match: good (0.00037064492216456633)
   match: day (0.0009174311926605505)
   match: for (0.00081

In [44]:
def classify(sentence):
    high_class = None
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score(sentence, c, show_details=False)
        # keep track of highest score
        if score > high_score:
            high_class = c
            high_score = score

    return high_class, high_score

In [55]:
classify("i know  i was listenin to bad habit earlier and i started freakin at his part =[,know listenin habit earlier started freakin part")

('worry', 0.48897636308974246)