In [1]:
import pandas as pd
import re
import nltk

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import sentiwordnet as swn
from nltk import ngrams, FreqDist
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
data_neg = pd.read_csv('Data/training_negative.csv', encoding='latin-1')
data_pos = pd.read_csv('Data/training_positive.csv', encoding='latin-1')

In [5]:
data = data_neg.append(data_pos)
data = data.iloc[:,1:]

## Sampling Data for Trying out Approaches

In [6]:
df = data.sample(n=20000, random_state=1)
df.tail()

Unnamed: 0,Polarity,Tweet
400592,4,I get the worst writer's cramp. These thank yo...
333050,0,@ladyinreddress the sun is all gone now...
93698,4,"FINALLY a break till Wed, no work OR school, b..."
345456,4,I want a Blackberry...
37125,4,in montrea doing some much needed ocean swimmi...


## Pre-Processing Text

In [7]:
def clean_text(tweet):
    # tweet = tweet.lower()                                   # Converting to lower case
    tweet = re.sub(r'@[^\s]+', ' ', tweet)                  # Removing mentions
    tweet = re.sub(r'https?:\/[^\s]+', ' ', tweet)          # Removing URLs
    tweet = re.sub(r'www.[^\s]+', ' ', tweet)               # Removing Email IDs
    tweet = re.sub(r'#', '', tweet)                         # Removing hashtags
    tweet = re.sub(r'_', ' ', tweet)                        # Sometimes hashtags are done with _ representing break between two words
    tweet = re.sub(r'\.{2,}', ' ', tweet)                   # Removing sentence separators
    tweet = re.sub(r"[0-9]+",' ', tweet)                    # Removing numbers as they do not indicate sentiment
    tweet = re.sub(r"[^\w\d\s]+",'', tweet)                # Removing unnecessary punctuations apart from apostrophe which can be used in clitics
    tweet = re.sub(r"\b[a-zA-Z]\b", ' ', tweet)
    tweet = re.sub(r"\bamp\b", ' ', tweet)                  # Removing &amp signs mis-translated
    return ' '.join(tweet.split())

In [8]:
clean_text("I am &amp n't doing good") 

'am nt doing good'

In [9]:
def tweet_word_tokenizer(tweet):
    return word_tokenize(tweet)

In [10]:
clitics = {
    "nt": 'not',
    "ve": 'have',
    "s": 'is',
    "m": 'am',
    "re": 'are',
    "ll": 'I will',
    "bout": 'about'
}

In [12]:
# count = 0
def handle_clitics(tweet):
    # global count
    # count += 1
    for i in range(len(tweet)):
        if tweet[i] in clitics.keys():
            tweet[i] = clitics[tweet[i]]
    return tweet

In [13]:
def stopword_removal(tweet):
    tweet = [word for word in tweet if word not in stopwords.words('english')]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [73]:
short_forms = {
    'n': 'and',
    'ya': 'you',
    'luv': 'love',
    'lol': 'laugh',
    'k': 'okay',
    'na': 'no',
    'ily': 'love',
    'im': 'am',
    'morn': 'morning',
    'nght': 'night',
    'n8': 'night',
    'no': 'not',
    'Ill': 'will'
    }

In [74]:
def handle_shortforms(tweet):
    for i in range(len(tweet)):
        if tweet[i] in short_forms.keys():
            tweet[i] = short_forms[tweet[i]]
    return tweet

In [75]:
handle_shortforms(['I','am','lol','in','practice'])

['I', 'am', 'laugh', 'in', 'practice']

In [76]:
df['Tweet_regex'] = df['Tweet'].apply(clean_text)

In [77]:
df['Tweet_clean'] = df['Tweet_regex'].apply(tweet_word_tokenizer)

In [78]:
df['Tweet_stopword'] = df['Tweet_clean'].apply(stopword_removal)

In [79]:
df['Tweet_clitics'] = df['Tweet_stopword'].apply(handle_clitics)

In [80]:
df['Tweet_shortforms'] = df['Tweet_clitics'].apply(handle_shortforms)

In [81]:
df.head(30)

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"
67315,0,Nala Olowalu still has a full tummy from bread...,Nala Olowalu still has full tummy from bread b...,"[Nala, Olowalu, still, has, full, tummy, from,...","[Nala, Olowalu, still, full, tummy, bread, bas...","[Nala, Olowalu, still, full, tummy, bread, bas...","[Nala, Olowalu, still, full, tummy, bread, bas...","[(Nala, NNP), (Olowalu, NNP), (still, RB), (fu...","[Nala, Olowalu, still, full, tummy, bread, bas..."
33521,4,@macintom site doesn't seem to want to load up...,site doesnt seem to want to load up they must ...,"[site, doesnt, seem, to, want, to, load, up, t...","[site, doesnt, seem, want, load, must, getting...","[site, doesnt, seem, want, load, must, getting...","[site, doesnt, seem, want, load, must, getting...","[(site, NN), (doesnt, NNS), (seem, VBP), (want...","[site, doesnt, seem, want, load, must, get, lo..."
256032,0,time for some sleep- hav to actually do some w...,time for some sleep hav to actually do some wo...,"[time, for, some, sleep, hav, to, actually, do...","[time, sleep, hav, actually, work, tmrw]","[time, sleep, hav, actually, work, tmrw]","[time, sleep, hav, actually, work, tmrw]","[(time, NN), (sleep, JJ), (hav, NN), (actually...","[time, sleep, hav, actually, work, tmrw]"
657012,0,@supercoolkp In Oxford that month.,In Oxford that month,"[In, Oxford, that, month]","[In, Oxford, month]","[In, Oxford, month]","[In, Oxford, month]","[(In, IN), (Oxford, NNP), (month, NN)]","[In, Oxford, month]"
180587,4,"..time for a cup of tea and fruit bagels, i'm ...",time for cup of tea and fruit bagels im going ...,"[time, for, cup, of, tea, and, fruit, bagels, ...","[time, cup, tea, fruit, bagels, am, going, tur...","[time, cup, tea, fruit, bagels, am, going, tur...","[time, cup, tea, fruit, bagels, am, going, tur...","[(time, NN), (cup, NN), (tea, NN), (fruit, NN)...","[time, cup, tea, fruit, bagel, be, go, turn, d..."


In [82]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"


In [83]:
data.iloc[514293]

Polarity                                                    0
Tweet       i miss nikki nu nu already  shes always there ...
Name: 514293, dtype: object

In [84]:
def stemmer(tweet):
    porter_stemmer = PorterStemmer()
    tweet = [porter_stemmer.stem(word) for word in tweet]
    return tweet

In [85]:
# stemmer(['I','am','playing','making', 'what','I','do'])

In [86]:
# df['Tweet_stem'] = df['Tweet_shortforms'].apply(stemmer)
# df.head()

In [87]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [88]:
def pos_tagging(tweet):
    tweet = nltk.pos_tag(tweet) 
    return tweet

In [155]:
### TO BE MODIFIED ################
def tweet_lemmatizer(tweet):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    pos_wordnet = list(map(lambda x: (x[0], pos_tagger(x[1])), tweet))
    for word, tag in pos_wordnet:
        if tag is None:
            lemmatized.append(word)
        else:       
            lemmatized.append(lemmatizer.lemmatize(word, tag))
    lemmatized = ' '.join(lemmatized)
    lemmatized_sent = ', '.join(lemmatized)
    # print(list(lemmatized.split()))
    return list(lemmatized.split())

In [91]:
df['Tweet_pos'] = df['Tweet_shortforms'].apply(pos_tagging)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"


In [92]:
df['Tweet_lemma'] = df['Tweet_pos'].apply(tweet_lemmatizer)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"


In [158]:
def make_sentences(df, col):
    df['Tweet_sent'] = df[col].apply(lambda x:' '.join([i for i in x]))
    return df

df = make_sentences(df, 'Tweet_lemma')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right


In [94]:
# df['Tweet_token'] = df['Tweet_lemma'].apply(tweet_word_tokenizer)
# df.head()

In [95]:
pos_st = df[df['Polarity'] == 4]['Tweet_lemma'].sum()
neg_st = df[df['Polarity'] == 0]['Tweet_lemma'].sum()

In [96]:
# neg_st

In [97]:
pos_uni_freq = FreqDist(ngrams(pos_st, 1))
neg_uni_freq = FreqDist(ngrams(neg_st, 1))
pos_bi_freq = FreqDist(ngrams(pos_st, 2))
neg_bi_freq = FreqDist(ngrams(neg_st, 2))
pos_tri_freq = FreqDist(ngrams(pos_st, 3))
neg_tri_freq = FreqDist(ngrams(neg_st, 3))

In [98]:
pos_uni_freq

FreqDist({('get',): 891, ('go',): 714, ('day',): 643, ('good',): 621, ('Im',): 620, ('love',): 543, ('like',): 470, ('see',): 369, ('time',): 363, ('know',): 339, ...})

In [106]:
pos_uni_top = pos_uni_freq.most_common(300)
neg_uni_top = neg_uni_freq.most_common(300)
pos_bi_top = pos_bi_freq.most_common(300)
neg_bi_top = neg_bi_freq.most_common(300)
pos_tri_top = pos_tri_freq.most_common(300)
neg_tri_top = neg_tri_freq.most_common(300)

In [107]:
def get_top_words(sent_list):
    word_list = []
    for i in range(len(sent_list)):
        word_list.append(sent_list[i][0][0])
    return word_list

In [109]:
pos_uni_top_words = get_top_words(pos_uni_top)
neg_uni_top_words = get_top_words(neg_uni_top)
pos_bi_top_words = get_top_words(pos_bi_top)
neg_bi_top_words = get_top_words(neg_bi_top)
pos_tri_top_words = get_top_words(pos_tri_top)
neg_tri_top_words = get_top_words(neg_tri_top)

In [111]:
pos_uni_top_words

['get',
 'go',
 'day',
 'good',
 'Im',
 'love',
 'like',
 'see',
 'time',
 'know',
 'be',
 'make',
 'well',
 'laugh',
 'today',
 'work',
 'think',
 'one',
 'great',
 'new',
 'back',
 'watch',
 'night',
 'dont',
 'look',
 'come',
 'say',
 'thanks',
 'wait',
 'really',
 'haha',
 'fun',
 'The',
 'nice',
 'need',
 'no',
 'You',
 'want',
 'Thanks',
 'much',
 'morning',
 'still',
 'thats',
 'home',
 'would',
 'twitter',
 'Good',
 'cant',
 'Its',
 'take',
 'follow',
 'right',
 'Just',
 'happy',
 'hope',
 'tomorrow',
 'last',
 'thing',
 'tonight',
 'friend',
 'will',
 'way',
 'My',
 'awesome',
 'feel',
 'tweet',
 'youre',
 'LOL',
 'week',
 'oh',
 'movie',
 'people',
 'start',
 'everyone',
 'song',
 'weekend',
 'show',
 'guy',
 'use',
 'try',
 'gon',
 'sleep',
 'bed',
 'miss',
 'play',
 'cool',
 'first',
 'sure',
 'hour',
 'It',
 'do',
 'little',
 'find',
 'thank',
 'best',
 'lt',
 'Oh',
 'let',
 'So',
 'soon',
 'yes',
 'school',
 'Have',
 'ur',
 'And',
 'life',
 'yeah',
 'enjoy',
 'Ive',
 'yea

In [112]:
uni_top_common = set(set(pos_uni_top_words) & set(neg_uni_top_words))
pos_best_words = list(set(pos_uni_top_words) - uni_top_common)
neg_best_words = list(set(neg_uni_top_words) - uni_top_common)

In [113]:
pos_best_words

['excite',
 'On',
 'Have',
 'hair',
 'win',
 'funny',
 'LOVE',
 'lovely',
 'Thank',
 'okay',
 'world',
 'Good',
 'word',
 'Haha',
 'you',
 'quite',
 'book',
 'YOU',
 'beautiful',
 'awesome',
 'yay',
 'Id',
 'worry',
 'Morning',
 'welcome',
 'rest',
 'free',
 'train',
 'glad',
 'None',
 'hey',
 'wow',
 'follower',
 'enjoy',
 'bit',
 'coffee',
 'Hope',
 'check',
 'In',
 'sweet',
 'LOL',
 'music',
 'wonder',
 'forward',
 'Love',
 'Lol',
 'fan',
 'Hey',
 'smile',
 'change',
 'Thats',
 'finally',
 'Happy',
 'amaze',
 'video',
 'birthday',
 'cute',
 'If',
 'listen',
 'Yes',
 'Thanks',
 'meet',
 'picture',
 'ask',
 'At',
 'pay',
 'course',
 'luck',
 'write',
 'post',
 'every',
 'add',
 'hahaha',
 'God',
 'Hi',
 'wonderful',
 'hehe',
 'thank',
 'xx',
 'Yay',
 'name']

In [115]:
len(neg_best_words)

81

In [116]:
len(pos_best_words)

81

In [117]:
df['Tweet'].iloc[50]

'Yeah so physio was crap, still about a month away from any form of real physical activity. Couldnt even sit and straignthen my leg '

In [118]:
df.tail()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
400592,4,I get the worst writer's cramp. These thank yo...,get the worst writers cramp These thank you no...,"[get, the, worst, writers, cramp, These, thank...","[get, worst, writers, cramp, These, thank, not...","[get, worst, writers, cramp, These, thank, not...","[get, worst, writers, cramp, These, thank, not...","[(get, VB), (worst, JJS), (writers, NNS), (cra...","[get, bad, writer, cramp, These, thank, note, ..."
333050,0,@ladyinreddress the sun is all gone now...,the sun is all gone now,"[the, sun, is, all, gone, now]","[sun, gone]","[sun, gone]","[sun, gone]","[(sun, NN), (gone, VBN)]","[sun, go]"
93698,4,"FINALLY a break till Wed, no work OR school, b...",FINALLY break till Wed no work OR school both ...,"[FINALLY, break, till, Wed, no, work, OR, scho...","[FINALLY, break, till, Wed, work, OR, school, ...","[FINALLY, break, till, Wed, work, OR, school, ...","[FINALLY, break, till, Wed, work, OR, school, ...","[(FINALLY, NNP), (break, VB), (till, NN), (Wed...","[FINALLY, break, till, Wed, work, OR, school, ..."
345456,4,I want a Blackberry...,want Blackberry,"[want, Blackberry]","[want, Blackberry]","[want, Blackberry]","[want, Blackberry]","[(want, NN), (Blackberry, NNP)]","[want, Blackberry]"
37125,4,in montrea doing some much needed ocean swimmi...,in montrea doing some much needed ocean swimmi...,"[in, montrea, doing, some, much, needed, ocean...","[montrea, much, needed, ocean, swimming, SURFS...","[montrea, much, needed, ocean, swimming, SURFS...","[montrea, much, needed, ocean, swimming, SURFS...","[(montrea, RB), (much, RB), (needed, VBN), (oc...","[montrea, much, need, ocean, swimming, SURFS, UP]"


In [119]:
def dummy(tweet):
    return tweet

In [156]:
cv = CountVectorizer(  
                      tokenizer=dummy,
                      preprocessor=dummy,
                      ngram_range=(1,1)
                    )

In [None]:
X = cv.fit_transform(df['Tweet_lemma']).toarray()
print(cv.get_feature_names())
print(X.shape)

In [136]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [140]:
X.shape

(20000, 23954)

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, df['Polarity'], test_size=0.25, random_state=2)

In [142]:
X_train.shape

(15000, 23954)

In [143]:
def model_run(model, X_train, y_train):
    model.fit(X_train, y_train)

In [144]:
def model_predict(model, X_test, y_test):
    print('Accuracy is: ', model.score(X_test, y_test)*100)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [145]:
model = MultinomialNB()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

Accuracy is:  73.06
              precision    recall  f1-score   support

           0       0.71      0.76      0.74      2462
           4       0.75      0.70      0.72      2538

    accuracy                           0.73      5000
   macro avg       0.73      0.73      0.73      5000
weighted avg       0.73      0.73      0.73      5000



In [146]:
# model = LogisticRegression()
# model_run(model, X_train, y_train)
# model_predict(model, X_test, y_test)

In [147]:
model = LinearSVC()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

Accuracy is:  71.02000000000001
              precision    recall  f1-score   support

           0       0.71      0.70      0.70      2462
           4       0.71      0.72      0.72      2538

    accuracy                           0.71      5000
   macro avg       0.71      0.71      0.71      5000
weighted avg       0.71      0.71      0.71      5000



In [148]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"


In [170]:
a = ', '.join(df['Tweet_sent'])

In [172]:
tfidf_counts = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,1)) # number of n-grams
tfidf_data = tfidf_counts.fit_transform(df['Tweet_sent'])

In [173]:
# tfidf_counts = TfidfVectorizer()
# tfidf_data = tfidf_counts.fit_transform(a)

In [174]:
tfidf_data.shape

(20000, 19437)

In [175]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data, df['Polarity'], test_size=0.25, random_state=2)

In [176]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

(15000, 19437)
(5000, 19437)
(15000,)
(5000,)


In [177]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  73.92
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      2462
           4       0.76      0.71      0.73      2538

    accuracy                           0.74      5000
   macro avg       0.74      0.74      0.74      5000
weighted avg       0.74      0.74      0.74      5000



In [178]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  73.26
              precision    recall  f1-score   support

           0       0.74      0.71      0.72      2462
           4       0.73      0.75      0.74      2538

    accuracy                           0.73      5000
   macro avg       0.73      0.73      0.73      5000
weighted avg       0.73      0.73      0.73      5000



In [None]:
model = LogisticRegression()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)