In [10]:
import pandas as pd
import re
import nltk

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import sentiwordnet as swn
from nltk import ngrams, FreqDist
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
data_neg = pd.read_csv('Data/training_negative.csv', encoding='latin-1')
data_pos = pd.read_csv('Data/training_positive.csv', encoding='latin-1')

In [14]:
data = data_neg.append(data_pos)
data = data.iloc[:,1:]

## Sampling Data for Trying out Approaches

In [15]:
df = data.sample(n=100000, random_state=1)
df.tail()

Unnamed: 0,Polarity,Tweet
268005,0,RLS sux got the jitters cant sleep
724233,0,Looking for my glasses...but found them on top...
146469,0,stupid school!! i hate waking up in the morning
631514,0,one of my boston terriers is limping like craz...
748100,0,Damn. I'm fuckin loud.


## Pre-Processing Text

In [16]:
def clean_text(tweet):
    # tweet = tweet.lower()                                 # Converting to lower case
    tweet = re.sub(r'@[^\s]+', ' ', tweet)                  # Removing mentions
    tweet = re.sub(r'https?:\/[^\s]+', ' ', tweet)          # Removing URLs
    tweet = re.sub(r'www.[^\s]+', ' ', tweet)               # Removing Email IDs
    tweet = re.sub(r'#', '', tweet)                         # Removing hashtags
    tweet = re.sub(r'_', ' ', tweet)                        # Sometimes hashtags are done with _ representing break between two words
    tweet = re.sub(r'\.{2,}', ' ', tweet)                   # Removing sentence separators
    tweet = re.sub(r"[0-9]+",' ', tweet)                    # Removing numbers as they do not indicate sentiment
    tweet = re.sub(r"[^\w\d\s]+",'', tweet)                 # Removing unnecessary punctuations apart from apostrophe which can be used in clitics
    tweet = re.sub(r"\b[a-zA-Z]\b", ' ', tweet)             # Removing single letters
    tweet = re.sub(r"\bamp\b", ' ', tweet)                  # Removing &amp signs mis-translated
    return ' '.join(tweet.split())

In [17]:
clean_text("I am &amp n't doing good") 

'am nt doing good'

In [18]:
def tweet_word_tokenizer(tweet):
    return word_tokenize(tweet)

In [19]:
clitics = {
    "nt": 'not',
    "ve": 'have',
    "s": 'is',
    "m": 'am',
    "re": 'are',
    "ll": 'I will',
    "bout": 'about'
}

In [20]:
# count = 0
def handle_clitics(tweet):
    # global count
    # count += 1
    for i in range(len(tweet)):
        if tweet[i] in clitics.keys():
            tweet[i] = clitics[tweet[i]]
    return tweet

In [21]:
def stopword_removal(tweet):
    tweet = [word for word in tweet if word not in stopwords.words('english')]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [22]:
short_forms = {
    'n': 'and',
    'ya': 'you',
    'luv': 'love',
    'lol': 'laugh',
    'k': 'okay',
    'na': 'no',
    'ily': 'love',
    'im': 'am',
    'morn': 'morning',
    'nght': 'night',
    'n8': 'night',
    'no': 'not',
    'Ill': 'will'
    }

In [23]:
def handle_shortforms(tweet):
    for i in range(len(tweet)):
        if tweet[i] in short_forms.keys():
            tweet[i] = short_forms[tweet[i]]
    return tweet

In [24]:
handle_shortforms(['I','am','lol','in','practice'])

['I', 'am', 'laugh', 'in', 'practice']

In [25]:
df['Tweet_regex'] = df['Tweet'].apply(clean_text)

In [26]:
df['Tweet_clean'] = df['Tweet_regex'].apply(tweet_word_tokenizer)

In [27]:
df['Tweet_stopword'] = df['Tweet_clean'].apply(stopword_removal)

In [28]:
df['Tweet_clitics'] = df['Tweet_stopword'].apply(handle_clitics)

In [29]:
df['Tweet_shortforms'] = df['Tweet_clitics'].apply(handle_shortforms)

In [30]:
df.head(30)

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]"
67315,0,Nala Olowalu still has a full tummy from bread...,Nala Olowalu still has full tummy from bread b...,"[Nala, Olowalu, still, has, full, tummy, from,...","[Nala, Olowalu, still, full, tummy, bread, bas...","[Nala, Olowalu, still, full, tummy, bread, bas...","[Nala, Olowalu, still, full, tummy, bread, bas..."
33521,4,@macintom site doesn't seem to want to load up...,site doesnt seem to want to load up they must ...,"[site, doesnt, seem, to, want, to, load, up, t...","[site, doesnt, seem, want, load, must, getting...","[site, doesnt, seem, want, load, must, getting...","[site, doesnt, seem, want, load, must, getting..."
256032,0,time for some sleep- hav to actually do some w...,time for some sleep hav to actually do some wo...,"[time, for, some, sleep, hav, to, actually, do...","[time, sleep, hav, actually, work, tmrw]","[time, sleep, hav, actually, work, tmrw]","[time, sleep, hav, actually, work, tmrw]"
657012,0,@supercoolkp In Oxford that month.,In Oxford that month,"[In, Oxford, that, month]","[In, Oxford, month]","[In, Oxford, month]","[In, Oxford, month]"
180587,4,"..time for a cup of tea and fruit bagels, i'm ...",time for cup of tea and fruit bagels im going ...,"[time, for, cup, of, tea, and, fruit, bagels, ...","[time, cup, tea, fruit, bagels, am, going, tur...","[time, cup, tea, fruit, bagels, am, going, tur...","[time, cup, tea, fruit, bagels, am, going, tur..."


In [31]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]"


In [32]:
data.iloc[514293]

Polarity                                                    0
Tweet       i miss nikki nu nu already  shes always there ...
Name: 514293, dtype: object

In [33]:
def stemmer(tweet):
    porter_stemmer = PorterStemmer()
    tweet = [porter_stemmer.stem(word) for word in tweet]
    return tweet

In [34]:
# stemmer(['I','am','playing','making', 'what','I','do'])

In [35]:
# df['Tweet_stem'] = df['Tweet_shortforms'].apply(stemmer)
# df.head()

In [36]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [37]:
def pos_tagging(tweet):
    tweet = nltk.pos_tag(tweet) 
    return tweet

In [38]:
### TO BE MODIFIED ################
def tweet_lemmatizer(tweet):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    pos_wordnet = list(map(lambda x: (x[0], pos_tagger(x[1])), tweet))
    for word, tag in pos_wordnet:
        if tag is None:
            lemmatized.append(word)
        else:       
            lemmatized.append(lemmatizer.lemmatize(word, tag))
    lemmatized = ' '.join(lemmatized)
    lemmatized_sent = ', '.join(lemmatized)
    # print(list(lemmatized.split()))
    return list(lemmatized.split())

In [39]:
df['Tweet_pos'] = df['Tweet_shortforms'].apply(pos_tagging)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]"
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]"


In [40]:
df['Tweet_lemma'] = df['Tweet_pos'].apply(tweet_lemmatizer)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g..."
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]"


In [41]:
def make_sentences(df, col, title):
    df[title] = df[col].apply(lambda x:' '.join([i for i in x]))
    return df

In [42]:
df = make_sentences(df, 'Tweet_lemma', 'Tweet_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right


In [43]:
pos_st = df[df['Polarity'] == 4]['Tweet_lemma'].sum()
neg_st = df[df['Polarity'] == 0]['Tweet_lemma'].sum()

In [44]:
# neg_st

In [45]:
pos_uni_freq = FreqDist(ngrams(pos_st, 1))
neg_uni_freq = FreqDist(ngrams(neg_st, 1))
pos_bi_freq = FreqDist(ngrams(pos_st, 2))
neg_bi_freq = FreqDist(ngrams(neg_st, 2))
pos_tri_freq = FreqDist(ngrams(pos_st, 3))
neg_tri_freq = FreqDist(ngrams(neg_st, 3))

In [46]:
pos_uni_freq

FreqDist({('get',): 4559, ('go',): 3676, ('good',): 3164, ('day',): 3094, ('Im',): 2973, ('love',): 2904, ('like',): 2371, ('time',): 1845, ('see',): 1818, ('know',): 1698, ...})

In [47]:
pos_uni_top = pos_uni_freq.most_common(1000)
neg_uni_top = neg_uni_freq.most_common(1000)
pos_bi_top = pos_bi_freq.most_common(1000)
neg_bi_top = neg_bi_freq.most_common(1000)
pos_tri_top = pos_tri_freq.most_common(1000)
neg_tri_top = neg_tri_freq.most_common(1000)

In [48]:
len(pos_uni_top)

1000

In [49]:
def get_top_words(sent_list):
    word_list = []
    for i in range(len(sent_list)):
        word_list.append(sent_list[i][0][0])
    return word_list

In [50]:
pos_uni_top_words = get_top_words(pos_uni_top)
neg_uni_top_words = get_top_words(neg_uni_top)
pos_bi_top_words = get_top_words(pos_bi_top)
neg_bi_top_words = get_top_words(neg_bi_top)
pos_tri_top_words = get_top_words(pos_tri_top)
neg_tri_top_words = get_top_words(neg_tri_top)

In [51]:
print(len(pos_uni_top_words))
print(len(neg_uni_top_words))


1000
1000


In [52]:
uni_top_common = set(set(pos_uni_top_words) & set(neg_uni_top_words))
pos_best_words = list(set(pos_uni_top_words) - uni_top_common)
neg_best_words = list(set(neg_uni_top_words) - uni_top_common)

In [53]:
uni_top_common_list = list(uni_top_common)

In [54]:
pos_best_words

['topic',
 'glass',
 'World',
 'silly',
 'myspace',
 'evening',
 'watchin',
 'Listening',
 'xD',
 'bless',
 'See',
 'rule',
 'ALL',
 'OK',
 'goodnight',
 'Who',
 'excited',
 'promise',
 'Nice',
 'design',
 'track',
 'advice',
 'adorable',
 'Come',
 'special',
 'Awesome',
 'yo',
 'fantastic',
 'folk',
 'As',
 'everybody',
 'celebrate',
 'FF',
 'present',
 'quote',
 'UP',
 'ON',
 'YAY',
 'Next',
 'interest',
 'interview',
 'possible',
 'interesting',
 'peep',
 'safe',
 'chicken',
 'welcome',
 'DM',
 'Take',
 'yep',
 'appreciate',
 'garden',
 'alot',
 'social',
 'perfect',
 'buddy',
 'Congrats',
 'exactly',
 'Follow',
 'Check',
 'Miley',
 'Out',
 'Hello',
 'Let',
 'remind',
 'land',
 'chillin',
 'shout',
 'Were',
 'bar',
 'young',
 'choice',
 'Beautiful',
 'hilarious',
 'hubby',
 'Cool',
 'wife',
 'extra',
 'arrive',
 'Welcome',
 'surprise',
 'huh',
 'business',
 'Hahaha',
 'PS',
 'youve',
 'Lets',
 'yup',
 'alright',
 'wine',
 'sunshine',
 'Ha',
 'Tweet',
 'Like',
 'shall',
 'beer',
 'ti

In [55]:
neg_best_words

['flu',
 'bother',
 'felt',
 'dentist',
 'dang',
 'stomach',
 'cancel',
 'state',
 'REALLY',
 'shift',
 'apparently',
 'bored',
 'airport',
 'Work',
 'Missing',
 'death',
 'return',
 'upset',
 'lonely',
 'bc',
 'doctor',
 'storm',
 'fair',
 'allow',
 'stress',
 'iPod',
 'arm',
 'act',
 'scary',
 'rainy',
 'ugh',
 'notice',
 'Too',
 'inside',
 'pop',
 'sore',
 'father',
 'crash',
 'ear',
 'waste',
 'Bad',
 'revise',
 'cable',
 'boo',
 'wet',
 'Someone',
 'plus',
 'Miss',
 'RIP',
 'door',
 'coz',
 'Feeling',
 'sooooo',
 'knee',
 'ruin',
 'loss',
 'fell',
 'seat',
 'stuck',
 'expensive',
 'pray',
 'barely',
 'tear',
 'except',
 'disappoint',
 'pull',
 'assignment',
 'crap',
 'forever',
 'bloody',
 'hr',
 'ate',
 'cough',
 'sunburn',
 'bug',
 'blow',
 'heat',
 'burn',
 'mobile',
 'ouch',
 'homework',
 'unfortunately',
 'invite',
 'middle',
 'sadly',
 'attack',
 'nose',
 'hospital',
 'gosh',
 'exhaust',
 'available',
 'Sad',
 'dnt',
 'completely',
 'shouldnt',
 'drunk',
 'Stupid',
 'shut',


In [56]:
len(pos_best_words)

211

In [57]:
df['Tweet'].iloc[50]

'Yeah so physio was crap, still about a month away from any form of real physical activity. Couldnt even sit and straignthen my leg '

In [58]:
df['Tweet_sent'].iloc[50]

'Yeah physio crap still month away form real physical activity Couldnt even sit straignthen leg'

In [59]:
df.tail()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent
268005,0,RLS sux got the jitters cant sleep,RLS sux got the jitters cant sleep,"[RLS, sux, got, the, jitters, cant, sleep]","[RLS, sux, got, jitters, cant, sleep]","[RLS, sux, got, jitters, cant, sleep]","[RLS, sux, got, jitters, cant, sleep]","[(RLS, NNP), (sux, NN), (got, VBD), (jitters, ...","[RLS, sux, get, jitter, cant, sleep]",RLS sux get jitter cant sleep
724233,0,Looking for my glasses...but found them on top...,Looking for my glasses but found them on top o...,"[Looking, for, my, glasses, but, found, them, ...","[Looking, glasses, found, top, head, man, gett...","[Looking, glasses, found, top, head, man, gett...","[Looking, glasses, found, top, head, man, gett...","[(Looking, VBG), (glasses, NNS), (found, VBD),...","[Looking, glass, find, top, head, man, get, old]",Looking glass find top head man get old
146469,0,stupid school!! i hate waking up in the morning,stupid school hate waking up in the morning,"[stupid, school, hate, waking, up, in, the, mo...","[stupid, school, hate, waking, morning]","[stupid, school, hate, waking, morning]","[stupid, school, hate, waking, morning]","[(stupid, JJ), (school, NN), (hate, NN), (waki...","[stupid, school, hate, wake, morning]",stupid school hate wake morning
631514,0,one of my boston terriers is limping like craz...,one of my boston terriers is limping like craz...,"[one, of, my, boston, terriers, is, limping, l...","[one, boston, terriers, limping, like, crazy, ...","[one, boston, terriers, limping, like, crazy, ...","[one, boston, terriers, limping, like, crazy, ...","[(one, CD), (boston, NN), (terriers, NNS), (li...","[one, boston, terrier, limp, like, crazy, real...",one boston terrier limp like crazy really bother
748100,0,Damn. I'm fuckin loud.,Damn Im fuckin loud,"[Damn, Im, fuckin, loud]","[Damn, Im, fuckin, loud]","[Damn, Im, fuckin, loud]","[Damn, Im, fuckin, loud]","[(Damn, NNP), (Im, NNP), (fuckin, VBD), (loud,...","[Damn, Im, fuckin, loud]",Damn Im fuckin loud


In [60]:
def dummy(tweet):
    return tweet

In [61]:
cv = CountVectorizer(  
                      tokenizer=dummy,
                      preprocessor=dummy,
                      ngram_range=(1,1)
                    )

In [62]:
X = cv.fit_transform(df['Tweet_lemma']).toarray()
print(cv.get_feature_names())
print(X.shape)

(100000, 69933)


In [63]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [64]:
X.shape

(100000, 69933)

In [65]:
# X_train, X_test, y_train, y_test = train_test_split(X, df['Polarity'], test_size=0.25, random_state=2)

In [66]:
X_train = X[:80000,:]
X_test = X[80000:,:]
y_train = df['Polarity'][:80000]
y_test = df['Polarity'][80000:]

In [67]:
X_train.shape

(80000, 69933)

In [68]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [69]:
def model_run(model, X_train, y_train):
    model.fit(X_train, y_train)

In [70]:
def model_predict(model, X_test, y_test):
    print('Accuracy is: ', model.score(X_test, y_test)*100)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [71]:
model = MultinomialNB()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

In [None]:
# model = LogisticRegression()
# model_run(model, X_train, y_train)
# model_predict(model, X_test, y_test)

In [1]:
model = LinearSVC()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

NameError: name 'LinearSVC' is not defined

In [None]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right


In [None]:
tfidf_counts = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,1)) # number of n-grams
tfidf_data = tfidf_counts.fit_transform(df['Tweet_sent'])

In [None]:
# tfidf_counts = TfidfVectorizer()
# tfidf_data = tfidf_counts.fit_transform(a)

In [None]:
tfidf_data.shape

(20000, 19437)

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

(15000, 19437)
(5000, 19437)
(15000,)
(5000,)


In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  73.92
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      2462
           4       0.76      0.71      0.73      2538

    accuracy                           0.74      5000
   macro avg       0.74      0.74      0.74      5000
weighted avg       0.74      0.74      0.74      5000



In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  73.26
              precision    recall  f1-score   support

           0       0.74      0.71      0.72      2462
           4       0.73      0.75      0.74      2538

    accuracy                           0.73      5000
   macro avg       0.73      0.73      0.73      5000
weighted avg       0.73      0.73      0.73      5000



In [None]:
# model = LogisticRegression()
# model_run(model, X_train_tfidf, y_train_tfidf)
# model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
def remove_extra_words(tweet):
    tweet = [word for word in tweet if word in uni_top_common_list]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [None]:
df['Tweet_remove_extra'] = df['Tweet_lemma'].apply(remove_extra_words)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, always, need]"
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, last, night, tell, get, job, cant, say]"
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hope, feel, little, good, soon]"
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, right]"


In [None]:
df = make_sentences(df, 'Tweet_remove_extra', 'Tweet_final_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra,Tweet_final_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, always, need]",miss already always need
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, last, night, tell, get, job, cant, say]",So last night tell get job cant say
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hope, feel, little, good, soon]",hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, right]",wish right


In [None]:
tfidf_counts_clean = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,1)) # number of n-grams
tfidf_data_clean = tfidf_counts_clean.fit_transform(df['Tweet_final_sent'])

In [None]:
tfidf_data_clean.shape

(20000, 213)

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data_clean, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

(15000, 213)
(5000, 213)
(15000,)
(5000,)


Accuracy is:  65.0
              precision    recall  f1-score   support

           0       0.66      0.59      0.62      2462
           4       0.64      0.71      0.67      2538

    accuracy                           0.65      5000
   macro avg       0.65      0.65      0.65      5000
weighted avg       0.65      0.65      0.65      5000



In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  64.44
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      2462
           4       0.64      0.68      0.66      2538

    accuracy                           0.64      5000
   macro avg       0.64      0.64      0.64      5000
weighted avg       0.64      0.64      0.64      5000



In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  65.0
              precision    recall  f1-score   support

           0       0.66      0.59      0.62      2462
           4       0.64      0.71      0.67      2538

    accuracy                           0.65      5000
   macro avg       0.65      0.65      0.65      5000
weighted avg       0.65      0.65      0.65      5000



In [None]:
model = LogisticRegression()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  64.96
              precision    recall  f1-score   support

           0       0.66      0.59      0.63      2462
           4       0.64      0.70      0.67      2538

    accuracy                           0.65      5000
   macro avg       0.65      0.65      0.65      5000
weighted avg       0.65      0.65      0.65      5000

