In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [153]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt

In [154]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [155]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import sentiwordnet as swn
from nltk import ngrams, FreqDist
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [156]:
data_neg = pd.read_csv('/content/drive/MyDrive/COL772_A2/training_negative.csv', encoding='ISO-8859-1')
data_pos = pd.read_csv('/content/drive/MyDrive/COL772_A2/training_positive.csv', encoding='ISO-8859-1')

In [157]:
data = data_neg.append(data_pos)
data = data.iloc[:,1:]

In [158]:
data.tail()

Unnamed: 0,Polarity,Tweet
799995,4,Just woke up. Having no school is the best fee...
799996,4,TheWDB.com - Very cool to hear old Walt interv...
799997,4,Are you ready for your MoJo Makeover? Ask me f...
799998,4,Happy 38th Birthday to my boo of alll time!!! ...
799999,4,happy #charitytuesday @theNSPCC @SparksCharity...


## Sampling Data for Trying out Approaches

In [159]:
df = data.sample(frac=0.05, random_state=1)
# df = data.iloc[:30000]
df.head()

Unnamed: 0,Polarity,Tweet
514293,0,i miss nikki nu nu already shes always there ...
142282,0,So I had a dream last night. I remember a sig...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...
649503,0,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now


In [160]:
df['Polarity'] = np.where(df['Polarity'] == 4, 1, 0)
df.head()

Unnamed: 0,Polarity,Tweet
514293,0,i miss nikki nu nu already shes always there ...
142282,0,So I had a dream last night. I remember a sig...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...
649503,0,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now


In [161]:
df['Polarity'].value_counts()

1    40161
0    39839
Name: Polarity, dtype: int64

In [162]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [163]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [164]:
example = 'The movie was awesome.'
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(example)

{'compound': 0.6249, 'neg': 0.0, 'neu': 0.423, 'pos': 0.577}

In [165]:
# df[['neg', 'neu', 'pos', 'compound']] = df['Tweet'].apply(sid.polarity_scores).apply(pd.Series)
# df.head()

In [166]:
# df['Polarity_Vader'] = np.where(df['compound'] > 0, 1, 0)
# df.head()

In [167]:
# (sum(df['Polarity'] == df['Polarity_Vader'])/len(df))*100

In [168]:
import seaborn as sns

In [169]:
# for var in ['pos', 'neg', 'neu', 'compound']:
#     plt.figure(figsize=(12,4))
#     sns.distplot(df.query("Polarity==1")[var], bins=30, kde=False, 
#                  color='green', label='Positive')
#     sns.distplot(df.query("Polarity==0")[var], bins=30, kde=False, 
#                  color='red', label='Negative')
#     plt.legend()
#     plt.title(f'Histogram of {var} by true sentiment');

## Pre-Processing Text

In [170]:
def clean_text(tweet):
    tweet = tweet.lower()                                   # Converting to lower case
    tweet = re.sub(r'\b\w+@[^\s]+', ' <MAIL> ', tweet)             # Removing email IDs
    tweet = re.sub(r'@[^\s]+', ' <MENTION> ', tweet)                  # Removing mentions
    tweet = re.sub(r'https?:\/[^\s]+', ' <URL> ', tweet)          # Removing URLs
    tweet = re.sub(r'www.[^\s]+', ' <WEBSITE> ', tweet)               # Removing Websites
    tweet = re.sub(r'\b\w+.com', ' <WEBSITE> ', tweet)             # Removing email IDs
    tweet = re.sub(r'#', ' <HASHTAG> ', tweet)                         # Removing hashtags
    tweet = re.sub(r'_', ' ', tweet)                        # Sometimes hashtags are done with _ representing break between two words
    tweet = re.sub(r'\.{2,}', ' ', tweet)                   # Removing sentence separators
    tweet = re.sub(r"[0-9]+",' ', tweet)                    # Removing numbers as they do not indicate sentiment
    tweet = re.sub(r"\bamp\b", ' ', tweet)                  # Removing &amp signs mis-translated
    tweet = re.sub(r"\bquot\b", ' ', tweet)  
    tweet = re.sub(r"\b\w+;[^\s]+\b", ' ', tweet)  
    if len(tweet) == 0:
      tweet = 'None'
    return ' '.join(tweet.split())

In [171]:
def remove_punc(tweet):
    tweet = re.sub(r"[^\w'\s]+",'', tweet)                  # Removing punctuations apart from clitic
    return tweet

In [172]:
clean_text("I am &amp rachit1jain@gmail.com n't #doing_exceptionally good hello.com &gt;&gt:D") 

"i am & <MAIL> n't <HASHTAG> doing exceptionally good <WEBSITE> &"

In [173]:
clean_text('@')

'@'

In [174]:
def tweet_word_tokenizer(tweet):
    # return word_tokenize(tweet)
    return tweet.split(' ')

In [175]:
clitics = {
    "nt": 'not',
    "ve": 'have',
    "s": 'is',
    "m": 'am',
    "re": 'are',
    "ll": 'will',
    'd': 'would',
    "bout": 'about',
    'didnt': 'did not',
    'havent': 'have not',
    'hasnt': 'has not',
    'wont': 'will not',
    'wouldnt': 'will not',
    'shouldnt': 'should not',
}

In [176]:
# # count = 0
# def handle_clitics(tweet):
#     # global count
#     # count += 1
#     for i in range(len(tweet)):
#         if tweet[i] in clitics.keys():
#             tweet[i] = clitics[tweet[i]]
#     return tweet

In [177]:
def handle_clitics(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"\bdon't\b", "do not", phrase)
    phrase = re.sub(r"\bdoesn't\b", "does not", phrase)
    phrase = re.sub(r"\bdidn't\b", "did not", phrase)
    phrase = re.sub(r"\bdidnt\b", "did not", phrase)
    phrase = re.sub(r"\bhasn't\b", "has not", phrase)
    phrase = re.sub(r"\bhaven't\b", "have not", phrase)
    phrase = re.sub(r"\bhavent\b", "have not", phrase)
    phrase = re.sub(r"\bhadn't\b", "had not", phrase)
    phrase = re.sub(r"\bwon't\b", "will not", phrase)
    phrase = re.sub(r"\bwouldn't\b", "would not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)

    # using regular expressions to expand the contractions
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    return phrase

In [178]:
stop = 0
def stopword_removal(tweet):
    # stoplist = stopwords.words('english')
    stoplist = []
    manual_stoplist = ['retweet', 'retwet', 'rt', 'oh', 'dm', 'mt', 'ht', 'ff', 'shoulda', 'woulda', 'coulda', 'might', 'im', 'tb', 'mysql', 'hah', "a", "an", "the", "and", "but", "if",
                  "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
                  "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "only", "own", "same", "so", "than", "too", "very", "s",
                  "t", "just", "don", "now", 'tweet', 'x', 'f', 'go', 'get', 'give']
    stoplist.append(manual_stoplist)
    global stop
    stop += 1
    if stop % 100 == 0:
      print(stop)
    # stopwords = stopwords.words('english')
    tweet = [word for word in tweet if word not in stoplist]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [179]:
# stopwords.words('english')

In [180]:
short_forms = {
    'n': 'and',
    'ya': 'you',
    'luv': 'love',
    'lol': 'laugh',
    'k': 'okay',
    'na': 'no',
    'ily': 'love',
    'im': 'am',
    'morn': 'morning',
    'nght': 'night',
    'no': 'not',
    'Ill': 'will',
    'shoulda': 'should have'
    }

In [181]:
def handle_shortforms(tweet):
    temp = ''
    for word in tweet.split():
        if word in short_forms.keys():
            temp = temp + ' ' + short_forms[word]
        else:
            temp = temp + ' ' + word
    return ' '.join(temp.split())

In [182]:
# handle_shortforms(['I','am','lol','in','practice'])

In [183]:
handle_shortforms('I am a good boy shoulda gone')

'I am a good boy should have gone'

In [184]:
## Maintaining only letters within a tweet and removing every other information since not indicative of sentiment
def maintain_letters(tweet):
    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet)      # since lowering has already been done
    return ' '.join(tweet.split())

In [185]:
maintain_letters('i am a good boy. hero is @terohja 909')

'i am a good boy hero is terohja'

In [186]:
### TO BE MODIFIED ####
# Emoticons store a lot of information
emo_info = {
    # positive emoticons
    ":‑)": " <EMOJI> ",
    ":)": " <EMOJI> ",
    ";)": " <EMOJI> ",
    ":-}": " <EMOJI> ",
    "=]": " <EMOJI> ",
    "=)": " <EMOJI> ",
    ";d": " <EMOJI> ",
    ":d": " <EMOJI> ",
    ":dd": " <EMOJI> ",
    "xd": " <EMOJI> ",
    "<3": " <EMOJI> ",

    # negativve emoticons
    ":‑(": " <EMOJI> ",
    ":‑[": " <EMOJI> ",
    ":(": " <EMOJI> ",
    "=(": " <EMOJI> ",
    "=/": " <EMOJI> ",
    ":{": " <EMOJI> ",
    ":/": " <EMOJI> ",
    ":|": " <EMOJI> ",
    ":-/": " <EMOJI> ",
    ":o": " <EMOJI> "

}

In [187]:
# def remove_emoji(tweet):
#     ":p": " tease ",
#     "xp": " tease "

In [188]:
### TO BE MODIFIED ####
emo_info_order = [k for (k_len, k) in reversed(sorted([(len(k), k) for k in emo_info.keys()]))]

In [189]:
### TO BE MODIFIED ####
def emo_repl(phrase):
    for k in emo_info_order:
        phrase = phrase.replace(k, emo_info[k])
    return phrase

In [190]:
df['Tweet_regex'] = df['Tweet'].apply(clean_text)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...
649503,0,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now


In [191]:
# EDA
df[df['Polarity'] == 0]['Tweet_regex'].str.contains('<MENTION>').value_counts()

False    24832
True     15007
Name: Tweet_regex, dtype: int64

In [192]:
df[df['Polarity'] == 4]['Tweet_regex'].str.contains('<MENTION>').value_counts()

Series([], Name: Tweet_regex, dtype: int64)

In [193]:
# EDA
df[df['Polarity'] == 0]['Tweet_regex'].str.contains('<WEBSITE>').value_counts()

False    37821
True      2018
Name: Tweet_regex, dtype: int64

In [194]:
# EDA
df[df['Polarity'] == 4]['Tweet_regex'].str.contains('<WEBSITE>').value_counts()

Series([], Name: Tweet_regex, dtype: int64)

In [195]:
df['Tweet_emoji'] = df['Tweet_regex'].apply(emo_repl)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...
649503,0,it is raining again,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now


In [196]:
df['Tweet_nopunc'] = df['Tweet_emoji'].apply(remove_punc)
# df['Tweet_nopunc'] = df['Tweet_regex'].apply(remove_punc)         # NOT USING EMOJI
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now


In [197]:
df['Tweet_clitics'] = df['Tweet_nopunc'].apply(handle_clitics)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now


In [198]:
df['Tweet_shortforms'] = df['Tweet_clitics'].apply(handle_shortforms)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now


In [199]:
df['Tweet_pure_string'] = df['Tweet_shortforms'].apply(maintain_letters)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now


In [200]:
df['Tweet_token'] = df['Tweet_pure_string'].apply(tweet_word_tokenizer)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always..."
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]"
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]"


In [201]:
# Was absolutely useless to use
stem = 0
def stemmer(tweet):
    global stem
    stem += 1
    if stem % 1000:
      print(stem)
    porter_stemmer = PorterStemmer()
    tweet = [porter_stemmer.stem(word) for word in tweet]
    return tweet

In [202]:
stemmer(['I','am','playing','making', 'what','I','do'])

1


['I', 'am', 'play', 'make', 'what', 'I', 'do']

In [203]:
# df['Tweet_stem'] = df['Tweet_token'].apply(stemmer)
# df.head()

In [204]:
def make_sentences(df, col, title):
    df[title] = df[col].apply(lambda x:' '.join([i for i in x]))
    return df

In [205]:
# df = make_sentences(df, 'Tweet_stem', 'Tweet_sent')
# df.head()

KeyError: ignored

In [206]:
df = make_sentences(df, 'Tweet_token', 'Tweet_final_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish i was in la right now


In [58]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [59]:
#### TO BE MODIFIED ######
count = 0
def pos_tagging(tweet):
    global count
    count += 1
    if count % 100 == 0:
      print(count)
    # tweet = nltk.pos_tag(tweet) 
    tweet = nltk.pos_tag([i for i in tweet if i])
    return tweet

In [60]:
pos_tagging(['','I','am','good'])

[('I', 'PRP'), ('am', 'VBP'), ('good', 'JJ')]

In [61]:
### TO BE MODIFIED ################
def tweet_lemmatizer(tweet):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    pos_wordnet = list(map(lambda x: (x[0], pos_tagger(x[1])), tweet))
    for word, tag in pos_wordnet:
        if tag is None:
            lemmatized.append(word)
        else:       
            lemmatized.append(lemmatizer.lemmatize(word, tag))
    lemmatized = ' '.join(lemmatized)
    lemmatized_sent = ', '.join(lemmatized)
    # print(list(lemmatized.split()))
    return list(lemmatized.split())

In [148]:
df['Tweet_pos'] = df['Tweet_token'].apply(pos_tagging)
df.head()

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_stem,Tweet_sent,Tweet_normalised,Tweet_stopword,Tweet_final_sent,Tweet_lexicons,Tweet_final_sent_lexicons,Tweet_pos
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...","[i, miss, nikki, nu, nu, alreadi, she, alway, ...",i miss nikki nu nu alreadi she alway there whe...,"[miss, nikki, nu, nu, alreadi, she, alway, the...","[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[(i, NN), (miss, VBP), (nikki, JJ), (nu, JJ), ..."
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...","[so, i, had, a, dream, last, night, i, rememb,...",so i had a dream last night i rememb a sign wh...,"[so, had, dream, last, night, rememb, sign, wh...","[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[(so, RB), (i, JJ), (had, VBD), (a, DT), (drea..."
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[(MENTION, NNP), (ohh, PRP), (poor, JJ), (sick..."
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]","[it, is, rain, again]",it is rain again,"[it, is, rain, again]","[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again,"[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again NEGATIVE NEGATIVE NEGATIVE NE...,"[(it, PRP), (is, VBZ), (raining, VBG), (again,..."
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]","[mention, wish, i, wa, in, la, right, now]",mention wish i wa in la right now,"[mention, wish, wa, in, la, right, now]","[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now,"[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now NEGATIVE NEGAT...,"[(MENTION, NNP), (wish, NN), (i, NN), (was, VB..."


In [149]:
df['Tweet_pos'] = df['Tweet_token'].apply(pos_tagging)
df.head()

80100
80200
80300
80400
80500
80600
80700
80800
80900
81000
81100
81200
81300
81400
81500
81600
81700
81800
81900
82000
82100
82200
82300
82400
82500
82600
82700
82800
82900
83000
83100
83200
83300
83400
83500
83600
83700
83800
83900
84000
84100
84200
84300
84400
84500
84600
84700
84800
84900
85000
85100
85200
85300
85400
85500
85600
85700
85800
85900
86000
86100
86200
86300
86400
86500
86600
86700
86800
86900
87000
87100
87200
87300
87400
87500
87600
87700
87800
87900
88000
88100
88200
88300
88400
88500
88600
88700
88800
88900
89000
89100
89200
89300
89400
89500
89600
89700
89800
89900
90000
90100
90200
90300
90400
90500
90600
90700
90800
90900
91000
91100
91200
91300
91400
91500
91600
91700
91800
91900
92000
92100
92200
92300
92400
92500
92600
92700
92800
92900
93000
93100
93200
93300
93400
93500
93600
93700
93800
93900
94000
94100
94200
94300
94400
94500
94600
94700
94800
94900
95000
95100
95200
95300
95400
95500
95600
95700
95800
95900
96000
96100
96200
96300
96400
96500
96600
9670

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_stem,Tweet_sent,Tweet_normalised,Tweet_stopword,Tweet_final_sent,Tweet_lexicons,Tweet_final_sent_lexicons,Tweet_pos
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...","[i, miss, nikki, nu, nu, alreadi, she, alway, ...",i miss nikki nu nu alreadi she alway there whe...,"[miss, nikki, nu, nu, alreadi, she, alway, the...","[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[(i, NN), (miss, VBP), (nikki, JJ), (nu, JJ), ..."
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...","[so, i, had, a, dream, last, night, i, rememb,...",so i had a dream last night i rememb a sign wh...,"[so, had, dream, last, night, rememb, sign, wh...","[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[(so, RB), (i, JJ), (had, VBD), (a, DT), (drea..."
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[(MENTION, NNP), (ohh, PRP), (poor, JJ), (sick..."
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]","[it, is, rain, again]",it is rain again,"[it, is, rain, again]","[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again,"[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again NEGATIVE NEGATIVE NEGATIVE NE...,"[(it, PRP), (is, VBZ), (raining, VBG), (again,..."
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]","[mention, wish, i, wa, in, la, right, now]",mention wish i wa in la right now,"[mention, wish, wa, in, la, right, now]","[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now,"[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now NEGATIVE NEGAT...,"[(MENTION, NNP), (wish, NN), (i, NN), (was, VB..."


In [150]:
df['Tweet_lemma'] = df['Tweet_pos'].apply(tweet_lemmatizer)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_stem,Tweet_sent,Tweet_normalised,Tweet_stopword,Tweet_final_sent,Tweet_lexicons,Tweet_final_sent_lexicons,Tweet_pos,Tweet_lemma
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...","[i, miss, nikki, nu, nu, alreadi, she, alway, ...",i miss nikki nu nu alreadi she alway there whe...,"[miss, nikki, nu, nu, alreadi, she, alway, the...","[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[miss, nikki, nu, nu, alreadi, she, alway, the...",miss nikki nu nu alreadi she alway there when ...,"[(i, NN), (miss, VBP), (nikki, JJ), (nu, JJ), ...","[i, miss, nikki, nu, nu, already, shes, always..."
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...","[so, i, had, a, dream, last, night, i, rememb,...",so i had a dream last night i rememb a sign wh...,"[so, had, dream, last, night, rememb, sign, wh...","[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[so, had, dream, last, night, rememb, sign, wh...",so had dream last night rememb sign which clea...,"[(so, RB), (i, JJ), (had, VBD), (a, DT), (drea...","[so, i, have, a, dream, last, night, i, rememb..."
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...","[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[mention, ohh, poor, sickli, you, hug, hope, y...",mention ohh poor sickli you hug hope you feel ...,"[(MENTION, NNP), (ohh, PRP), (poor, JJ), (sick...","[MENTION, ohh, poor, sickly, you, hug, hope, y..."
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]","[it, is, rain, again]",it is rain again,"[it, is, rain, again]","[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again,"[it, is, rain, again, NEGATIVE, NEGATIVE, NEGA...",it is rain again NEGATIVE NEGATIVE NEGATIVE NE...,"[(it, PRP), (is, VBZ), (raining, VBG), (again,...","[it, be, rain, again]"
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]","[mention, wish, i, wa, in, la, right, now]",mention wish i wa in la right now,"[mention, wish, wa, in, la, right, now]","[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now,"[mention, wish, wa, in, la, right, now, NEGATI...",mention wish wa in la right now NEGATIVE NEGAT...,"[(MENTION, NNP), (wish, NN), (i, NN), (was, VB...","[MENTION, wish, i, be, in, la, right, now]"


In [65]:
def make_sentences(df, col, title):
    df[title] = df[col].apply(lambda x:' '.join([i for i in x]))
    return df

In [211]:
df = make_sentences(df, 'Tweet_token', 'Tweet_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish i was in la right now,MENTION wish i was in la right now


In [67]:
wordnet.synsets('hello')

[Synset('hello.n.01')]

In [208]:
## TO BE MODIFIED #####
def normalisation_words(tweet):
    tweet = tweet.replace(r'([a-z])\1{1,}', r'\1\1')
    # tweet = re.sub(r'(ha){1,}', r'laugh', tweet)
    tweet = re.sub(r'h+a+[ha]+', r'laaaugh', tweet)     # To give more significance
    # tweet = re.sub(r'(lol){1,}', r'laugh', tweet)
    tweet = re.sub(r'l+o+[lo]+', r'laaaugh', tweet)
    tweet = ' '.join([word if len(wordnet.synsets(word)) > 0 else re.sub(r'([a-z])\1{1,}', r'\1\1', word) for word in tweet.split()])
    tweet = re.sub(r'\b([a-z])\1{1,}', r' ', tweet)     # If only repeated letters are left, remove them
    tweet = re.sub(r"\b[a-zA-Z]{1}\b", ' ', tweet)        # Removing single letters
    # tweet = re.sub(r'lo+l+o+[^\s]+', r'lol', tweet)
    return tweet.split()

In [209]:
normalisation_words('awww hahahhhahahahahahahhahaaaaaaaaaaahhhahaha ppeeee lollipop looooooool happppiest day lolllll lll lool bummer get david carr third day laugh')

['aww',
 'laaugh',
 'ee',
 'laaughipop',
 'laaugh',
 'happiest',
 'day',
 'laaugh',
 'laaugh',
 'bummer',
 'get',
 'david',
 'carr',
 'third',
 'day',
 'laugh']

In [212]:
df['Tweet_normalised'] = df['Tweet_sent'].apply(normalisation_words)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t..."
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]"
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]"


In [213]:
df['Tweet_stopword'] = df['Tweet_normalised'].apply(stopword_removal)
df.head()

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
1850

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised,Tweet_stopword
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t..."
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]","[it, is, raining, again]"
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]","[MENTION, wish, was, in, la, right, now]"


In [214]:
df = make_sentences(df, 'Tweet_stopword', 'Tweet_final_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised,Tweet_stopword
514293,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",miss nikki nu nu already shes always there whe...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t..."
142282,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so had dream last night remember sign which cl...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
649503,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]","[it, is, raining, again]"
610789,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]","[MENTION, wish, was, in, la, right, now]"


In [215]:
df.drop(df[df["Tweet_final_sent"] == ''].index, inplace=True)
df = df.reset_index(drop=True)

In [216]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised,Tweet_stopword
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",miss nikki nu nu already shes always there whe...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t..."
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so had dream last night remember sign which cl...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ..."
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]","[it, is, raining, again]"
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]","[MENTION, wish, was, in, la, right, now]"


In [217]:
# Think if you want to do stratify
X_train, X_test, y_train, y_test = train_test_split(df['Tweet_final_sent'], df['Polarity'], stratify=df['Polarity'], test_size=0.1, random_state=2)

In [218]:
# X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

In [219]:
X_train.shape

(72000,)

In [220]:
X_test.shape

(8000,)

In [79]:
# X_dev.shape

In [80]:
X_train


36755    gorgeou day had cute mo old thi morn and sale ...
69117    work doubl on friday leav you with much wors t...
13517                             hate how bore monday are
7955                              my tooth still isnt outt
1281     now am sad wa talk about my babi brother who h...
                               ...                        
1851             mention ahh thank you it is fix now enjoy
39409    walk over tri hashtag michelleobama is speach ...
24603    my pooky here now let is see how long until he...
79946    mention veri excit about my weekend in nyc che...
51873    predepartur catastroph had to take ratbag to t...
Name: Tweet_final_sent, Length: 72000, dtype: object

In [81]:
import csv, collections
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import roc_curve, auc

In [82]:
def load_sent_word_net():
    sent_scores = collections.defaultdict(list)

    with open("../content/drive/MyDrive/COL772_A2/SentiWordNet_3.0.0.txt","r") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='"')

        for line in reader:
            if line[0].startswith("#"):
                continue
            if len(line) == 1:
                continue
            POS, ID, PosScore, NegScore, SynsetTerms, Glos = line
            if len(POS) == 0 or len(ID) == 0:
                continue
            for term in SynsetTerms.split(" "):
                term = term.split('#')[0]
                # print(term)
                term = term.replace("-", " ").replace("_", " ")
                key = "%s/%s" % (POS, term)
                # print(key)
                sent_scores[key].append((float(PosScore), float(NegScore)))
                # print(sent_scores)
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)

        return sent_scores


sent_word_net = load_sent_word_net()

In [83]:
class LinguisticVectorizer(BaseEstimator):

    def get_feature_names(self):
        return np.array(['sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs'])

    def fit(self, documents, y=None):
        return self

    def _get_sentiments(self, d):
        sent = tuple(d.split())
        tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        i = 0
        for w, t in tagged:

            p, n = 0, 0
            sent_pos_type = None
            if t.startswith("NN"):
                #noun
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                #adjective
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                #verb
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                #adverb
                sent_pos_type = "r"
                adverbs += 1
            else:
                sent_pos_type = "Nan"

                i += 1
                l = len(sent) - i

                if l == 0:
                    l = 1
                else:
                    pass

            if sent_pos_type is not None:

                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]
                elif sent_word == "Nan":
                    p, n = 0, 0

                pos_vals.append(p)
                neg_vals.append(n)

        if i == 0:
            l = len(sent)
        else:
            pass

        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)

        return [avg_pos_val, avg_neg_val, nouns / l, adjectives / l, verbs / l, adverbs / l]

    # print(_get_sentiments('This be fantastic'))

    def transform(self, documents):
        pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
        result = np.array([pos_val, neg_val, nouns, adjectives, verbs, adverbs]).T

        return result

In [84]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV

In [85]:
# Define functions
def create_baseline_models():
    """Create list of baseline models."""
    models = []
    models.append(('log', LogisticRegression(random_state=123, 
                                             max_iter=1000)))
    models.append(('sgd', SGDClassifier(random_state=123)))
    models.append(('mnb', MultinomialNB()))
    return models

def assess(X, y, models, cv=5, scoring=['roc_auc', 
                                        'accuracy', 
                                        'f1']):
    """Provide summary of cross validation results for models."""
    results = pd.DataFrame()
    for name, model in models:
        result = pd.DataFrame(cross_validate(model, X, y, cv=cv, 
                                             scoring=scoring))
        mean = result.mean().rename('{}_mean'.format)
        std = result.std().rename('{}_std'.format)
        results[name] = pd.concat([mean, std], axis=0)
    return results.sort_index()

In [86]:
models = create_baseline_models()
models

[('log',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=1000,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('sgd', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                l1_ratio=0.15, learning_rate='optimal', loss='hinge',
                max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
                power_t=0.5, random_state=123, shuffle=True, tol=0.001,
                validation_fraction=0.1, verbose=0, warm_start=False)),
 ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [87]:
# Preprocess the data
vectoriser = TfidfVectorizer(token_pattern=r'[a-z]+', 
                             stop_words='english', 
                             min_df=30, 
                             max_df=.7)
X_train_simpler = vectoriser.fit_transform(X_train)
# Assess the model
assess(X_train_simpler, y_train, models)

Unnamed: 0,log,sgd,mnb
fit_time_mean,0.948748,0.1259,0.022557
fit_time_std,0.114444,0.00525,0.003963
score_time_mean,0.016027,0.016216,0.022632
score_time_std,0.000369,0.000349,0.004119
test_accuracy_mean,0.746097,0.746458,0.739278
test_accuracy_std,0.003333,0.002156,0.002977
test_f1_mean,0.752671,0.757212,0.744791
test_f1_std,0.002869,0.001636,0.001948
test_roc_auc_mean,0.827126,0.824481,0.820055
test_roc_auc_std,0.00355,0.003068,0.004196


In [88]:
# Create a pipeline
pipe = Pipeline([('vectoriser', TfidfVectorizer(token_pattern=r'[a-z]+')),
                 ('model', SGDClassifier(random_state=123))])
# Prepare a random search
param_distributions = {'vectoriser__min_df': np.arange(10, 1000, 10),
                       'vectoriser__max_df': np.linspace(.2, 1, 40),
                       'model__loss': ['log', 'hinge']}
r_search = RandomizedSearchCV(estimator=pipe, param_distributions=param_distributions, 
                              n_iter=30, cv=5, n_jobs=-1, random_state=123)
r_search.fit(X_train, y_train)
# Save results to a dataframe
r_search_results = pd.DataFrame(r_search.cv_results_).sort_values(by='rank_test_score')

In [89]:
columns = [col for col in r_search_results.columns 
           if re.search(r"split|param_", col)]
r_summary = r_search_results[columns].copy()
r_summary.columns = [re.sub(r'_test_score|param_', '', col) 
                     for col in r_summary.columns]
columns = [col.split('__')[1] if '__' in col else col 
           for col in r_summary.columns ]
r_summary.columns = columns
r_summary.head()

Unnamed: 0,min_df,max_df,loss,split0,split1,split2,split3,split4
4,20,0.220513,hinge,0.768056,0.773611,0.774444,0.773819,0.774167
1,100,0.938462,hinge,0.75875,0.761389,0.764236,0.759444,0.759514
11,100,0.528205,log,0.757431,0.762639,0.765139,0.754792,0.757569
18,120,0.241026,hinge,0.750972,0.756319,0.757153,0.754167,0.754306
9,140,0.671795,hinge,0.751181,0.756528,0.757014,0.753958,0.754028


In [90]:
# Create a pipeline
pipe = Pipeline([('vectoriser', TfidfVectorizer(token_pattern=r'[a-z]+', max_df=.6)),
                 ('model', SGDClassifier(random_state=123, loss='hinge'))])
# Prepare a grid search
param_grid = {'vectoriser__min_df': [30, 90, 150],
              'vectoriser__ngram_range': [(1,1), (1,2)],
              'vectoriser__stop_words': [None, 'english'],
              'model__fit_intercept': [True, False]}
g_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)
g_search.fit(X_train, y_train)
# Save results to a dataframe
g_search_results = pd.DataFrame(g_search.cv_results_).sort_values(by='rank_test_score')

In [91]:
# columns = [col for col in g_search_results.columns 
#            if re.search(r"split|param_", col)]
# g_summary = g_search_results[columns+['mean_test_score']].copy()
# g_summary.columns = [re.sub(r'_test_score|param_', '', col) 
#                      for col in g_summary.columns]
# columns = [col.split('__')[1] if '__' in col else col 
#            for col in g_summary.columns ]
# g_summary.columns = columns
# g_summary.head()

In [92]:
# # Create a long dataframe
# g_summary_long = pd.melt(g_summary, 
#                          id_vars=['min_df', 
#                                   'ngram_range', 
#                                   'stop_words', 
#                                   'fit_intercept'], 
#                          value_vars=['split0', 
#                                      'split1', 
#                                      'split2', 
#                                      'split3', 
#                                      'split4'])
# g_summary_long.replace({None: 'None'}, inplace=True)
# # Plot performance
# for param in ['ngram_range', 'stop_words', 'fit_intercept']:
#     plt.figure(figsize=(8,4))
#     plt.title(f'Performance by {param}')
#     sns.boxplot(x='value', y=param, data=g_summary_long, orient='h')
#     plt.xlim(.85, .95);

In [93]:
pipe = Pipeline([('vectoriser', TfidfVectorizer(token_pattern=r'[a-z]+', min_df=30, max_df=.6, ngram_range=(1,2))),
                 ('model', SGDClassifier(random_state=123, loss='hinge'))])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectoriser',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.6, max_features=None,
                                 min_df=30, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False, token_pat...
                ('model',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                  

In [94]:
coefs = pd.DataFrame(pipe['model'].coef_, 
                     columns=pipe['vectoriser'].get_feature_names())
coefs = coefs.T.rename(columns={0:'coef'}).sort_values('coef')
coefs

Unnamed: 0,coef
not,-5.554187
miss,-5.223724
sad,-5.031330
wish,-3.460169
sorri,-3.267135
...,...
happi,2.223506
love,2.342490
good,2.426037
thank,2.572014


In [221]:
coef_pos_set = set(coefs.iloc[np.where(coefs['coef'] > 1)].index.tolist())
coef_neg_set = set(coefs.iloc[np.where(coefs['coef'] < -1)].index.tolist())

In [222]:
coef_pos_set

{'actual',
 'amaz',
 'awesom',
 'beauti',
 'best',
 'birthday',
 'blast',
 'bless',
 'btw',
 'cant wait',
 'congrat',
 'cool',
 'cute',
 'enjoy',
 'excit',
 'follow',
 'fun',
 'funni',
 'glad',
 'good',
 'goodnight',
 'gorgeou',
 'great',
 'ha ha',
 'happi',
 'hehe',
 'heheh',
 'hello',
 'hey',
 'it work',
 'laugh',
 'listen',
 'love',
 'lt',
 'mind',
 'name',
 'new',
 'nice',
 'not bad',
 'not need',
 'not problem',
 'not wait',
 'not worri',
 'perfect',
 'pleasur',
 'proud',
 'readi',
 'relax',
 'smile',
 'song',
 'sure',
 'sweet',
 'thank',
 'twitter',
 'url',
 'watch',
 'with',
 'with my',
 'woohoo',
 'work on',
 'worth',
 'yay',
 'ye',
 'you',
 'you want',
 'your'}

In [223]:
def words_freq(tweet):
  num_pos = len(set(tweet).intersection(coef_pos_set))
  num_neg = len(set(tweet).intersection(coef_neg_set))
  
  # If there exist positive words in the tweet
  if num_pos:
      for num in range(num_pos):
          tweet.append('POSITIVE')
  if num_neg:
      for num in range(num_neg):
          tweet.append('NEGATIVE')
  return tweet

In [224]:
# df['coef_pos'] = df['Twitter_final_sent'].str.contains('').value_counts()

In [225]:
df['Tweet_lexicons'] = df['Tweet_stopword'].apply(words_freq)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised,Tweet_stopword,Tweet_lexicons
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",miss nikki nu nu already shes always there whe...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t..."
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so had dream last night remember sign which cl...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ..."
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ..."
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]","[it, is, raining, again]","[it, is, raining, again]"
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]","[MENTION, wish, was, in, la, right, now, NEGAT...","[MENTION, wish, was, in, la, right, now, NEGAT..."


In [226]:
df = make_sentences(df, 'Tweet_lexicons', 'Tweet_final_sent_lexicons')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_pure_string,Tweet_token,Tweet_final_sent,Tweet_sent,Tweet_normalised,Tweet_stopword,Tweet_lexicons,Tweet_final_sent_lexicons
0,0,i miss nikki nu nu already shes always there ...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,i miss nikki nu nu already shes always there w...,"[i, miss, nikki, nu, nu, already, shes, always...",miss nikki nu nu already shes always there whe...,i miss nikki nu nu already shes always there w...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, t...",miss nikki nu nu already shes always there whe...
1,0,So I had a dream last night. I remember a sig...,so i had a dream last night. i remember a sign...,so i had a dream last night. i remember a sign...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,so i had a dream last night i remember a sign ...,"[so, i, had, a, dream, last, night, i, remembe...",so had dream last night remember sign which cl...,so i had a dream last night i remember a sign ...,"[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ...","[so, had, dream, last, night, remember, sign, ...",so had dream last night remember sign which cl...
2,0,@girlyghost ohh poor sickly you (((hugs)) ho...,<MENTION> ohh poor sickly you (((hugs)) hope y...,<MENTION> ohh poor sickly you (((hugs)) hope y...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...,MENTION ohh poor sickly you hugs hope you feel...,"[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ...","[MENTION, ohh, poor, sickly, you, hugs, hope, ...",MENTION ohh poor sickly you hugs hope you feel...
3,0,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,it is raining again,"[it, is, raining, again]",it is raining again,it is raining again,"[it, is, raining, again]","[it, is, raining, again]","[it, is, raining, again]",it is raining again
4,0,@MissKeriBaby wish I was in LA right now,<MENTION> wish i was in la right now,<MENTION> wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,MENTION wish i was in la right now,"[MENTION, wish, i, was, in, la, right, now]",MENTION wish was in la right now,MENTION wish i was in la right now,"[MENTION, wish, was, in, la, right, now]","[MENTION, wish, was, in, la, right, now, NEGAT...","[MENTION, wish, was, in, la, right, now, NEGAT...",MENTION wish was in la right now NEGATIVE


In [227]:
# Think if you want to do stratify
X_train, X_test, y_train, y_test = train_test_split(df['Tweet_final_sent_lexicons'], df['Polarity'], stratify=df['Polarity'], test_size=0.1, random_state=2)

In [None]:
train_pred = pipe.predict(X_train)
print(classification_report(train_pred, 
                            y_train, 
                            target_names=['negative', 'positive']))

In [None]:
test_pred = pipe.predict(X_test)
print(classification_report(test_pred, 
                            y_test, 
                            target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.78      0.80      0.79     77463
    positive       0.81      0.78      0.80     82537

    accuracy                           0.79    160000
   macro avg       0.79      0.79      0.79    160000
weighted avg       0.79      0.79      0.79    160000



In [None]:
for i in range(10):
    lead = X_test.sample(1)
    %timeit pipe.predict(lead)

1000 loops, best of 5: 674 µs per loop
1000 loops, best of 5: 680 µs per loop
1000 loops, best of 5: 669 µs per loop
1000 loops, best of 5: 677 µs per loop
1000 loops, best of 5: 688 µs per loop
1000 loops, best of 5: 653 µs per loop
1000 loops, best of 5: 697 µs per loop
1000 loops, best of 5: 672 µs per loop
1000 loops, best of 5: 664 µs per loop
1000 loops, best of 5: 671 µs per loop


In [None]:
pos_words = coefs[coefs['coef']>0].index.tolist()
neg_words = coefs[coefs['coef']<0].index.tolist()

In [None]:
pos_words_top = coefs[coefs['coef']>1].index.tolist()
neg_words_top = coefs[coefs['coef']<-1].index.tolist()

In [None]:
import pickle
pickle.dump(pos_words_top, open('/content/drive/MyDrive/COL772_A2/pos_words.txt', 'wb'))
pickle.dump(neg_words_top, open('/content/drive/MyDrive/COL772_A2/neg_words.txt', 'wb'))

In [None]:
# tfidf_ngrams = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
# ling_stats = LinguisticVectorizer()
# all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
# clf = MultinomialNB(alpha=5)

# pipeline = Pipeline([('all', all_features), ('clf', clf)])

# pipeline.fit(X_train, y_train)

In [95]:
tfidf_ngrams = TfidfVectorizer(ngram_range=(1,3))
ling_stats = LinguisticVectorizer()
# all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
clf = MultinomialNB(alpha=5)

pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=5, class_prior=None, fit_prior=True))],
         verbose=False)

In [96]:
# pd.DataFrame(pipeline.predict(X_test)).value_counts()

In [97]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
y_pred_self = pipeline.predict(X_test)

In [98]:
print('F1 Score: ', f1_score(y_test, y_pred_self))

F1 Score:  0.774746687451286


In [99]:
sum(y_pred_self == y_test)/len(y_test)

0.78325

In [100]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_self).ravel()
(tp, fp, tn, fn)

(2982, 700, 3284, 1034)

In [101]:
confusion_matrix(y_test, y_pred_self)

array([[3284,  700],
       [1034, 2982]])

In [228]:
tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3))
# ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])

clf = LogisticRegression(penalty='l1',
                         solver='saga',
                         multi_class='multinomial',
                         tol=1e-5,
                         n_jobs = -1,
                         max_iter = 1000)

pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
print('F1 Score: ', f1_score(y_test, y_pred_lr))
sum(y_pred_lr == y_test)/len(y_test)

KeyboardInterrupt: ignored

In [134]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()
(tp, fp, tn, fn)

(3025, 739, 3245, 991)

In [229]:
tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3))
# ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])

clf = LogisticRegression(penalty='l2',
                         solver='saga',
                         multi_class='multinomial',
                         tol=1e-5,
                         n_jobs = -1)

pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
print('F1 Score: ', f1_score(y_test, y_pred_lr))
sum(y_pred_lr == y_test)/len(y_test)

F1 Score:  0.789614356624666


0.793375

In [104]:
tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3))
# ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])

clf = LogisticRegression(penalty='elasticnet',
                         solver='saga',
                         multi_class='multinomial',
                         tol=1e-5,
                         n_jobs = -1)

pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
print('F1 Score: ', f1_score(y_test, y_pred_lr))
sum(y_pred_lr == y_test)/len(y_test)

ValueError: ignored

In [None]:
pd.DataFrame(y_test)

In [105]:
# tfidf_ngrams = TfidfVectorizer(ngram_range=(1,3))
# ling_stats = LinguisticVectorizer()
# # all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
# clf = RandomForestClassifier(max_depth=2, random_state=0)

# pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

# pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=2, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
 

In [106]:
# y_pred_rf = pipeline.predict(X_test)
# print('F1 Score: ', f1_score(y_test, y_pred_rf))
# sum(y_pred_rf == y_test)/len(y_test)

F1 Score:  0.698146595883394


0.594875

In [None]:
res = pd.DataFrame({'Prediction': y_pred_self, 'True':y_test})

In [None]:
res

In [None]:
pd.set_option('max_colwidth', 400)

In [None]:
df.iloc[res[res['Prediction'] != res['True']].index][['Polarity', 'Tweet', 'Tweet_final_sent']]

In [None]:
df.iloc[res[res['Prediction'] == res['True']].index][['Polarity', 'Tweet', 'Tweet_final_sent']]

In [None]:
df.iloc[16775]

In [None]:
# set(stopwords.words('english')).difference(['not', 'very'])

In [None]:
tfidf_ngrams = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
# ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
clf = MultinomialNB(alpha=5)

pipeline = Pipeline([('all', all_features), ('clf', clf)])

pipeline.fit(X_train, y_train)
y_pred_nb = pipeline.predict(X_test)
print('F1 Score: ', f1_score(y_test, y_pred_nb, pos_label=4))
print(sum(y_pred_nb == y_test)/len(y_test))

In [None]:
len(X_test)

In [None]:
tfidf_ngrams = TfidfVectorizer(min_df=20, ngram_range=(1, 1))
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])

clf = LogisticRegression(penalty='l2',
                         solver='lbfgs',
                         multi_class='multinomial',
                         tol=1e-5,
                         n_jobs = -1)

pipeline = Pipeline([('all', all_features), ('clf', clf)])

pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
print('F1 Score: ', f1_score(y_test, y_pred_lr, pos_label=4))
sum(y_pred_lr == y_test)/len(y_test)

In [None]:
tfidf_ngrams.get_feature_names()

In [None]:
import pickle

In [None]:
pickle.dump(pipeline, open('/content/drive/MyDrive/COL772_A2/model_25.txt', 'wb'))

In [None]:
pos_st = df[df['Polarity'] == 4]['Tweet_normalised'].apply(str.split).sum()
neg_st = df[df['Polarity'] == 0]['Tweet_normalised'].apply(str.split).sum()

In [None]:
pos_uni_freq = FreqDist(ngrams(pos_st, 1))
neg_uni_freq = FreqDist(ngrams(neg_st, 1))
pos_bi_freq = FreqDist(ngrams(pos_st, 2))
neg_bi_freq = FreqDist(ngrams(neg_st, 2))
pos_tri_freq = FreqDist(ngrams(pos_st, 3))
neg_tri_freq = FreqDist(ngrams(neg_st, 3))

In [None]:
pos_uni_top = pos_uni_freq.most_common(1000)
neg_uni_top = neg_uni_freq.most_common(1000)
pos_bi_top = pos_bi_freq.most_common(1000)
neg_bi_top = neg_bi_freq.most_common(1000)
pos_tri_top = pos_tri_freq.most_common(1000)
neg_tri_top = neg_tri_freq.most_common(1000)

In [None]:
pos_uni_top

[(('get',), 3923),
 (('go',), 3437),
 (('laugh',), 3364),
 (('good',), 3222),
 (('love',), 2983),
 (('day',), 2701),
 (('like',), 2011),
 (('thanks',), 1767),
 (('time',), 1686),
 (('well',), 1591),
 (('u',), 1581),
 (('see',), 1562),
 (('today',), 1525),
 (('know',), 1443),
 (('work',), 1413),
 (('make',), 1411),
 (('one',), 1397),
 (('new',), 1373),
 (('think',), 1369),
 (('great',), 1326),
 (('night',), 1209),
 (('watch',), 1192),
 (('back',), 1185),
 (('look',), 1126),
 (('oh',), 1094),
 (('would',), 1056),
 (('twitter',), 1052),
 (('come',), 1041),
 (('morning',), 1016),
 (('happy',), 981),
 (('hope',), 963),
 (('really',), 915),
 (('fun',), 905),
 (('wait',), 898),
 (('much',), 886),
 (('want',), 876),
 (('say',), 871),
 (('need',), 854),
 (('nice',), 818),
 (('home',), 806),
 (('thank',), 805),
 (('take',), 761),
 (('hey',), 757),
 (('tomorrow',), 750),
 (('still',), 716),
 (('yeah',), 703),
 (('tweet',), 698),
 (('follow',), 698),
 (('yes',), 686),
 (('awesome',), 686),
 (('thi

In [None]:
def get_top_words(sent_list):
    word_list = []
    for i in range(len(sent_list)):
        word_list.append(sent_list[i][0][0])
    return word_list

In [None]:
pos_uni_top_words = get_top_words(pos_uni_top)
neg_uni_top_words = get_top_words(neg_uni_top)
pos_bi_top_words = get_top_words(pos_bi_top)
neg_bi_top_words = get_top_words(neg_bi_top)
pos_tri_top_words = get_top_words(pos_tri_top)
neg_tri_top_words = get_top_words(neg_tri_top)

In [None]:
print(len(pos_uni_top_words))
print(len(neg_uni_top_words))


1000
1000


In [None]:
uni_top_common = set(set(pos_uni_top_words) & set(neg_uni_top_words))
pos_best_words = list(set(pos_uni_top_words) - uni_top_common)
neg_best_words = list(set(neg_uni_top_words) - uni_top_common)

In [None]:
uni_top_common_list = list(uni_top_common)

In [None]:
pos_best_words

['aha',
 'usually',
 'interesting',
 'hilarious',
 'nah',
 'proud',
 'choose',
 'lovin',
 'thanx',
 'promise',
 'draw',
 'excited',
 'thankyou',
 'bath',
 'appreciate',
 'gorgeous',
 'wo',
 'ahead',
 'sexy',
 'wop',
 'j',
 'ai',
 'positive',
 'official',
 'review',
 'heh',
 'quote',
 'surprise',
 'germany',
 'hun',
 'pizza',
 'consider',
 'john',
 'sex',
 'war',
 'perfect',
 'profile',
 'topic',
 'matt',
 'choice',
 'demi',
 'fav',
 'design',
 'jk',
 'rd',
 'shal',
 'anyways',
 'indeed',
 'dany',
 'often',
 'congratulation',
 'paint',
 'usual',
 'etc',
 'hanah',
 'experience',
 'adorable',
 'shin',
 'anniversary',
 'aka',
 'brilliant',
 'sweetie',
 'bake',
 'unles',
 'prepare',
 'direct',
 'count',
 'folk',
 'sky',
 'studio',
 'view',
 'quiet',
 'gift',
 'tweps',
 'everybody',
 'space',
 'color',
 'fresh',
 'fabulous',
 'lake',
 'joe',
 'include',
 'bright',
 'peace',
 'info',
 'detail',
 'chilin',
 'treat',
 'michael',
 'doin',
 'wave',
 'american',
 'tip',
 'drinking',
 'secret',
 'k

In [None]:
neg_best_words

['delay',
 'completely',
 'cough',
 'disappointed',
 'fell',
 'cancel',
 'thunder',
 'dnt',
 'kate',
 'ew',
 'piss',
 'blow',
 'terrible',
 'uh',
 'shift',
 'lonely',
 'throat',
 'nightmare',
 'fever',
 'however',
 'blood',
 'confuse',
 'stuck',
 'starbucks',
 'to',
 'broken',
 'traffic',
 'trouble',
 'sat',
 'somewhere',
 'unfortunately',
 'screw',
 'ring',
 'ac',
 'shitty',
 'bank',
 'rid',
 'board',
 'entire',
 'couldnt',
 'shes',
 'painful',
 'wasnt',
 'wah',
 'floor',
 'rip',
 'crappy',
 'doctor',
 'mouth',
 'freeze',
 'key',
 'dread',
 'blackberry',
 'block',
 'laundry',
 'anywhere',
 'co',
 'exhaust',
 'burnt',
 'vet',
 'bummer',
 'blah',
 'no',
 'bo',
 'atm',
 'there',
 'san',
 'shut',
 'slept',
 'arm',
 'tummy',
 'accident',
 'left',
 'wat',
 'france',
 'knee',
 'shame',
 'ear',
 'bored',
 'guted',
 'duno',
 'science',
 'sadly',
 'ouch',
 'leg',
 'report',
 'fml',
 'aint',
 'headache',
 'assignment',
 'darn',
 'revision',
 'depress',
 'weight',
 'schedule',
 'doesnt',
 'gah',


In [None]:
len(pos_best_words)

205

In [None]:
# tfidf_ngrams = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
# # ling_stats = LinguisticVectorizer()
# all_features = FeatureUnion([('tfidf', tfidf_ngrams), ('pos', pos_best_words), ('neg', neg_best_words)])
# clf = MultinomialNB(alpha=1)

# pipeline = Pipeline([('all', all_features), ('clf', clf)])

# pipeline.fit(X_train, y_train)
# y_pred_nb = pipeline.predict(X_test)
# print('F1 Score: ', f1_score(y_test, y_pred_nb, pos_label=4))
# print(sum(y_pred_nb == y_test)/len(y_test))

In [None]:
df['Tweet'].iloc[50]

In [None]:
df['Tweet_sent'].iloc[50]

In [None]:
data.iloc[786897,:]

In [None]:
df.iloc[786897]

In [None]:
def dummy(tweet):
    return tweet

In [None]:
cv = CountVectorizer(  
                      tokenizer=dummy,
                      preprocessor=dummy,
                      ngram_range=(1,1)
                    )

In [None]:
print("Hi")

In [None]:
X = df['Polarity', 'Tweet_final_sent']
X_train, X_test, y_train, y_test = train_test_split(X['Tweet_final_sent'], X['Polarity'], test_size=0.25, random_state=2)

In [None]:
# X = cv.fit_transform(df['Tweet_lemma']).toarray()

In [None]:
X

In [None]:
X.shape

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
# X_train = X[:80000,:]
# X_test = X[80000:,:]
# y_train = df['Polarity'][:80000]
# y_test = df['Polarity'][80000:]

In [None]:
X_train.shape

In [None]:
X_train

In [None]:
def model_run(model, X_train, y_train):
    model.fit(X_train, y_train)

In [None]:
def model_predict(model, X_test, y_test):
    print('Accuracy is: ', model.score(X_test, y_test)*100)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [None]:
model = MultinomialNB()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

In [None]:
# model = LogisticRegression()
# model_run(model, X_train, y_train)
# model_predict(model, X_test, y_test)

In [None]:
model_predict(model, X_test, y_test)

In [None]:
model = LinearSVC()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

In [None]:
df.head()

In [None]:
tfidf_counts = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,1)) # number of n-grams
tfidf_data = tfidf_counts.fit_transform(df['Tweet_sent'])

In [None]:
# tfidf_counts = TfidfVectorizer()
# tfidf_data = tfidf_counts.fit_transform(a)

In [None]:
tfidf_data.shape

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
# model = LogisticRegression()
# model_run(model, X_train_tfidf, y_train_tfidf)
# model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
def remove_extra_words(tweet):
    tweet = [word for word in tweet if word in uni_top_common_list]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [None]:
df['Tweet_remove_extra'] = df['Tweet_lemma'].apply(remove_extra_words)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, shes, always, need, thank, xxx]"
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, dream, last, night, remember, sign, tell,..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hug, hope, feel, little, good, soon]"
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, LA, right]"


In [None]:
df = make_sentences(df, 'Tweet_remove_extra', 'Tweet_final_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra,Tweet_final_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, shes, always, need, thank, xxx]",miss already shes always need thank xxx
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, dream, last, night, remember, sign, tell,...",So dream last night remember sign tell get job...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hug, hope, feel, little, good, soon]",hug hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, LA, right]",wish LA right


In [None]:
tfidf_counts_clean = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,2)) # number of n-grams
tfidf_data_clean = tfidf_counts_clean.fit_transform(df['Tweet_final_sent'])

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data_clean, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

(30000, 69746)
(10000, 69746)
(30000,)
(10000,)


In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  70.34
              precision    recall  f1-score   support

           0       0.71      0.70      0.70      5048
           4       0.70      0.71      0.70      4952

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000



In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  69.61
              precision    recall  f1-score   support

           0       0.71      0.66      0.69      5048
           4       0.68      0.73      0.70      4952

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000



In [None]:
model = LogisticRegression()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  71.04
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      5048
           4       0.70      0.74      0.72      4952

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000



In [None]:
from sklearn.pipeline import Pipeline

In [None]:
text_clf = Pipeline([
    ('tfidf',TfidfVectorizer(preprocessor=None,
                             tokenizer=word_tokenize,
                             analyzer='word',
                             stop_words=None,
                             strip_accents=None,
                             lowercase=True,
                             ngram_range=(1,3),
                             min_df=0.0001,
                             max_df=0.9,
                             binary=False,
                             norm='l2',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1)),
    ('clf', LogisticRegression(penalty='l2',
                               solver='saga',
                               multi_class='multinomial',
                              tol=1e-5,
                              n_jobs = -1)),
])

In [None]:

text_clf.fit(X_train,y_train)