In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [331]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import sentiwordnet as swn
from nltk import ngrams, FreqDist
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
data_neg = pd.read_csv('/content/drive/MyDrive/COL772_A2/training_negative.csv', encoding='latin-1')
data_pos = pd.read_csv('/content/drive/MyDrive/COL772_A2/training_positive.csv', encoding='latin-1')

In [6]:
data = data_neg.append(data_pos)
data = data.iloc[:,1:]

In [73]:
data.tail()

Unnamed: 0,Polarity,Tweet
799995,4,Just woke up. Having no school is the best fee...
799996,4,TheWDB.com - Very cool to hear old Walt interv...
799997,4,Are you ready for your MoJo Makeover? Ask me f...
799998,4,Happy 38th Birthday to my boo of alll time!!! ...
799999,4,happy #charitytuesday @theNSPCC @SparksCharity...


## Sampling Data for Trying out Approaches

In [217]:
# df = data.sample(frac=0.2)
df = data.iloc[:30000]
df.head()

Unnamed: 0,Polarity,Tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Pre-Processing Text

In [218]:
def clean_text(tweet):
    tweet = tweet.lower()                                   # Converting to lower case
    tweet = re.sub(r'\b\w+@[^\s]+', ' ', tweet)             # Removing email IDs
    tweet = re.sub(r'@[^\s]+', ' ', tweet)                  # Removing mentions
    tweet = re.sub(r'https?:\/[^\s]+', ' ', tweet)          # Removing URLs
    tweet = re.sub(r'www.[^\s]+', ' ', tweet)               # Removing Websites
    tweet = re.sub(r'#', '', tweet)                         # Removing hashtags
    tweet = re.sub(r'_', ' ', tweet)                        # Sometimes hashtags are done with _ representing break between two words
    tweet = re.sub(r'\.{2,}', ' ', tweet)                   # Removing sentence separators
    tweet = re.sub(r"[0-9]+",' ', tweet)                    # Removing numbers as they do not indicate sentiment
    # tweet = re.sub(r"\b[a-zA-Z]{1}\b", ' ', tweet)        # Removing single letters
    tweet = re.sub(r"\bamp\b", ' ', tweet)                  # Removing &amp signs mis-translated
    tweet = re.sub(r"\bquot\b", ' ', tweet)                
    if len(tweet) == 0:
      tweet = 'None'
    return ' '.join(tweet.split())

In [219]:
def remove_punc(tweet):
    tweet = re.sub(r"[^\w'\s]+",'', tweet)                  # Removing punctuations apart from clitic
    return tweet

In [220]:
clean_text("I am &amp rachit1jain@gmail n't doing good :D") 

"i am & n't doing good :d"

In [221]:
clean_text('@')

'@'

In [222]:
def tweet_word_tokenizer(tweet):
    # return word_tokenize(tweet)
    return tweet.split(' ')

In [223]:
clitics = {
    "nt": 'not',
    "ve": 'have',
    "s": 'is',
    "m": 'am',
    "re": 'are',
    "ll": 'will',
    'd': 'would',
    "bout": 'about',
    'didnt': 'did not',
    'havent': 'have not',
    'hasnt': 'has not',
    'wont': 'will not',
    'wouldnt': 'will not',
    'shouldnt': 'should not',
}

In [224]:
# # count = 0
# def handle_clitics(tweet):
#     # global count
#     # count += 1
#     for i in range(len(tweet)):
#         if tweet[i] in clitics.keys():
#             tweet[i] = clitics[tweet[i]]
#     return tweet

In [225]:
def handle_clitics(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"\bdon't\b", "do not", phrase)
    phrase = re.sub(r"\bdoesn't\b", "does not", phrase)
    phrase = re.sub(r"\bdidn't\b", "did not", phrase)
    phrase = re.sub(r"\bdidnt\b", "did not", phrase)
    phrase = re.sub(r"\bhasn't\b", "has not", phrase)
    phrase = re.sub(r"\bhaven't\b", "have not", phrase)
    phrase = re.sub(r"\bhavent\b", "have not", phrase)
    phrase = re.sub(r"\bhadn't\b", "had not", phrase)
    phrase = re.sub(r"\bwon't\b", "will not", phrase)
    phrase = re.sub(r"\bwouldn't\b", "would not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)

    # using regular expressions to expand the contractions
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    return phrase

In [226]:
def stopword_removal(tweet):
    # stopwords = stopwords.words('english')
    tweet = [word for word in tweet if word not in stopwords.words('english')]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [227]:
# stopwords.words('english')

In [228]:
short_forms = {
    'n': 'and',
    'ya': 'you',
    'luv': 'love',
    'lol': 'laugh',
    'k': 'okay',
    'na': 'no',
    'ily': 'love',
    'im': 'am',
    'morn': 'morning',
    'nght': 'night',
    'no': 'not',
    'Ill': 'will',
    'shoulda': 'should have'
    }

In [229]:
def handle_shortforms(tweet):
    temp = ''
    for word in tweet.split():
        if word in short_forms.keys():
            temp = temp + ' ' + short_forms[word]
        else:
            temp = temp + ' ' + word
    return ' '.join(temp.split())

In [230]:
# handle_shortforms(['I','am','lol','in','practice'])

In [231]:
handle_shortforms('I am a good boy shoulda gone')

'I am a good boy should have gone'

In [254]:
## Maintaining only letters within a tweet and removing every other information since not indicative of sentiment
def maintain_letters(tweet):
    tweet = re.sub(r'[^a-z]', ' ', tweet)      # since lowering has already been done
    return ' '.join(tweet.split())

In [257]:
maintain_letters('i am a good boy. hero is @terohja 909')

'i am a good boy hero is terohja'

In [238]:
### TO BE MODIFIED ####
# Emoticons store a lot of information
emo_info = {
    # positive emoticons
    ":‑)": " happy ",
    ":)": " happy ",
    ";)": " happy ",
    ":-}": " good ",
    "=]": " good ",
    "=)": " good ",
    ";d": " laugh ",
    ":d": " laugh ",
    ":dd": " laugh ",
    "xd": " laugh ",
    ":p": " tease ",
    "xp": " tease ",
    "<3": " love ",

    # negativve emoticons
    ":‑(": " sad ",
    ":‑[": " sad ",
    ":(": " sad ",
    "=(": " sad ",
    "=/": " sad ",
    ":{": " sad ",
    ":/": " upset ",
    ":|": " upset ",
    ":-/": " upset ",
    ":o": " shock "

}

In [243]:
### TO BE MODIFIED ####
emo_info_order = [k for (k_len, k) in reversed(sorted([(len(k), k) for k in emo_info.keys()]))]

In [244]:
### TO BE MODIFIED ####
def emo_repl(phrase):
    for k in emo_info_order:
        phrase = phrase.replace(k, emo_info[k])
    return phrase

In [245]:
df['Tweet_regex'] = df['Tweet'].apply(clean_text)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ..."


In [247]:
df['Tweet_emoji'] = df['Tweet_regex'].apply(emo_repl)
df.head()

   Polarity  ...                                        Tweet_emoji
0         0  ...  - awww, that's a bummer. you shoulda got david...
1         0  ...  is upset that he can't update his facebook by ...
2         0  ...  i dived many times for the ball. managed to sa...
3         0  ...     my whole body feels itchy and like its on fire
4         0  ...  no, it's not behaving at all. i'm mad. why am ...

[5 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [248]:
df['Tweet_nopunc'] = df['Tweet_emoji'].apply(remove_punc)
df.head()

   Polarity  ...                                       Tweet_nopunc
0         0  ...   awww that's a bummer you shoulda got david ca...
1         0  ...  is upset that he can't update his facebook by ...
2         0  ...  i dived many times for the ball managed to sav...
3         0  ...     my whole body feels itchy and like its on fire
4         0  ...  no it's not behaving at all i'm mad why am i h...

[5 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [249]:
df['Tweet_clitics'] = df['Tweet_nopunc'].apply(handle_clitics)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...


In [250]:
df['Tweet_shortforms'] = df['Tweet_clitics'].apply(handle_shortforms)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...


In [258]:
df['Tweet_pure_string'] = df['Tweet_shortforms'].apply(maintain_letters)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...


In [259]:
df['Tweet_token'] = df['Tweet_pure_string'].apply(tweet_word_tokenizer)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...


In [260]:
df['Tweet_stopword'] = df['Tweet_token'].apply(stopword_removal)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...


In [261]:
most = 0
for lis in df['Tweet_stopword']:
  most = max(most, len(lis))
most

28

In [132]:
## Was absolutely useless to use
# def stemmer(tweet):
#     porter_stemmer = PorterStemmer()
#     tweet = [porter_stemmer.stem(word) for word in tweet]
#     return tweet

In [133]:
# stemmer(['I','am','playing','making', 'what','I','do'])

In [134]:
# df['Tweet_stem'] = df['Tweet_shortforms'].apply(stemmer)
# df.head()

In [262]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [263]:
#### TO BE MODIFIED ######
def pos_tagging(tweet):
    # tweet = nltk.pos_tag(tweet) 
    tweet = nltk.pos_tag([i for i in tweet if i])
    return tweet

In [264]:
pos_tagging(['','I','am','good'])

[('I', 'PRP'), ('am', 'VBP'), ('good', 'JJ')]

In [265]:
### TO BE MODIFIED ################
def tweet_lemmatizer(tweet):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    pos_wordnet = list(map(lambda x: (x[0], pos_tagger(x[1])), tweet))
    for word, tag in pos_wordnet:
        if tag is None:
            lemmatized.append(word)
        else:       
            lemmatized.append(lemmatizer.lemmatize(word, tag))
    lemmatized = ' '.join(lemmatized)
    lemmatized_sent = ', '.join(lemmatized)
    # print(list(lemmatized.split()))
    return list(lemmatized.split())

In [266]:
df['Tweet_pos'] = df['Tweet_stopword'].apply(pos_tagging)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string,Tweet_pos
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...,"[(awww, JJ), (bummer, NN), (got, VBD), (david,..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...,"[(upset, JJ), (update, JJ), (facebook, NN), (t..."
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...,"[(dived, VBD), (many, JJ), (times, NNS), (ball..."
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire,"[(whole, JJ), (body, NN), (feels, NNS), (itchy..."
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...,"[(behaving, VBG), (mad, JJ), (see, NN)]"


In [268]:
df.iloc[0]['Tweet_pure_string']

'awww that is a bummer you should have got david carr of third day to do it laugh'

In [269]:
df['Tweet_lemma'] = df['Tweet_pos'].apply(tweet_lemmatizer)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string,Tweet_pos,Tweet_lemma
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...,"[(awww, JJ), (bummer, NN), (got, VBD), (david,...","[awww, bummer, get, david, carr, third, day, l..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...,"[(upset, JJ), (update, JJ), (facebook, NN), (t...","[upset, update, facebook, texting, might, cry,..."
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...,"[(dived, VBD), (many, JJ), (times, NNS), (ball...","[dive, many, time, ball, manage, save, rest, g..."
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire,"[(whole, JJ), (body, NN), (feels, NNS), (itchy...","[whole, body, feel, itchy, like, fire]"
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...,"[(behaving, VBG), (mad, JJ), (see, NN)]","[behave, mad, see]"


In [270]:
def make_sentences(df, col, title):
    df[title] = df[col].apply(lambda x:' '.join([i for i in x]))
    return df

In [271]:
df = make_sentences(df, 'Tweet_lemma', 'Tweet_sent')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string,Tweet_pos,Tweet_lemma,Tweet_sent
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...,"[(awww, JJ), (bummer, NN), (got, VBD), (david,...","[awww, bummer, get, david, carr, third, day, l...",awww bummer get david carr third day laugh
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...,"[(upset, JJ), (update, JJ), (facebook, NN), (t...","[upset, update, facebook, texting, might, cry,...",upset update facebook texting might cry result...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...,"[(dived, VBD), (many, JJ), (times, NNS), (ball...","[dive, many, time, ball, manage, save, rest, g...",dive many time ball manage save rest go bound
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire,"[(whole, JJ), (body, NN), (feels, NNS), (itchy...","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...,"[(behaving, VBG), (mad, JJ), (see, NN)]","[behave, mad, see]",behave mad see


In [279]:
wordnet.synsets('helloo')

[]

In [296]:
## TO BE MODIFIED #####
def normalisation_words(tweet):
    tweet = tweet.replace(r'([a-z])\1{1,}', r'\1\1')
    tweet = ' '.join([word if len(wordnet.synsets(word)) > 0 else re.sub(r'([a-z])\1{1,}', r'\1', word) for word in tweet.split()])
    tweet = tweet.replace(r'(ha)\1{1,}', r'\1')
    return tweet

In [297]:
normalisation_words('awww hahahahahaha bummer get david carr third day laugh')

'aw hahahahahaha bummer get david car third day laugh'

In [299]:
df['Tweet_normalised'] = df['Tweet_sent'].apply(normalisation_words)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_normalised
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...,"[(awww, JJ), (bummer, NN), (got, VBD), (david,...","[awww, bummer, get, david, carr, third, day, l...",awww bummer get david carr third day laugh,aw bummer get david car third day laugh
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...,"[(upset, JJ), (update, JJ), (facebook, NN), (t...","[upset, update, facebook, texting, might, cry,...",upset update facebook texting might cry result...,upset update facebok texting might cry result ...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...,"[(dived, VBD), (many, JJ), (times, NNS), (ball...","[dive, many, time, ball, manage, save, rest, g...",dive many time ball manage save rest go bound,dive many time ball manage save rest go bound
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire,"[(whole, JJ), (body, NN), (feels, NNS), (itchy...","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire,whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...,"[(behaving, VBG), (mad, JJ), (see, NN)]","[behave, mad, see]",behave mad see,behave mad see


In [301]:
df.drop(df[df["Tweet_normalised"] == ''].index, inplace=True)
df = df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [302]:
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_emoji,Tweet_nopunc,Tweet_clitics,Tweet_shortforms,Tweet_token,Tweet_stopword,Tweet_pure_string,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_normalised
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david...","- awww, that's a bummer. you shoulda got david...",awww that's a bummer you shoulda got david ca...,awww that is a bummer you shoulda got david c...,awww that is a bummer you should have got davi...,"[awww, that, is, a, bummer, you, should, have,...","[awww, bummer, got, david, carr, third, day, l...",awww that is a bummer you should have got davi...,"[(awww, JJ), (bummer, NN), (got, VBD), (david,...","[awww, bummer, get, david, carr, third, day, l...",awww bummer get david carr third day laugh,aw bummer get david car third day laugh
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can't update his facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,"[is, upset, that, he, can, not, update, his, f...","[upset, update, facebook, texting, might, cry,...",is upset that he can not update his facebook b...,"[(upset, JJ), (update, JJ), (facebook, NN), (t...","[upset, update, facebook, texting, might, cry,...",upset update facebook texting might cry result...,upset update facebok texting might cry result ...
2,0,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...",i dived many times for the ball managed to sav...,"[(dived, VBD), (many, JJ), (times, NNS), (ball...","[dive, many, time, ball, manage, save, rest, g...",dive many time ball manage save rest go bound,dive many time ball manage save rest go bound
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]",my whole body feels itchy and like its on fire,"[(whole, JJ), (body, NN), (feels, NNS), (itchy...","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire,whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...","no, it's not behaving at all. i'm mad. why am ...",no it's not behaving at all i'm mad why am i h...,no it is not behaving at all i am mad why am i...,not it is not behaving at all i am mad why am ...,"[not, it, is, not, behaving, at, all, i, am, m...","[behaving, mad, see]",not it is not behaving at all i am mad why am ...,"[(behaving, VBG), (mad, JJ), (see, NN)]","[behave, mad, see]",behave mad see,behave mad see


In [313]:
# Think if you want to do stratify
X_train, X_test, y_train, y_test = train_test_split(df['Tweet_normalised'], df['Polarity'], test_size=0.1, random_state=2)

In [314]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

In [315]:
X_train.shape

(24256,)

In [316]:
X_test.shape

(2995,)

In [317]:
X_dev.shape

(2696,)

In [323]:
import csv, collections
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import roc_curve, auc

In [325]:
def load_sent_word_net():
    sent_scores = collections.defaultdict(list)

    with open("../content/drive/MyDrive/COL772_A2/SentiWordNet_3.0.0.txt","r") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='"')

        for line in reader:
            if line[0].startswith("#"):
                continue
            if len(line) == 1:
                continue
            POS, ID, PosScore, NegScore, SynsetTerms, Glos = line
            if len(POS) == 0 or len(ID) == 0:
                continue
            for term in SynsetTerms.split(" "):
                term = term.split('#')[0]
                # print(term)
                term = term.replace("-", " ").replace("_", " ")
                key = "%s/%s" % (POS, term)
                # print(key)
                sent_scores[key].append((float(PosScore), float(NegScore)))
                # print(sent_scores)
        for key, value in sent_scores.items():
            sent_scores[key] = np.mean(value, axis=0)

        return sent_scores


sent_word_net = load_sent_word_net()

In [326]:
class LinguisticVectorizer(BaseEstimator):

    def get_feature_names(self):
        return np.array(['sent_pos', 'sent_neg', 'nouns', 'adjectives', 'verbs', 'adverbs'])

    def fit(self, documents, y=None):
        return self

    def _get_sentiments(self, d):
        sent = tuple(d.split())
        tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        i = 0
        for w, t in tagged:

            p, n = 0, 0
            sent_pos_type = None
            if t.startswith("NN"):
                #noun
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                #adjective
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                #verb
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                #adverb
                sent_pos_type = "r"
                adverbs += 1
            else:
                sent_pos_type = "Nan"

                i += 1
                l = len(sent) - i

                if l == 0:
                    l = 1
                else:
                    pass

            if sent_pos_type is not None:

                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]
                elif sent_word == "Nan":
                    p, n = 0, 0

                pos_vals.append(p)
                neg_vals.append(n)

        if i == 0:
            l = len(sent)
        else:
            pass

        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)

        return [avg_pos_val, avg_neg_val, nouns / l, adjectives / l, verbs / l, adverbs / l]

    # print(_get_sentiments('This be fantastic'))

    def transform(self, documents):
        pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
        result = np.array([pos_val, neg_val, nouns, adjectives, verbs, adverbs]).T

        return result

In [328]:
tfidf_ngrams = TfidfVectorizer(min_df=5, ngram_range=(1, 3))
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
clf = MultinomialNB(alpha=5)

pipeline = Pipeline([('all', all_features), ('clf', clf)])

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('all',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('ling',
                                                 LinguisticVectorizer()),
                                                ('tfidf',
                                                 TfidfVectorizer(analyzer='word',
                                                                 binary=False,
                                                                 decode_error='strict',
                                                                 dtype=<class 'numpy.float64'>,
                                                                 encoding='utf-8',
                                                                 input='content',
                                                                 lowercase=True,
                                                                 max_df=1.0,
                                                      

In [343]:
pd.DataFrame(pipeline.predict(X_test)).value_counts()

0    2995
dtype: int64

In [333]:
y_train_pred = pipeline.predict_proba(X_train)[:, 1] 
y_test_pred = pipeline.predict_proba(X_test)[:, 1]
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_pred)



In [338]:
len(y_test_pred)

2995

In [339]:
plt.plot(train_fpr, train_tpr, label="train AUC =" + str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC =" + str(auc(test_fpr, test_tpr)))
plt.legend()
plt.title("AUC PLOTS")
plt.grid()
plt.show()

In [340]:
trauc=round(auc(train_fpr, train_tpr),3)
teauc=round(auc(test_fpr, test_tpr),3)
print('Train AUC=',trauc)
print('Test AUC=',teauc)

In [None]:
pos_st = df[df['Polarity'] == 4]['Tweet_lemma'].sum()
neg_st = df[df['Polarity'] == 0]['Tweet_lemma'].sum()

In [303]:
df.iloc[208]

Polarity                                                             0
Tweet                @twista202 I think I want to read some books b...
Tweet_regex          i think i want to read some books but the libr...
Tweet_emoji          i think i want to read some books but the libr...
Tweet_nopunc         i think i want to read some books but the libr...
Tweet_clitics        i think i want to read some books but the libr...
Tweet_shortforms     i think i want to read some books but the libr...
Tweet_token          [i, think, i, want, to, read, some, books, but...
Tweet_stopword                     [think, want, read, books, library]
Tweet_pure_string    i think i want to read some books but the libr...
Tweet_pos            [(think, NN), (want, VBP), (read, VBN), (books...
Tweet_lemma                         [think, want, read, book, library]
Tweet_sent                                think want read book library
Tweet_normalised                          think want read book library
Name: 

In [None]:
# neg_st

In [None]:
pos_uni_freq = FreqDist(ngrams(pos_st, 1))
neg_uni_freq = FreqDist(ngrams(neg_st, 1))
pos_bi_freq = FreqDist(ngrams(pos_st, 2))
neg_bi_freq = FreqDist(ngrams(neg_st, 2))
pos_tri_freq = FreqDist(ngrams(pos_st, 3))
neg_tri_freq = FreqDist(ngrams(neg_st, 3))

In [None]:
pos_uni_freq

In [None]:
pos_uni_top = pos_uni_freq.most_common(1000)
neg_uni_top = neg_uni_freq.most_common(1000)
pos_bi_top = pos_bi_freq.most_common(1000)
neg_bi_top = neg_bi_freq.most_common(1000)
pos_tri_top = pos_tri_freq.most_common(1000)
neg_tri_top = neg_tri_freq.most_common(1000)

In [None]:
len(pos_uni_top)

In [None]:
def get_top_words(sent_list):
    word_list = []
    for i in range(len(sent_list)):
        word_list.append(sent_list[i][0][0])
    return word_list

In [None]:
pos_uni_top_words = get_top_words(pos_uni_top)
neg_uni_top_words = get_top_words(neg_uni_top)
pos_bi_top_words = get_top_words(pos_bi_top)
neg_bi_top_words = get_top_words(neg_bi_top)
pos_tri_top_words = get_top_words(pos_tri_top)
neg_tri_top_words = get_top_words(neg_tri_top)

In [None]:
print(len(pos_uni_top_words))
print(len(neg_uni_top_words))


In [None]:
uni_top_common = set(set(pos_uni_top_words) & set(neg_uni_top_words))
pos_best_words = list(set(pos_uni_top_words) - uni_top_common)
neg_best_words = list(set(neg_uni_top_words) - uni_top_common)

In [None]:
uni_top_common_list = list(uni_top_common)

In [None]:
pos_best_words

In [None]:
neg_best_words

In [None]:
len(pos_best_words)

In [None]:
df['Tweet'].iloc[50]

In [None]:
df['Tweet_sent'].iloc[50]

In [None]:
data.iloc[786897,:]

In [None]:
df.iloc[786897]

In [None]:
def dummy(tweet):
    return tweet

In [None]:
cv = CountVectorizer(  
                      tokenizer=dummy,
                      preprocessor=dummy,
                      ngram_range=(1,1)
                    )

In [None]:
print("Hi")

In [None]:
X = df['Polarity', 'Tweet_final_sent']
X_train, X_test, y_train, y_test = train_test_split(X['Tweet_final_sent'], X['Polarity'], test_size=0.25, random_state=2)

In [None]:
# X = cv.fit_transform(df['Tweet_lemma']).toarray()

In [None]:
X

In [None]:
X.shape

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
# X_train = X[:80000,:]
# X_test = X[80000:,:]
# y_train = df['Polarity'][:80000]
# y_test = df['Polarity'][80000:]

In [None]:
X_train.shape

In [None]:
X_train

In [None]:
def model_run(model, X_train, y_train):
    model.fit(X_train, y_train)

In [None]:
def model_predict(model, X_test, y_test):
    print('Accuracy is: ', model.score(X_test, y_test)*100)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

In [None]:
model = MultinomialNB()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

In [None]:
# model = LogisticRegression()
# model_run(model, X_train, y_train)
# model_predict(model, X_test, y_test)

In [None]:
model_predict(model, X_test, y_test)

In [None]:
model = LinearSVC()
model_run(model, X_train, y_train)
model_predict(model, X_test, y_test)

In [None]:
df.head()

In [None]:
tfidf_counts = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,1)) # number of n-grams
tfidf_data = tfidf_counts.fit_transform(df['Tweet_sent'])

In [None]:
# tfidf_counts = TfidfVectorizer()
# tfidf_data = tfidf_counts.fit_transform(a)

In [None]:
tfidf_data.shape

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
# model = LogisticRegression()
# model_run(model, X_train_tfidf, y_train_tfidf)
# model_predict(model, X_test_tfidf, y_test_tfidf)

In [None]:
def remove_extra_words(tweet):
    tweet = [word for word in tweet if word in uni_top_common_list]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [None]:
df['Tweet_remove_extra'] = df['Tweet_lemma'].apply(remove_extra_words)
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, shes, always, need, thank, xxx]"
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, dream, last, night, remember, sign, tell,..."
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hug, hope, feel, little, good, soon]"
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain]
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, LA, right]"


In [None]:
df = make_sentences(df, 'Tweet_remove_extra', 'Tweet_final_sent')
df.head()

Unnamed: 0,Polarity,Tweet,Tweet_regex,Tweet_clean,Tweet_stopword,Tweet_clitics,Tweet_shortforms,Tweet_pos,Tweet_lemma,Tweet_sent,Tweet_remove_extra,Tweet_final_sent
514293,0,i miss nikki nu nu already shes always there ...,miss nikki nu nu already shes always there whe...,"[miss, nikki, nu, nu, already, shes, always, t...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[miss, nikki, nu, nu, already, shes, always, n...","[(miss, JJ), (nikki, NN), (nu, JJ), (nu, JJ), ...","[miss, nikki, nu, nu, already, shes, always, n...",miss nikki nu nu already shes always need than...,"[miss, already, shes, always, need, thank, xxx]",miss already shes always need thank xxx
142282,0,So I had a dream last night. I remember a sig...,So had dream last night remember sign which cl...,"[So, had, dream, last, night, remember, sign, ...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[So, dream, last, night, remember, sign, clear...","[(So, RB), (dream, NN), (last, JJ), (night, NN...","[So, dream, last, night, remember, sign, clear...",So dream last night remember sign clearly tell...,"[So, dream, last, night, remember, sign, tell,...",So dream last night remember sign tell get job...
403727,0,@girlyghost ohh poor sickly you (((hugs)) ho...,ohh poor sickly you hugs hope you feel little ...,"[ohh, poor, sickly, you, hugs, hope, you, feel...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[ohh, poor, sickly, hugs, hope, feel, little, ...","[(ohh, JJ), (poor, JJ), (sickly, JJ), (hugs, N...","[ohh, poor, sickly, hug, hope, feel, little, g...",ohh poor sickly hug hope feel little good soon,"[hug, hope, feel, little, good, soon]",hug hope feel little good soon
649503,0,it is raining again,it is raining again,"[it, is, raining, again]",[raining],[raining],[raining],"[(raining, VBG)]",[rain],rain,[rain],rain
610789,0,@MissKeriBaby wish I was in LA right now,wish was in LA right now,"[wish, was, in, LA, right, now]","[wish, LA, right]","[wish, LA, right]","[wish, LA, right]","[(wish, JJ), (LA, NNP), (right, NN)]","[wish, LA, right]",wish LA right,"[wish, LA, right]",wish LA right


In [None]:
tfidf_counts_clean = TfidfVectorizer(tokenizer= word_tokenize, # type of tokenization
                               ngram_range=(1,2)) # number of n-grams
tfidf_data_clean = tfidf_counts_clean.fit_transform(df['Tweet_final_sent'])

In [None]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_data_clean, df['Polarity'], test_size=0.25, random_state=2)

In [None]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_tfidf.shape)
print(y_test_tfidf.shape)

(30000, 69746)
(10000, 69746)
(30000,)
(10000,)


In [None]:
model = MultinomialNB()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  70.34
              precision    recall  f1-score   support

           0       0.71      0.70      0.70      5048
           4       0.70      0.71      0.70      4952

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000



In [None]:
model = LinearSVC()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  69.61
              precision    recall  f1-score   support

           0       0.71      0.66      0.69      5048
           4       0.68      0.73      0.70      4952

    accuracy                           0.70     10000
   macro avg       0.70      0.70      0.70     10000
weighted avg       0.70      0.70      0.70     10000



In [None]:
model = LogisticRegression()
model_run(model, X_train_tfidf, y_train_tfidf)
model_predict(model, X_test_tfidf, y_test_tfidf)

Accuracy is:  71.04
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      5048
           4       0.70      0.74      0.72      4952

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000



In [None]:
from sklearn.pipeline import Pipeline

In [None]:
text_clf = Pipeline([
    ('tfidf',TfidfVectorizer(preprocessor=None,
                             tokenizer=word_tokenize,
                             analyzer='word',
                             stop_words=None,
                             strip_accents=None,
                             lowercase=True,
                             ngram_range=(1,3),
                             min_df=0.0001,
                             max_df=0.9,
                             binary=False,
                             norm='l2',
                             use_idf=1,
                             smooth_idf=1,
                             sublinear_tf=1)),
    ('clf', LogisticRegression(penalty='l2',
                               solver='saga',
                               multi_class='multinomial',
                              tol=1e-5,
                              n_jobs = -1)),
])

In [None]:

text_clf.fit(X_train,y_train)