In [1]:
import pandas as pd
import re
import nltk


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rachitjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data_neg = pd.read_csv('Data/training_negative.csv', encoding='latin-1')
data_pos = pd.read_csv('Data/training_positive.csv', encoding='latin-1')

In [5]:
data = data_neg.append(data_pos)
data = data.iloc[:,1:]

## Sampling Data for Trying out Approaches

In [6]:
df = data.sample(n=40000, random_state=1)
df.tail()

Unnamed: 0,Polarity,Tweet
101123,0,multiple #fail
610384,4,Wow people actually asked me for directions t...
786897,4,@wedylawliet Try &quot;Fly With Me&quot; by th...
457740,0,I already have a case of the Mooondays and i...
149496,0,I've been up for too long and its 8:30 I shou...


## Pre-Processing Text

In [7]:
def clean_text(tweet):
    tweet = tweet.lower()                                   # Converting to lower case
    tweet = re.sub(r'@[^\s]+', ' ', tweet)                  # Removing mentions
    tweet = re.sub(r'https?:\/[^\s]+', ' ', tweet)          # Removing URLs
    tweet = re.sub(r'www.[^\s]+', ' ', tweet)               # Removing Email IDs
    tweet = re.sub(r'#', '', tweet)                         # Removing hashtags
    tweet = re.sub(r'_', ' ', tweet)                        # Sometimes hashtags are done with _ representing break between two words
    tweet = re.sub(r'\.{2,}', ' ', tweet)                   # Segregating sentences
    tweet = re.sub(r"[0-9]+",' ', tweet)                    # Removing numbers as they do not indicate sentiment
    tweet = re.sub(r"[^\w\d\s]+",' ', tweet)                # Removing unnecessary punctuations apart from apostrophe which can be used in clitics
    return ' '.join(tweet.split())

In [8]:
re.sub(r"[^\w\d\s]+",'', "I am n't doing good") 

'I am nt doing good'

In [9]:
def tokenizer(tweet):
    return word_tokenize(tweet)

In [10]:
clitics = {
    "nt": 'not',
    "ve": 'have',
    "s": 'is',
    "m": 'am',
    "re": 'are',
    "ll": 'I will',
    "bout": 'about'
}

In [11]:
# count = 0
def handle_clitics(tweet):
    # global count
    # count += 1
    for i in range(len(tweet)):
        if tweet[i] in clitics.keys():
            tweet[i] = clitics[tweet[i]]
    return tweet

In [12]:
def stopword_removal(tweet):
    tweet = [word for word in tweet if word not in stopwords.words('english')]
    if len(tweet) == 0:
        tweet = ['None']
    return tweet

In [13]:
short_forms = {
    'n': 'and',
    'ya': 'you',
    'luv': 'love',
    'lol': 'laugh',
    'k': 'okay',
    'na': 'no',
    'ily': 'love',
    }

In [14]:
def handle_shortforms(tweet):
    for i in range(len(tweet)):
        if tweet[i] in short_forms.keys():
            tweet[i] = short_forms[tweet[i]]
    return tweet

In [15]:
df['Tweet'] = df['Tweet'].apply(clean_text)
df.head()

Unnamed: 0,Polarity,Tweet
514293,0,i miss nikki nu nu already shes always there w...
142282,0,so i had a dream last night i remember a sign ...
403727,0,ohh poor sickly you hugs hope you feel a littl...
649503,0,it is raining again
610789,0,wish i was in la right now


In [16]:
df['Tweet'] = df['Tweet'].apply(tokenizer)
df.head()

Unnamed: 0,Polarity,Tweet
514293,0,"[i, miss, nikki, nu, nu, already, shes, always..."
142282,0,"[so, i, had, a, dream, last, night, i, remembe..."
403727,0,"[ohh, poor, sickly, you, hugs, hope, you, feel..."
649503,0,"[it, is, raining, again]"
610789,0,"[wish, i, was, in, la, right, now]"


In [17]:
df['Tweet'] = df['Tweet'].apply(stopword_removal)
df.head()

Unnamed: 0,Polarity,Tweet
514293,0,"[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,"[dream, last, night, remember, sign, clearly, ..."
403727,0,"[ohh, poor, sickly, hugs, hope, feel, little, ..."
649503,0,[raining]
610789,0,"[wish, la, right]"


In [18]:
df['Tweet'] = df['Tweet'].apply(handle_clitics)
df.iloc[:30]

Unnamed: 0,Polarity,Tweet
514293,0,"[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,"[dream, last, night, remember, sign, clearly, ..."
403727,0,"[ohh, poor, sickly, hugs, hope, feel, little, ..."
649503,0,[raining]
610789,0,"[wish, la, right]"
67315,0,"[nala, olowalu, still, full, tummy, bread, bas..."
33521,4,"[site, seem, want, load, must, getting, lot, h..."
256032,0,"[time, sleep, hav, actually, work, tmrw]"
657012,0,"[oxford, month]"
180587,4,"[time, cup, tea, fruit, bagels, going, turn, d..."


In [19]:
df['Tweet'] = df['Tweet'].apply(handle_shortforms)
df.iloc[:30]

Unnamed: 0,Polarity,Tweet
514293,0,"[miss, nikki, nu, nu, already, shes, always, n..."
142282,0,"[dream, last, night, remember, sign, clearly, ..."
403727,0,"[ohh, poor, sickly, hugs, hope, feel, little, ..."
649503,0,[raining]
610789,0,"[wish, la, right]"
67315,0,"[nala, olowalu, still, full, tummy, bread, bas..."
33521,4,"[site, seem, want, load, must, getting, lot, h..."
256032,0,"[time, sleep, hav, actually, work, tmrw]"
657012,0,"[oxford, month]"
180587,4,"[time, cup, tea, fruit, bagels, going, turn, d..."


In [20]:
def stemmer(tweet):
    porter_stemmer = PorterStemmer()
    tweet = [porter_stemmer.stem(word) for word in tweet]
    return tweet

In [21]:
stemmer(['I','am','playing','making', 'what','I','do'])

['i', 'am', 'play', 'make', 'what', 'i', 'do']

In [22]:
df['Tweet'] = df['Tweet'].apply(stemmer)
df.iloc[:30]

Unnamed: 0,Polarity,Tweet
514293,0,"[miss, nikki, nu, nu, alreadi, she, alway, nee..."
142282,0,"[dream, last, night, rememb, sign, clearli, to..."
403727,0,"[ohh, poor, sickli, hug, hope, feel, littl, be..."
649503,0,[rain]
610789,0,"[wish, la, right]"
67315,0,"[nala, olowalu, still, full, tummi, bread, bas..."
33521,4,"[site, seem, want, load, must, get, lot, hit]"
256032,0,"[time, sleep, hav, actual, work, tmrw]"
657012,0,"[oxford, month]"
180587,4,"[time, cup, tea, fruit, bagel, go, turn, day, ..."


In [23]:
# # Has to be done after POS Tagging
# def lemmatizer(tweet):
#     word_net = WordNetLemmatizer()
#     tweet = [word_net.lemmatize(word) for word in tweet]
#     return tweet

In [24]:
# lemmatizer(['I','am','playing','making', 'what','I','do'])

In [25]:
# word_net = WordNetLemmatizer()
# word_net.lemmatize('playing', 'v')

In [26]:
df['Tweet'].iloc[50]

['yeah',
 'physio',
 'crap',
 'still',
 'month',
 'away',
 'form',
 'real',
 'physic',
 'activ',
 'couldnt',
 'even',
 'sit',
 'straignthen',
 'leg']

In [30]:
def dummy(tweet):
    return tweet

In [40]:
cv = CountVectorizer(  
                      tokenizer=dummy,
                      preprocessor=dummy
                    )
# X_train = vec.fit_transform(X_train).toarray()
# X_train

# docs = [
#     ['hello', 'world', '.'],
#     ['hello', 'world'],
#     ['again', 'hello', 'world']
# ]



X = cv.fit_transform(df['Tweet']).toarray()
print(len(cv.get_feature_names()))
# ['.', 'again', 'hello', 'world']

23635


In [43]:
X.shape

(40000, 23635)

In [44]:
# cv.get_feature_names()

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, df['Polarity'], test_size=0.25, random_state=42)

In [48]:
X_train.shape

(30000, 23635)

In [49]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [50]:
model.score(X_test, y_test)

0.7394

In [55]:
model.predict(cv.transform(['Hate this app! simply rubbish!']))

array([4])