In [1]:
pip install emot

Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m740.5 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm import tqdm
from bs4 import BeautifulSoup
import time
import nltk
import string
import re
import pickle
import unidecode 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from emot.emo_unicode import EMOTICONS_EMO 
from emot.emo_unicode import EMOJI_UNICODE,UNICODE_EMOJI

import warnings
warnings.filterwarnings('ignore')

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [4]:
bow = CountVectorizer(min_df=5)
tfidf = TfidfVectorizer()

In [5]:
lr_model = LogisticRegression()
mnb_model = MultinomialNB()
bnb_model = BernoulliNB()
dtc_model = DecisionTreeClassifier(max_depth=50)
svc_model = LinearSVC()
rfc_model = RandomForestClassifier(max_depth=25)
gbc_model = GradientBoostingClassifier(verbose=True, learning_rate=1.25, n_estimators=100, max_depth=25)
abc_model = AdaBoostClassifier(learning_rate=0.5)
xgb_model = XGBClassifier(verbosity=2)

In [6]:
def training_model(model,x_train,y_train) :
    model.fit(x_train,y_train)
    return model

In [7]:
def testing_model(model,vectorizer,y_train) :
    pred = model.predict(vectorizer)
    f1 = f1_score(y_train,pred)
    acc = accuracy_score(y_train,pred)
    print(f"F1 Score : {f1}")
    print(f"Acc. : {acc}")

In [8]:
path = "../input/sentiment140/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(path, header=None, encoding='latin')
df.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
df = df[['sentiment','text']]
df.drop_duplicates(inplace=True)
df['sentiment'] = df['sentiment'].apply(lambda x:1 if x == 4 else 0)
df.head()  

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [9]:
X,y = df['text'],df['sentiment']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
# Defining dictionary containing all emojis with their meanings.
emoticons_manual = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

## Defining set containing all stopwords in english.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

# Defining regex patterns.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern       = '@[^\s]+'
alphaPattern      = "[^a-zA-Z0-9]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

wordLemm = WordNetLemmatizer()

emoticons = EMOTICONS_EMO
emoticons.update(emoticons_manual)

In [11]:

def preprocessing_pipeline(textdata):
    textdata = list(textdata)
    processedText = []
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Remove all URls
        tweet = re.sub(urlPattern,'',tweet)
        # Replace all emojis.
        for emot in EMOTICONS_EMO:
            tweet = re.sub(u'('+re.escape(emot)+')', " " + "_".join(EMOTICONS_EMO[emot].replace(",","").split())+" ", tweet)
        # Remove @USERNAME.
        tweet = re.sub(userPattern, '', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        # Remove all punctuations left
        tweet = tweet.translate(str.maketrans('','',string.punctuation))
        
        """remove html tags from text"""
        soup = BeautifulSoup(tweet,"html.parser")
        tweet = soup.get_text(separator=" ")
        
        """Remove accented characters from text"""
        tweet = unidecode.unidecode(tweet)

        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            if word not in stopwordlist:
                if len(word)>1:
                    # Lemmatizing the word.
                    word = wordLemm.lemmatize(word)
                    tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [12]:
def training_pipeline(x,y,model,vectorizer) :
    vect = vectorizer.fit_transform(x)
    model = training_model(model,vect,y)
    testing_model(model,vect,y)

In [13]:
def testing_pipeline(x,y,model,vectorizer) :
    pow = vectorizer.transform(x)
    testing_model(model,pow,y)

In [14]:
def predicting_pipeline(x,model,vectorizer) :
    pow = vectorizer.transform(x)
    pred = model.predict_proba(pow)
    pred1 = model.predict(pow)
    pred2 = model.predict_log_proba(pow)
    return [pred,pred1,pred2]

In [15]:
X_train, X_test = preprocessing_pipeline(X_train), preprocessing_pipeline(X_test)

In [16]:
models = [lr_model, mnb_model, bnb_model, dtc_model, svc_model, rfc_model, gbc_model, abc_model, xgb_model]
model_names = ['Logistic_Regression', 'Multinomial_NB', 'Bernoulli_NB', 'Decision_Tree', 'Linear_SVC', 'Random_Forest', 'Gradient_Boosting', 'AdaBoost', 'XGBoost']
vectorizers = [bow, tfidf]
vectorizer_names = ['bag_of_words','tfidf']

for v in range(len(vectorizers)) :
    for m in range(len(models) ):
        print("============================================")
        print(f"Model : {models[m]} with {vectorizers[v]}") 
        print("Training Accuracy : ")
        training_pipeline(X_train,y_train, models[m], vectorizers[v])
        print("Testing Accuracy : ")
        testing_pipeline(X_test,y_test,models[m],vectorizers[v])
        pickle.dump(models[m], open(f'{model_names[m]}_{vectorizer_names[v]}.pkl','wb'))
        print("=============================================")

Model : LogisticRegression() with CountVectorizer(min_df=5)
Training Accuracy : 
F1 Score : 0.8008510086628792
Acc. : 0.7963048323851258
Testing Accuracy : 
F1 Score : 0.7929080236276344
Acc. : 0.7871465149539526
Model : MultinomialNB() with CountVectorizer(min_df=5)
Training Accuracy : 
F1 Score : 0.7838583257866163
Acc. : 0.784867935012534
Testing Accuracy : 
F1 Score : 0.7755549586842146
Acc. : 0.7757491183592801
Model : BernoulliNB() with CountVectorizer(min_df=5)
Training Accuracy : 
F1 Score : 0.7874709726535235
Acc. : 0.7853833452253913
Testing Accuracy : 
F1 Score : 0.7802874998833451
Acc. : 0.7770119877880526
Model : DecisionTreeClassifier(max_depth=50) with CountVectorizer(min_df=5)
Training Accuracy : 
F1 Score : 0.7630004951517257
Acc. : 0.7291254917313363
Testing Accuracy : 
F1 Score : 0.7334429652400746
Acc. : 0.6944834706177642
Model : LinearSVC() with CountVectorizer(min_df=5)
Training Accuracy : 
F1 Score : 0.8059426044685167
Acc. : 0.801308178999678
Testing Accuracy :

In [17]:
training_pipeline(X_train, y_train, gbc_model, tfidf)

      Iter       Train Loss   Remaining Time 
         1           1.1561          345.40m
         2           1.0847          312.28m
         3           1.0388          293.53m
         4           1.0057          275.27m
         5           0.9812          257.80m
         6           0.9591          245.72m
         7           0.9397          235.72m
         8           0.9239          226.70m
         9           0.9095          219.97m
        10           0.8966          212.97m
        20           0.8119          166.11m
        30           0.7678          134.60m
        40           0.7346          110.03m
        50           0.7097           88.49m
        60           1.9327           68.90m
        70           1.9180           50.41m
        80           1.9055           32.95m
        90           1.8914           16.24m
       100           1.8824            0.00s
F1 Score : 0.8617954135611848
Acc. : 0.8590554338285902
