In [1]:
import numpy as np
import pandas as pd
import emoji
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes

# nltk.download('omw-1.4')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')

In [71]:
# Setup the data for emoji
df_emojis = pd.read_csv("Emoji_Sentiment_Data.csv", 
                       usecols = ['Emoji', 'Negative', 'Neutral', 'Positive'])
df_emojis.head(10)

Unnamed: 0,Emoji,Negative,Neutral,Positive
0,😂,3614,4163,6845
1,❤,355,1334,6361
2,♥,252,1942,4950
3,😍,329,1390,4640
4,😭,2412,1218,1896
5,😘,193,702,2753
6,😊,189,754,2243
7,👌,274,728,1923
8,💕,99,683,1618
9,👏,243,634,1459


In [72]:
df_emoticons = pd.read_csv("Emoji_Sentiment_Data.csv")

df_emoticons.head(40)

Unnamed: 0,Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons
5,😘,0x1f618,3648,0.85448,193,702,2753,FACE THROWING A KISS,Emoticons
6,😊,0x1f60a,3186,0.813302,189,754,2243,SMILING FACE WITH SMILING EYES,Emoticons
7,👌,0x1f44c,2925,0.805223,274,728,1923,OK HAND SIGN,Miscellaneous Symbols and Pictographs
8,💕,0x1f495,2400,0.765726,99,683,1618,TWO HEARTS,Miscellaneous Symbols and Pictographs
9,👏,0x1f44f,2336,0.78713,243,634,1459,CLAPPING HANDS SIGN,Miscellaneous Symbols and Pictographs


In [73]:
df_emojis.Emoji.values

array(['😂', '❤', '♥', '😍', '😭', '😘', '😊', '👌', '💕', '👏', '😁', '☺', '♡',
       '👍', '😩', '🙏', '✌', '😏', '😉', '🙌', '🙈', '💪', '😄', '😒', '💃', '💖',
       '😃', '😔', '😱', '🎉', '😜', '☯', '🌸', '💜', '💙', '✨', '😳', '💗', '★',
       '█', '☀', '😡', '😎', '😢', '💋', '😋', '🙊', '😴', '🎶', '💞', '😌', '🔥',
       '💯', '🔫', '💛', '💁', '💚', '♫', '😞', '😆', '😝', '😪', '�', '😫', '😅',
       '👊', '💀', '😀', '😚', '😻', '©', '👀', '💘', '🐓', '☕', '👋', '✋', '🎊',
       '🍕', '❄', '😥', '😕', '💥', '💔', '😤', '😈', '►', '✈', '🔝', '😰', '⚽',
       '😑', '👑', '😹', '👉', '🍃', '🎁', '😠', '🐧', '☆', '🍀', '🎈', '🎅', '😓',
       '😣', '😐', '✊', '😨', '😖', '💤', '💓', '👎', '💦', '✔', '😷', '⚡', '🙋',
       '🎄', '💩', '🎵', '➡', '😛', '😬', '👯', '💎', '🌿', '🎂', '🌟', '🔮', '❗',
       '👫', '🏆', '✖', '☝', '😙', '⛄', '👅', '♪', '🍂', '💏', '🔪', '🌴', '👈',
       '🌹', '🙆', '➜', '👻', '💰', '🍻', '🙅', '🌞', '🍁', '⭐', '▪', '🎀', '━',
       '☷', '🐷', '🙉', '🌺', '💅', '🐶', '🌚', '👽', '🎤', '👭', '🎧', '👆', '🍸',
       '🍷', '®', '🍉', '😇', '☑', '🏃', '😿', '│', '💣', '🍺', '▶', '😲

#### Set to Binary Polarity and Normalize to 0 and 1

In [74]:
# compare the polarity of the dataset and turn the polarity to binary
# 0 = negative, 1= positive
polarity_ls = []
for index, row in df_emojis.iterrows():
    
    # polarity == sentiment
    # initial polarity is negative
    polarity = 0 
    
    # positive if positive value is greater than negative value
    arg_1 = row['Positive'] > row['Negative']
    
    # positive if neutral value is odd and positive and negative value are equal
    arg_2 = row['Positive'] == row['Negative'] and row['Neutral'] % 2 != 0 
    
    # positive if either of the two arguments are true
    if arg_1 or arg_2:
        polarity = 1
    polarity_ls.append([polarity,row["Emoji"]])

df_emojis = pd.DataFrame(polarity_ls, columns=['sentiment','emoji'])

df_emojis.head(10)

Unnamed: 0,sentiment,emoji
0,1,😂
1,1,❤
2,1,♥
3,1,😍
4,0,😭
5,1,😘
6,1,😊
7,1,👌
8,1,💕
9,1,👏


In [75]:
df_emojis.head(40)

Unnamed: 0,sentiment,emoji
0,1,😂
1,1,❤
2,1,♥
3,1,😍
4,0,😭
5,1,😘
6,1,😊
7,1,👌
8,1,💕
9,1,👏


In [76]:
# # print out the emoticons and sentiment values
# e_c, p = 0, 0
# for index, row in df_emojis.iterrows():
#     print(f"{row['emoji']} = {row['sentiment']}")
#     p += 1 if row['sentiment'] else 0
#     e_c += 1

In [77]:
# print(f'Total Positive Emojis are ({p}:{e_c}) or {round(p / e_c * 100)}%')

#### Processing the inputs - Extraction of emoji and texts

In [78]:
# get emojis list only
get_emojis_1str = lambda word_list: [match["emoji"] for word in word_list for match in emoji.emoji_list(word)]

In [79]:
def extract_text_and_emoji(text):
    global allchars, emoji_list
    # remove all tagging and links, not need for sentiments
    remove_keys = ('@', 'http://', '&', '#')
    clean_text = ' '.join(txt for txt in text.split() if not txt.startswith(remove_keys))
#     print(clean_text)
    
    # setup the input, get the characters and the emoji lists
    allchars = [str for str in text]
    emoji_list = get_emojis_1str(text)
    
    # extract text
    clean_text = ' '.join([str for str in clean_text.split() if not any(i in str for i in emoji_list)])
    
    # extract emoji
    clean_emoji = ''.join([str for str in text.split() if any(i in str for i in emoji_list)])
    return (clean_text, clean_emoji)

In [80]:
def get_emoji_sentiment(emoji_ls , emoji_df):
    emoji_val_ls = []
    for e in emoji_ls:
        get_emo_senti = [row['sentiment'] for index, row in emoji_df.iterrows() if row['emoji'] == e]
        emoji_val_ls.append(get_emo_senti[0])
    return emoji_val_ls

In [81]:
df_org = pd.read_csv("1k_data_emoji_tweets_senti_posneg.csv",index_col=0)

df_org.loc[74:91]

Unnamed: 0,sentiment,post
74,0,so sorry I havent been checking my tweets 😧 We...
75,1,thanks for the recent follow. Much appreciated...
76,1,Good luck 😊
77,1,From soup and sandwiches to slow cooked pork
78,0,please answer request. 😧
79,0,Mine hasn't been sending stuff 😧
80,0,mannn
81,0,the new one just seems a bit unstable 😧
82,0,Oh well never liked Clay anyway 😧
83,1,It really is uplifting and inspiring


In [82]:
print(extract_text_and_emoji("One year ago today 😧"))

('One year ago today', '😧')


In [83]:
temp_df_emoji = []
temp_df_text = []

for text in df_org.post:
    temp_df_text.append(extract_text_and_emoji(text)[0])
    temp_df_emoji.append(extract_text_and_emoji(text)[1])
    
df_org["text"], df_org["emoji"] = temp_df_text, temp_df_emoji


In [84]:
df_org.loc[74:91]

Unnamed: 0,sentiment,post,text,emoji
74,0,so sorry I havent been checking my tweets 😧 We...,so sorry I havent been checking my tweets We a...,😧
75,1,thanks for the recent follow. Much appreciated...,thanks for the recent follow. Much appreciated...,😊
76,1,Good luck 😊,Good luck,😊
77,1,From soup and sandwiches to slow cooked pork,From soup and sandwiches to slow cooked pork,
78,0,please answer request. 😧,please answer request.,😧
79,0,Mine hasn't been sending stuff 😧,Mine hasn't been sending stuff,😧
80,0,mannn,mannn,
81,0,the new one just seems a bit unstable 😧,the new one just seems a bit unstable,😧
82,0,Oh well never liked Clay anyway 😧,Oh well never liked Clay anyway,😧
83,1,It really is uplifting and inspiring,It really is uplifting and inspiring,


In [85]:
import re

def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [86]:
df_org["cleaned_text"] = df_org.text.apply(remove_html_tags)

In [87]:
import string

NON_PUNCTUATION = r"[!\"#\$%&\(\)\*\+,-\./:;<=>\?@\[\\\]\^_`{\|}~0-9]"

def remove_punctuation(text):
    no_punctuation_text = re.sub(NON_PUNCTUATION, "", text, 0, re.MULTILINE)
    return no_punctuation_text

df_org["cleaned_text"] = df_org["cleaned_text"].apply(remove_punctuation)

In [88]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def do_lemmatization(review):
    lemma_new = ""
    word_list = TreebankWordTokenizer().tokenize(review)
    tags = nltk.pos_tag(word_list)
    for word, tag in tags:
        if tag in ["JJ","JJR","JJS","NN","NNS","NNP","NNPS","RB","RBR","RBS","UH","VB","VBD","VBG","VBN","VBP","VBZ"]:
            word = word.lower()
            lemma = lemmatizer.lemmatize(word)
            lemma_new = lemma_new + " " + lemma
    return lemma_new

df_org["cleaned_text"] = df_org["cleaned_text"].apply(do_lemmatization)

In [89]:
stop_words = stopwords.words('english')

temp_stop_words = ["not","no",'do','very','don',"don't","ain","aren","aren't","couldn","couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]
for sword in temp_stop_words:
    if sword in stop_words:
        stop_words.remove(sword)

In [90]:
# TFIDF vectorizer

vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,
                            strip_accents='ascii', stop_words=stop_words)

In [91]:
# dependent variable will be linked as:
# 0 = negative, 1 = positive
y = df_org.sentiment
# convert 'sentence' from text to features
X = vectorizer.fit_transform(df_org.cleaned_text)

print(y.shape)
print(X.shape)
print(f'{X.shape[0]} observations X {X.shape[1]} unique words')

(1000,)
(1000, 1929)
1000 observations X 1929 unique words


In [92]:
# Test Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=None)

# we will train a naive bayes classifier
clf = naive_bayes.MultinomialNB()
# clf = naive_bayes.BernoulliNB()

clf.fit(X_train, y_train)

# test our models accuracy
clf.score(X_test,y_test)

0.64

In [93]:
from sklearn.svm import SVC

model_svm = SVC()

model_svm.fit(X_train, y_train)

model_svm.score(X_test, y_test)

0.68

In [94]:
def get_sentiment(s_input,model):
    # turn input into array
    input_array= np.array([s_input])
    # vectorize the input
    input_vector = vectorizer.transform(input_array)
    # predict the score of vector
    pred_senti = model.predict(input_vector)

    return pred_senti[0]

In [95]:
def get_text_emoji_sentiment(input_test):
    # separate text and emoji
    (ext_text, ext_emoji) = extract_text_and_emoji(input_test)
#     print(f'\tExtracted: "{ext_text}" , {ext_emoji}')

    # get text sentiment
    senti_text = get_sentiment(ext_text,clf)
#     print(f'\tText value: {senti_text}')

    # get emoji sentiment
    senti_emoji_value = sum(get_emoji_sentiment(ext_emoji, df_emojis))
    print_emo_val_avg = 0 if len(ext_emoji) == 0 else senti_emoji_value/len(ext_emoji)
#     print(f'\tEmoji average value: {print_emo_val_avg}')

    # avg the sentiment of emojis and text
    senti_avg = (senti_emoji_value + senti_text) / (len(ext_emoji) + 1)
#     print(f'\tAverage value: {senti_avg}')

    # set value of avg sentiment to either pos or neg 
#     senti_truth = "Positive" if senti_avg >= 0.5 else "Negative"
    
#     return senti_truth
    return senti_avg

In [96]:
y_pred = []

for tweets in df_org.post:
    y_pred.append(get_text_emoji_sentiment(tweets))
    
y_pred

[0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.5,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.5,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.5,
 0.5,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.0,
 0.3333333333333333,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.5,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.5,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.5,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.5,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.5,
 1.0,
 1.0,
 0.5,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.5,
 1.0,
 0.5,
 0.0,
 1.0,
 0.5,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 

In [97]:
df_org["prediction"] = y_pred

In [98]:
df_org.head(20)

Unnamed: 0,sentiment,post,text,emoji,cleaned_text,prediction
0,0,One year ago today 😧 .1,One year ago today .1,😧,year ago today,0.0
1,1,keep smiling happy.1,keep smiling happy.1,,keep smiling happy,1.0
2,0,It's hard to imagine anyone but Robin 😧 but st...,It's hard to imagine anyone but Robin but stil...,😧,'s hard imagine anyone robin still exciting n...,0.0
3,1,Good luck to Rich riding for great project in ...,Good luck to Rich riding for great project in ...,,good luck rich riding great project sunday do...,1.0
4,1,He didn't play for a year,He didn't play for a year,,did n't play year,1.0
5,1,.NOTHING CHANGED BUT THE DATE .. APRIL 28 MAKE S,.NOTHING CHANGED BUT THE DATE .. APRIL 28 MAKE S,,nothing changed but the date april make s,1.0
6,0,they would have sustained their fame if they d...,they would have sustained their fame if they did,😧,have sustained fame did,0.0
7,1,Who wants some good vibes? Watch this on YouTu...,Who wants some good vibes? Watch this on YouTu...,,want good vibe watch youtube tv application n...,1.0
8,1,Seokjinie went to Everland today~ I guess he f...,Seokjinie went to Everland today~ I guess he f...,,seokjinie went everland today guess forgot 's...,1.0
9,0,BES RIP,BES RIP,,be rip,0.0
