In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read only the emoji glyph column and its Negative / Neutral / Positive columns.
emoji_columns = ['Emoji', 'Negative', 'Neutral', 'Positive']
df_emoji = pd.read_csv("dataset/Emoji_Sentiment_Data.csv", usecols=emoji_columns)
df_emoji

Unnamed: 0,Emoji,Negative,Neutral,Positive
0,😂,3614,4163,6845
1,❤,355,1334,6361
2,♥,252,1942,4950
3,😍,329,1390,4640
4,😭,2412,1218,1896
...,...,...,...,...
964,➛,0,1,0
965,♝,0,1,0
966,❋,0,1,0
967,✆,0,1,0


In [3]:
# Label an emoji 1 when its Positive value exceeds its Negative value, or when
# the two are equal and the Neutral value is odd; every other emoji gets 0.
positive_wins = df_emoji['Positive'] > df_emoji['Negative']
odd_neutral_tie = (df_emoji['Positive'] == df_emoji['Negative']) & (df_emoji['Neutral'] % 2 != 0)
polarity_ls = [1 if is_positive else 0 for is_positive in (positive_wins | odd_neutral_tie)]

# One row per emoji, in the same order as df_emoji: the label, then the glyph.
new_df_emoji = pd.DataFrame(polarity_ls, columns=['sentiment'])
new_df_emoji['emoji'] = df_emoji['Emoji'].values
new_df_emoji

Unnamed: 0,sentiment,emoji
0,1,😂
1,1,❤
2,1,♥
3,1,😍
4,0,😭
...,...,...
964,1,➛
965,1,♝
966,1,❋
967,1,✆


In [4]:
# Load the processed tweets and discard the first column of the CSV,
# leaving the remaining columns untouched.
posts_raw = pd.read_csv("dataset/processed_tweet_dataset.csv")
df_posts = posts_raw.drop(columns=[posts_raw.columns[0]])
df_posts

Unnamed: 0,sentiment,post
0,0,"- Awww, that's a bummer. You shoulda got David..."
1,0,Picked Mich St to win it all from the get go. ...
2,0,throat is closing up and i had some string che...
3,0,"If he doesn't get better in a few days, he cou..."
4,0,I'm sure everyone has ruined my gift to you Wh...
...,...,...
9995,1,- i know now what is that haha X)
9996,1,- had a great time with some of the best peopl...
9997,1,"Tyreseee, when you're heading to The Netherlan..."
9998,1,"don't know what you could possibly mean, dear ..."


## Classification Using Naive Bayes

In [5]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [6]:
# English stop words from NLTK, collected into a set.
stopset = set(stopwords.words('english'))
# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list, or None; a set is rejected when the vectorizer is fitted.
# Pass a list (sorted so the argument is deterministic) built from the set.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

In [7]:
# Print each emoji next to its sentiment label while tallying:
# e_c counts every emoji, p counts those whose label is truthy (positive).
e_c = 0
p = 0
for row in new_df_emoji.to_dict('records'):
    print(f"{row['emoji']} = {row['sentiment']}")
    if row['sentiment']:
        p += 1
    e_c += 1

😂 = 1
❤ = 1
♥ = 1
😍 = 1
😭 = 0
😘 = 1
😊 = 1
👌 = 1
💕 = 1
👏 = 1
😁 = 1
☺ = 1
♡ = 1
👍 = 1
😩 = 0
🙏 = 1
✌ = 1
😏 = 1
😉 = 1
🙌 = 1
🙈 = 1
💪 = 1
😄 = 1
😒 = 0
💃 = 1
💖 = 1
😃 = 1
😔 = 0
😱 = 1
🎉 = 1
😜 = 1
☯ = 1
🌸 = 1
💜 = 1
💙 = 1
✨ = 1
😳 = 1
💗 = 1
★ = 1
█ = 0
☀ = 1
😡 = 0
😎 = 1
😢 = 1
💋 = 1
😋 = 1
🙊 = 1
😴 = 0
🎶 = 1
💞 = 1
😌 = 1
🔥 = 1
💯 = 1
🔫 = 0
💛 = 1
💁 = 1
💚 = 1
♫ = 1
😞 = 0
😆 = 1
😝 = 1
😪 = 0
� = 1
😫 = 0
😅 = 1
👊 = 1
💀 = 0
😀 = 1
😚 = 1
😻 = 1
© = 1
👀 = 1
💘 = 1
🐓 = 1
☕ = 1
👋 = 1
✋ = 1
🎊 = 1
🍕 = 1
❄ = 1
😥 = 1
😕 = 0
💥 = 1
💔 = 0
😤 = 0
😈 = 1
► = 1
✈ = 1
🔝 = 1
😰 = 0
⚽ = 1
😑 = 0
👑 = 1
😹 = 1
👉 = 1
🍃 = 1
🎁 = 1
😠 = 0
🐧 = 1
☆ = 1
🍀 = 1
🎈 = 1
🎅 = 1
😓 = 0
😣 = 0
😐 = 0
✊ = 1
😨 = 0
😖 = 0
💤 = 1
💓 = 1
👎 = 0
💦 = 1
✔ = 1
😷 = 0
⚡ = 1
🙋 = 1
🎄 = 1
💩 = 0
🎵 = 1
➡ = 1
😛 = 1
😬 = 1
👯 = 1
💎 = 1
🌿 = 1
🎂 = 1
🌟 = 1
🔮 = 1
❗ = 1
👫 = 1
🏆 = 1
✖ = 1
☝ = 1
😙 = 1
⛄ = 1
👅 = 1
♪ = 1
🍂 = 1
💏 = 1
🔪 = 1
🌴 = 1
👈 = 1
🌹 = 1
🙆 = 1
➜ = 1
👻 = 1
💰 = 1
🍻 = 1
🙅 = 0
🌞 = 1
🍁 = 1
⭐ = 1
▪ = 1
🎀 = 1
━ = 1
☷ = 1
🐷 = 1
🙉 = 1
🌺 = 1
💅 = 1
🐶 = 1
🌚 = 1
👽 = 1
🎤 = 1
👭 = 1
🎧 = 

In [8]:
# Summarise the tally from the loop above: p positive-labelled emojis out of
# e_c emojis in total, followed by that share as a rounded percentage.
print(f'Total Positive Emojis are ({p}:{e_c}) or {round(p / e_c * 100)}%')

Total Positive Emojis are (795:969) or 82%


In [9]:
# Second name for the posts frame. This is an alias, not a copy: both names
# refer to the same DataFrame object.
new_df_post = df_posts

In [10]:
# Labels come from the `sentiment` column; features are the TF-IDF matrix
# obtained by fitting the vectorizer on the `post` column.
y = new_df_post['sentiment']
X = vectorizer.fit_transform(new_df_post['post'])

# Show both shapes, then the same numbers as rows x vocabulary size.
for shape in (y.shape, X.shape):
    print(shape)
print(f'{X.shape[0]} observations X {X.shape[1]} unique words')

(10000,)
(10000, 13339)
10000 observations X 13339 unique words


In [11]:
# Hold out 25% of the rows for evaluation. The seed is fixed so that the
# split, the fitted model and the score below are the same on every
# Restart & Run All (random_state=None produced a different split each run).
# NOTE(review): the TF-IDF vectorizer was fitted on all posts before this
# split, so its vocabulary and IDF weights have already seen the test rows.
# Consider fitting it on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Multinomial Naive Bayes trained on the TF-IDF features.
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

# ROC AUC on the held-out rows, scored with the predicted probability in
# column 1 of predict_proba (the second entry of clf.classes_).
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.8064455120081719

## Processing Input: Extracting Emoji and Text

In [12]:
import emoji
# Sample post used as the default argument of extract_text_and_emoji().
text = "#samplesenti @emojitweets i ❤❤❤ sentiment &quot; analysis &quot; http://senti.com/pic_01.jpg "
def extract_text_and_emoji(text = text):
    """Split a raw post into its plain words and its emoji characters.

    Whitespace-separated tokens that start with '@', '#', '&', 'http://' or
    'https://' are dropped from the text. Every character of the input that is
    a key of ``emoji.EMOJI_DATA`` is treated as an emoji; the check is made one
    character at a time, so a multi-code-point emoji is seen as its separate
    characters.

    A token that mixes letters and emoji (e.g. ``"love❤"``) contributes its
    letters to the text and its emoji to the emoji string. Previously such a
    token was dropped from the text and returned whole inside the emoji
    string, which handed non-emoji characters to the emoji lookup.

    Args:
        text: The raw post. The default is the sample string bound to the
            module-level ``text`` at the time this function was defined.

    Returns:
        A ``(clean_text, clean_emoji)`` tuple: the kept words joined by single
        spaces, and the emoji characters concatenated in order of appearance.

    Side effects:
        Rebinds the module-level names ``allchars`` (list of every character
        of the input) and ``emoji_list`` (list of the emoji characters found).
    """
    global allchars, emoji_list
    remove_keys = ('@', 'http://', 'https://', '&', '#')
    # Variation selector-16 and zero-width joiner: invisible characters that
    # accompany emoji and would otherwise be left behind as a "word".
    joiners = ('\ufe0f', '\u200d')

    allchars = list(text)
    emoji_list = [char for char in allchars if char in emoji.EMOJI_DATA]
    emoji_chars = set(emoji_list)

    kept_words = []
    for token in text.split():
        if token.startswith(remove_keys):
            continue
        # Only tokens that actually contain an emoji are rewritten, so plain
        # words are passed through unchanged.
        if any(char in emoji_chars for char in token):
            token = ''.join(
                char for char in token
                if char not in emoji_chars and char not in joiners
            )
        if token:
            kept_words.append(token)

    clean_text = ' '.join(kept_words)
    # Emoji characters only, never the surrounding token text.
    clean_emoji = ''.join(emoji_list)
    return (clean_text, clean_emoji)

# Create the module-level names that extract_text_and_emoji() rebinds, run it
# on its default sample, then show the intermediate lists and both results.
allchars = emoji_list = 0
ct, ce = extract_text_and_emoji()
for label, value in (
    ('\nAll Char:', allchars),
    ('\nAll Emoji:', emoji_list),
    ('\n', ct),
    ('\n', ce),
):
    print(label, value)


All Char: ['#', 's', 'a', 'm', 'p', 'l', 'e', 's', 'e', 'n', 't', 'i', ' ', '@', 'e', 'm', 'o', 'j', 'i', 't', 'w', 'e', 'e', 't', 's', ' ', 'i', ' ', '❤', '❤', '❤', ' ', 's', 'e', 'n', 't', 'i', 'm', 'e', 'n', 't', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'a', 'n', 'a', 'l', 'y', 's', 'i', 's', ' ', '&', 'q', 'u', 'o', 't', ';', ' ', 'h', 't', 't', 'p', ':', '/', '/', 's', 'e', 'n', 't', 'i', '.', 'c', 'o', 'm', '/', 'p', 'i', 'c', '_', '0', '1', '.', 'j', 'p', 'g', ' ']

All Emoji: ['❤', '❤', '❤']

 i sentiment analysis

 ❤❤❤


In [21]:
def get_sentiment(s_input = 'i sentiment analysis'):
    """Predict the sentiment label of a single piece of text.

    The text is wrapped in a one-element array, converted to features by the
    module-level ``vectorizer`` and classified by the module-level ``clf``.

    Args:
        s_input: The text to classify.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    features = vectorizer.transform(np.array([s_input]))
    return clf.predict(features)[0]
print(get_sentiment())

0


In [22]:
def get_emoji_sentiment(emoji_ls = '❤❤❤', emoji_df = new_df_emoji, unknown = 0.5):
    """Look up the sentiment label of every character in ``emoji_ls``.

    Args:
        emoji_ls: Iterable of single emoji characters (a string works).
        emoji_df: Frame with an ``emoji`` column and a ``sentiment`` column.
            Defaults to the ``new_df_emoji`` frame that existed when this
            function was defined.
        unknown: Value recorded for a character that has no row in
            ``emoji_df``. The default, 0.5, is the midpoint of the 0/1 labels,
            so an unknown character does not pull an average towards either
            class. Previously such a character raised ``IndexError``.

    Returns:
        A list with one entry per character of ``emoji_ls``, in order: the
        ``sentiment`` of the first matching row, or ``unknown`` if none match.
    """
    # Build the glyph -> label table once instead of rescanning the frame for
    # every character. setdefault keeps the first row when a glyph repeats,
    # matching the original "first match" behaviour.
    label_by_emoji = {}
    for glyph, label in zip(emoji_df['emoji'], emoji_df['sentiment']):
        label_by_emoji.setdefault(glyph, label)

    return [label_by_emoji.get(e, unknown) for e in emoji_ls]

ges = get_emoji_sentiment()
print('Sentiment value of each emoji:',ges)

Sentiment value of each emoji: [1, 1, 1]


In [23]:
def get_text_emoji_sentiment(input_test = 'i ❤❤❤ sentiment analysis'):
    """Classify a post as "Positive" or "Negative" from its text and emojis.

    The post is split with ``extract_text_and_emoji``. The text part gets one
    label from ``get_sentiment`` and every emoji character gets one value from
    ``get_emoji_sentiment``. All of these are averaged with equal weight, so
    the text counts as much as a single emoji. Intermediate values are printed
    on tab-indented lines.

    Args:
        input_test: The raw post.

    Returns:
        "Positive" when the overall average is at least 0.5, else "Negative".
    """
    ext_text, ext_emoji = extract_text_and_emoji(input_test)
    print(f'\tExtracted: "{ext_text}" , {ext_emoji}')

    senti_text = get_sentiment(ext_text)
    print(f'\tText value: {senti_text}')

    emoji_count = len(ext_emoji)
    senti_emoji_value = sum(get_emoji_sentiment(ext_emoji, new_df_emoji))
    # With no emojis there is nothing to average; report 0 instead of dividing.
    if emoji_count == 0:
        print_emo_val_avg = 0
    else:
        print_emo_val_avg = senti_emoji_value / emoji_count
    print(f'\tEmoji average value: {print_emo_val_avg}')

    # The "+ 1" in the denominator accounts for the single text label.
    senti_avg = (senti_emoji_value + senti_text) / (emoji_count + 1)
    print(f'\tAverage value: {senti_avg}')

    if senti_avg >= 0.5:
        return "Positive"
    return "Negative"

print(get_text_emoji_sentiment())

	Extracted: "i sentiment analysis" , ❤❤❤
	Text value: 0
	Emoji average value: 1.0
	Average value: 0.75
Positive


In [24]:
def print_senti_status(test):
    """Print a framed report of the sentiment predicted for ``test``.

    Echoes the input between two separator lines, runs
    ``get_text_emoji_sentiment`` on it (which prints its own intermediate
    values) and prints the verdict line converted to upper case.

    Args:
        test: The raw post to analyse.
    """
    rule = '========================================'
    print(rule)
    print(f'Your input is "{test}" \n')
    sentiment = get_text_emoji_sentiment(test)
    # .upper() is applied to the whole line, verdict included.
    print(f'\nYour input is of "{sentiment}" sentiment'.upper())
    print(rule)

In [25]:
# Demo: text paired with three emojis that the emoji table labels 0 (negative).
print_senti_status("I hate dancing 😩😖😨")

Your input is "I hate dancing 😩😖😨" 

	Extracted: "I hate dancing" , 😩😖😨
	Text value: 0
	Emoji average value: 0.0
	Average value: 0.0

YOUR INPUT IS OF "NEGATIVE" SENTIMENT


In [28]:
# Demo: the same text with three emojis labelled 1 (positive). Each emoji
# weighs as much as the text, so the three emoji values outvote the text label.
print_senti_status("I hate dancing 😄😄😄")

Your input is "I hate dancing 😄😄😄" 

	Extracted: "I hate dancing" , 😄😄😄
	Text value: 0
	Emoji average value: 1.0
	Average value: 0.75

YOUR INPUT IS OF "POSITIVE" SENTIMENT


In [None]:
#😊😛😄😐😢😲😘😍😧😉😁😒😀😔😧😆😭