<a href="https://colab.research.google.com/github/santosh50/Text_to_Emoji/blob/main/sentence_to_emoji_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive; the dataset CSVs and GloVe file read later live under this mount point
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the emoji mapping table and the train / test tweet tables from the shared Drive folder
DATASET_DIR = "/content/drive/My Drive/NLP_Project_Assignments_2021/20_Expressionismum/Dataset"
mapping, train, test = (
    pd.read_csv(f"{DATASET_DIR}/{csv_name}")
    for csv_name in ("Mapping.csv", "Train_balanced.csv", "Test.csv")
)

In [None]:
# Build the lookup {label index: emoji}; an emoji's label is its row position in the mapping table
mapping = mapping.drop(columns=['Unnamed: 0'])
emoticons = mapping['emoticons'].tolist()
mapp = dict(enumerate(emoticons))

In [None]:
# Display the label -> emoji dictionary to confirm all labels are present
mapp

{0: '😜',
 1: '📸',
 2: '😍',
 3: '😂',
 4: '😉',
 5: '🎄',
 6: '📷',
 7: '🔥',
 8: '😘',
 9: '❤',
 10: '😁',
 11: '🇺🇸',
 12: '☀',
 13: '✨',
 14: '💙',
 15: '💕',
 16: '😎',
 17: '😊',
 18: '💜',
 19: '💯'}

In [None]:
# Draw a fixed-seed random subset of 1000 tweets (without replacement) from the training table
from sklearn.utils import resample
train = resample(
    train.drop(columns=['Unnamed: 0']),
    replace=False,
    n_samples=1000,
    random_state=123,
)
train.head()

Unnamed: 0,text,label
43073,Ladies night.... #funtimes @ MKT BAR\n,8
34567,Happiness depends upon ourselves #california #...,11
25967,#SWEETCHICKBLOCKPARTY #WILLY.B #BLEUMARLI #PAP...,1
8850,Date night at Mo's w my baby. @ Moe's Place\n,2
26616,Thanks for the best day fam besides the part w...,14


In [None]:
# Pull the tweet texts and their integer emoji labels out of the sampled frame as arrays
x_train, y_train = train['text'].values, train['label'].values
x_train.shape

(1000,)

# **Embedding**

In [None]:
# Build a word -> 50-dimensional GloVe vector lookup from the pre-trained embeddings file
embedding_index = {}

# The context manager guarantees the file handle is closed once parsing finishes
with open("/content/drive/My Drive/NLP/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        # Each line holds a token followed by its whitespace-separated vector components
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        emb = np.array(values[1:], dtype='float')
        embedding_index[word] = emb

# Sanity check: every stored vector should have 50 components
embedding_index['america'].shape

(50,)

In [None]:
# Function that returns embeddings for input text
def get_embedding_output(X, maxLen=20, embeddings=None, emb_dim=50):
    """Convert a batch of texts into a zero-padded tensor of word embeddings.

    Each text is split on whitespace and lower-cased token by token. The token at
    position ``j`` is written to row ``j`` of that text's matrix. Tokens missing from
    the embedding lookup leave their row as zeros, and tokens at positions
    ``>= maxLen`` are dropped.

    Parameters
    ----------
    X : sequence of str
        The texts to embed (a NumPy array or a plain list).
    maxLen : int, default 20
        Number of token positions kept per text.
    embeddings : mapping of str -> array-like, optional
        Word-vector lookup. Defaults to the notebook-level ``embedding_index``.
    emb_dim : int, default 50
        Length of each word vector in ``embeddings``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), maxLen, emb_dim)``.
    """
    if embeddings is None:
        # Fall back to the GloVe dictionary built earlier in the notebook
        embeddings = embedding_index

    embedding_output = np.zeros((len(X), maxLen, emb_dim))

    for ix, text in enumerate(X):
        # Slicing up front avoids looping over tokens that could never be stored
        for ij, token in enumerate(text.split()[:maxLen]):
            vector = embeddings.get(token.lower())
            if vector is not None:
                embedding_output[ix, ij] = vector

    return embedding_output

# Embedding train data: each tweet in x_train becomes a (20, 50) matrix of word vectors
x_train_embed = get_embedding_output(x_train)
x_train_embed.shape

(1000, 20, 50)

In [None]:
# One-hot encode the integer labels for multiclass classification.
# - Encoding straight from train['label'] keeps this cell safe to re-run
#   (y_train is never one-hot encoded twice).
# - num_classes is pinned to the number of emojis in `mapp`, so the encoding always has
#   one column per class even if the highest label is absent from the sampled tweets;
#   otherwise the width would be inferred from the largest label actually present.
from keras.utils import to_categorical
y_train = to_categorical(train['label'].values, num_classes=len(mapp))
y_train[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

# **SMOTE**

In [None]:
#SMOTE - Synthetic Minority Oversampling Technique
from imblearn.over_sampling import SMOTE
# Check the embedded training tensor's shape before it is reshaped for oversampling
x_train_embed.shape

(1000, 20, 50)

In [None]:
# The oversampler expects 2-D (samples, features) input, so flatten each tweet's
# embedding matrix into one feature row. The row count is taken from the data itself
# rather than hard-coding the flattened width, so the sample axis is preserved whatever
# the sequence length or embedding size is, and re-running the cell is harmless.
x_train_embed = x_train_embed.reshape(len(x_train_embed), -1)
x_train_embed.shape

(1000, 1000)

In [None]:
# Oversample the minority classes with synthetic examples.
# A fixed random_state makes the synthetic samples (and every downstream metric) reproducible.
# NOTE(review): oversampling happens before the train/test split further down, so synthetic
# points interpolated from held-out rows can end up in the training set and inflate the
# reported accuracy - consider splitting first and resampling only the training portion.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(x_train_embed, y_train)

In [None]:
# Restore the (samples, 20, 50) layout after oversampling; the sample count is now larger
X = np.reshape(X, (-1, 20, 50))
X.shape, y.shape

((1440, 20, 50), (1440, 20))

# **Split**

In [None]:
# Hold out 20% of the oversampled data for evaluation (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
x1, x2, y1, y2 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the training portion of the split (features, one-hot labels)
x1.shape, y1.shape

((1152, 20, 50), (1152, 20))

# **LSTM**

In [None]:
from keras.models import Sequential 
from keras.layers import LSTM, Dense, Dropout, Bidirectional

In [None]:
# Two stacked bidirectional LSTMs with dropout, ending in a 20-way softmax over the emoji labels
model = Sequential([
    # First recurrent layer emits the full sequence so the second LSTM can consume it
    Bidirectional(LSTM(units=512, return_sequences=True), input_shape=(20, 50)),
    Dropout(0.4),
    # Second recurrent layer returns only its final state
    Bidirectional(LSTM(units=256)),
    Dropout(0.4),
    Dense(units=20, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 20, 1024)          2306048   
_________________________________________________________________
dropout (Dropout)            (None, 20, 1024)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               2623488   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 20)                10260     
Total params: 4,939,796
Trainable params: 4,939,796
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output; accuracy is tracked as the metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics =['accuracy'])

In [None]:
# Train for 25 epochs in shuffled mini-batches of 32, with 20% of x1/y1 set aside for validation
fit_options = dict(validation_split=0.2, shuffle=True, batch_size=32, epochs=25)
hist = model.fit(x1, y1, **fit_options)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Evaluating model on the held-out split (x2, y2) of the oversampled data; returns [loss, accuracy]
model.evaluate(x2, y2)



[3.5122342109680176, 0.4583333432674408]

# **Testing**

In [None]:
# Raw tweet texts from the separate test file (its text column is named 'TEXT')
x_test = test['TEXT'].values
x_test.shape

(25958,)

In [None]:
# Feeding some input tweets to the model and printing the two most probable emojis for each
sample_tweets = x_test[10:20]

# One batched forward pass instead of a separate predict() call per tweet
sample_probs = model.predict(get_embedding_output(sample_tweets))

for tweet, probs in zip(sample_tweets, sample_probs):
    # Rank labels by descending probability without mutating the prediction array.
    # A stable sort on the negated scores breaks ties in favour of the lower label index.
    l1, l2 = np.argsort(-probs, kind='stable')[:2]
    print(tweet, mapp[int(l1)], mapp[int(l2)])

La La Land @ Griffith Park Observatory-Los Angeles ,CA
 🇺🇸 📸
Friends. #Seattle @ Seattle, Washington
 😂 💙
#GETIT #GOTIT #GOOD #WHATEVERIWANT #BOW @ Oakland, California
 💜 💕
Hanging with my bestie for the day ️#mtlove #hyalite #daysoff #fishing #endofsummer…
 😊 ✨
Stoked to be sitting in with @user at the @user tonight!! Therese folks are @ Great…
 ✨ 😍
S/O Luxe Doll @user Her YouTube review on her channel for @user will be uploaded…
 😘 ❤
We’re proud to be Americans #sas2017 #tpusa @ West Palm Beach,…
 🇺🇸 😍
- This was gonna be my lunch for tommorow....BUT, it never made it to the refrigerator , 12…
 🔥 😁
Winter or summer, Chicago is pretty any time of the year... #mycity #chicago #mychicagopix…
 😘 💙
Shoutout to Delores and Herbert for the drinks, bed, and good times @user @ Ottawa, Illinois
 💯 😉
