<a href="https://colab.research.google.com/github/paresh-nayyar/CSCI-544_NLP/blob/master/Embedding_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook trains fastText and word2vec word embeddings on our tweet dataset.

In [1]:
import json
import os
import re
import string

import pandas as pd
import requests
from gensim.models import FastText

In [2]:
#Reading the data
# The access token that used to be embedded in this URL was a committed
# credential, so it has been removed. If the repository requires one, export it
# as the GITHUB_RAW_TOKEN environment variable before running this cell.
url = "https://raw.githubusercontent.com/paresh-nayyar/CSCI-544_NLP/master/data/tweet_data.json"
token = os.environ.get("GITHUB_RAW_TOKEN")
response = requests.get(url, params={"token": token} if token else None, timeout=60)
# Surface HTTP failures (expired/missing token, 404, ...) as a clear error here
# rather than as a confusing JSON decode error on the error page's body.
response.raise_for_status()
text = response.text
data = json.loads(text)
# `data` is iterated with .items(), i.e. it is expected to be a JSON object
# whose keys become the tweetID column and whose values become the tweet column.
df = pd.DataFrame(list(data.items()), columns=['tweetID', 'tweet'])
print(df.head(5))
print(df.shape)

     tweetID                                              tweet
0      46149  DON ko11mulko ki police sMs kar rahi hai par D...
1   40317002  khushi ki talaash me sabhi haiyn. kya maiyn au...
2   40319032  Are dostoin aao aur apne thoughts share karo. ...
3  308624342  Na tum aa sako gay na phir baat hogi, Yeh kais...
4  596311632  meethe gud me mil gaye til:)\nudi patang aur k...
(134693, 2)


In [3]:
#Basic Preprocessing
def cleanSentence(sentence):
    """Normalise a raw tweet into lowercase, punctuation-free, single-spaced text.

    The text is lowercased, every character in ``string.punctuation`` (ASCII
    punctuation only) is deleted, and finally every run of whitespace
    (spaces, tabs, newlines, carriage returns) is collapsed into one space and
    leading/trailing whitespace is trimmed.

    Punctuation is removed *before* whitespace is collapsed: deleting a
    stand-alone punctuation mark such as the comma in ``"a , b"`` would
    otherwise leave a double space behind in the result.

    Args:
        sentence: The raw tweet text (a ``str``).

    Returns:
        The cleaned text. Non-ASCII symbols such as emoji are left untouched.
    """
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # \s+ also covers tabs and newlines, so no separate "\n"/"\r" replacement is needed.
    return re.sub(r'\s+', ' ', sentence).strip()

# Clean every tweet in place; the function is passed directly instead of
# being wrapped in a lambda.
df["tweet"] = df["tweet"].apply(cleanSentence)
df.head()

Unnamed: 0,tweetID,tweet
0,46149,don ko11mulko ki police sms kar rahi hai par d...
1,40317002,khushi ki talaash me sabhi haiyn kya maiyn aur...
2,40319032,are dostoin aao aur apne thoughts share karo l...
3,308624342,na tum aa sako gay na phir baat hogi yeh kaisi...
4,596311632,meethe gud me mil gaye til udi patang aur khil...


In [4]:
#tokenizing
def tokenize(sentence):
    """Split ``sentence`` on single space characters and drop empty pieces.

    Only the space character is treated as a separator, so other whitespace
    (for example a tab) stays inside the token it appears in.

    Args:
        sentence: Text to split (a ``str``).

    Returns:
        A list of the non-empty, space-separated fragments in original order.
    """
    return [piece for piece in sentence.split(" ") if piece]

# One token list per cleaned tweet, stored alongside the text.
df["tokens"] = df["tweet"].apply(tokenize)
df.head()

Unnamed: 0,tweetID,tweet,tokens
0,46149,don ko11mulko ki police sms kar rahi hai par d...,"[don, ko11mulko, ki, police, sms, kar, rahi, h..."
1,40317002,khushi ki talaash me sabhi haiyn kya maiyn aur...,"[khushi, ki, talaash, me, sabhi, haiyn, kya, m..."
2,40319032,are dostoin aao aur apne thoughts share karo l...,"[are, dostoin, aao, aur, apne, thoughts, share..."
3,308624342,na tum aa sako gay na phir baat hogi yeh kaisi...,"[na, tum, aa, sako, gay, na, phir, baat, hogi,..."
4,596311632,meethe gud me mil gaye til udi patang aur khil...,"[meethe, gud, me, mil, gaye, til, udi, patang,..."


In [5]:
# Training corpus for FastText: a plain Python list holding one token list per tweet.
FastTextData = list(df["tokens"])
print(FastTextData[0:1])

[['don', 'ko11mulko', 'ki', 'police', 'sms', 'kar', 'rahi', 'hai', 'par', 'don', 'ko', 'sms', 'karna', 'mushkil', 'hi', 'nahi', 'namumkin', 'hai', 'isliye', 'don', 'khud', 'sms', 'kar', 'raha', 'hai', 'happy', 'diwali', 'don']]


In [6]:
#FastText training

model = FastText(
    vector_size=300,  # dimensionality of each learned word vector
    window=10,        # context window size
    hs=0,             # no hierarchical softmax ...
    negative=1,       # ... negative sampling with a single noise word instead
    min_count=1,      # keep every token, however rare
)
model.build_vocab(corpus_iterable=FastTextData)
# build_vocab() records the number of sentences it scanned in model.corpus_count,
# which is the same value as len(FastTextData).
model.train(corpus_iterable=FastTextData, total_examples=model.corpus_count, epochs=10)

(20543621, 23060530)

In [7]:
# model.save() writes to the given path but does not create missing parent
# directories, so make sure the target folder exists first; otherwise this cell
# fails on a fresh kernel / fresh Colab runtime.
os.makedirs("fasttext_model", exist_ok=True)
model.save("fasttext_model/fasttext_model.model")
print("Model saved!!!")

Model saved!!!


In [8]:
# Sanity check: print the vector shape for one Hindi and one English word.
for word in ('gussa', 'angry'):
    print(model.wv[word].shape)

(300,)
(300,)


In [9]:
# Cosine similarity between the vectors of 'gussa' and 'angry'.
model.wv.similarity(w1='gussa', w2='angry')

0.24080479

In [10]:
# Cosine similarity between the vectors of 'gussa' and 'happy'.
model.wv.similarity(w1='gussa', w2='happy')

-0.05222138

In [11]:
# The 50 vocabulary entries whose vectors are closest to 'happy'.
# NOTE(review): in the saved output the top neighbours are almost all 'happy'
# fused with emoji, digits or other text (e.g. 'happy♥', 'happy2020'). This
# looks like a side effect of cleanSentence only stripping ASCII punctuation
# combined with min_count=1 -- consider stricter cleaning; confirm before
# relying on these neighbours.
model.wv.most_similar(positive=['happy'], topn = 50)

[('happy♥', 0.9964889883995056),
 ('happyd', 0.9962071776390076),
 ('happy😁😁', 0.9961686730384827),
 ('happy😂🎉💃', 0.9960911273956299),
 ('happyok03', 0.9959768652915955),
 ('happysms2', 0.9958986639976501),
 ('happy0997', 0.995539128780365),
 ('happylie8', 0.9954560399055481),
 ('happyclub', 0.9954178929328918),
 ('happyolwayz', 0.9944841265678406),
 ('happysoul966', 0.9936054348945618),
 ('happy280', 0.9929526448249817),
 ('happydefault', 0.9925441741943359),
 ('happy2020', 0.9919973611831665),
 ('happysoul', 0.9909070134162903),
 ('happy2019', 0.9894585013389587),
 ('happysaif', 0.9893147945404053),
 ('happylohri2020', 0.9892993569374084),
 ('happyunicorn', 0.9884064793586731),
 ('happyjsk', 0.9882059097290039),
 ('happyget', 0.9874106645584106),
 ('happyevil', 0.9867644906044006),
 ('happyhorizons', 0.9848529100418091),
 ('happyhika92', 0.9847056865692139),
 ('momhappy', 0.9844312071800232),
 ('happyyyy', 0.9841019511222839),
 ('🙏happy', 0.9829357266426086),
 ('happybdayviru', 0.982