In [1]:
# Loading libraries
import re
import time
import pandas as pd
from tqdm.auto import tqdm
from collections import Counter

from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec


In [2]:
# Loading NLTK dataset
import nltk
# Stop-word lists, read later through stopwords.words('english')
nltk.download('stopwords')
# WordNet data needed by WordNetLemmatizer
nltk.download('wordnet')
# Punkt sentence-splitting models needed by sent_tokenize
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pushpendu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pushpendu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/pushpendu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Time calculation
def cal_elapsed_time(s):
    """Print how many seconds have passed since ``s``, rounded to 2 decimals.

    ``s`` is a start timestamp as produced by ``time.time()``.
    Nothing useful is returned (the result is always ``None``).
    """
    elapsed_seconds = time.time() - s
    rounded = round(elapsed_seconds, 2)
    print("Elapsed time:\t", rounded)
    return None
s_time = time.time()
cal_elapsed_time(s=s_time)

Elapsed time:	 0.0


In [4]:
# Stop words like a,the, this, so etc
stop_words = set(stopwords.words('english'))

# Lemmatizer: reduces a word to its base form (e.g. plural noun -> singular noun)
lemmatizer = WordNetLemmatizer()
print("Length of stopwords: ", len(stop_words))
print("\nLemmatization")
print("rocks :", lemmatizer.lemmatize("rocks")) 

Length of stopwords:  179

Lemmatization
rocks : rock


# Data loading and cleaning

In [5]:
# Removing na values from dataframe
def data_na_value_cleaning(data):
    """Report and drop the rows of ``data`` that contain null values.

    Prints the shape and the per-column null counts before and after the
    cleaning step.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows that had at least one null value,
        re-indexed from 0.
    """
    print("\nBefore cleaning, Data Shape : ", data.shape)
    print("\nBefore removing Null values: ---------------")
    print(data.isna().sum())

    # Build a new frame instead of using inplace=True: the caller's object
    # stays intact, so re-running the cell always starts from the same input.
    cleaned = data.dropna().reset_index(drop=True)

    print("After removing Null values: ---------------")
    print(cleaned.isna().sum())
    print("\nAfter cleaning, Data Shape : ", cleaned.shape)

    return cleaned


In [6]:
# Removing duplicate values
def duplicate_content_removal(data, col, ini_row=None):
    """Drop duplicated values of one column and report what was removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to de-duplicate.
    col : str
        Name of the column whose duplicated values are dropped.
    ini_row : int, optional
        Row count reported as the "before" figure. Defaults to ``len(data)``
        so callers no longer have to pass it by hand.

    Returns
    -------
    list
        The values of ``data[col]`` with duplicates removed, first occurrence
        kept, in their original order.
    """
    if ini_row is None:
        ini_row = len(data)

    print("\nBefore removing duplicates, number of data was : ", ini_row)
    duplicate_count = data[col].duplicated().sum()
    print("\nNumber of Duplicates: ", duplicate_count)

    description_data = data[col].drop_duplicates()
    cleaned_row = len(description_data)
    reduction = ini_row - cleaned_row

    if reduction > 0:
        print("\nTotal data reduction : ", reduction)
        print("\nAfter removing duplicates, number of data is :", cleaned_row)
    else:
        print("\nDataset doesn't content any duplicate data.")

    return list(description_data)

In [7]:
# Download the dataset from below link
# https://www.kaggle.com/stefanbergstein/byo-tweets-predict-your-myers-briggs-personality?select=mbti_1.csv

# Loading the dataset
data_df = pd.read_csv("data/mbti_1.csv")
data_df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


### Data Description

This dataset is the Myers-Briggs (MBTI) personality dataset.

type: the personality type of the author (there are 16 personality types)

posts: the comments posted by a person of that personality type

In [8]:
data_df = data_na_value_cleaning(data_df)


Before cleaning, Data Shape :  (8675, 2)

Before removing Null values: ---------------
type     0
posts    0
dtype: int64
After removing Null values: ---------------
type     0
posts    0
dtype: int64

After cleaning, Data Shape :  (8675, 2)


In [9]:
posts = duplicate_content_removal(data_df, 'posts', data_df.shape[0])


Before removing duplicates, number of data was :  8675

Number of Duplicates:  0

Dataset doesn't content any duplicate data.


##### Data cleaning

In [10]:
def remove_link_punc(string):
    """Strip links and every non-letter character from ``string``.

    Steps, in order:
      1. each http/https link is replaced by a single space;
      2. every character outside a-z / A-Z is replaced by a space;
      3. runs of spaces are collapsed to one space and the text is lower-cased.

    The result is not stripped, so it may start or end with a single space.
    """
    # Note: inside the character class, `$-_` is a range, so it also covers
    # characters such as '/', '?', '=' and ':' that appear in URL paths.
    link_pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    without_links = re.sub(link_pattern, ' ', string)

    letters_only = re.sub('[^a-zA-Z]', ' ', without_links)

    single_spaced = re.sub(' +', ' ', letters_only)
    return single_spaced.lower()


In [11]:
def data_cleaning(content, min_word_len=3, min_sent_len=5):
    """Turn raw documents into lists of cleaned, lemmatized tokens.

    Each document is split into sentences with ``sent_tokenize``. Every
    sentence is lower-cased and passed through ``remove_link_punc``; the
    remaining words are filtered against ``stop_words`` and a minimum length,
    then lemmatized with ``lemmatizer``. These helpers are the module-level
    objects defined in earlier cells.

    Parameters
    ----------
    content : iterable
        The raw documents. Each item is converted with ``str()`` before
        tokenization; items equal to ``""`` are skipped. Any iterable works,
        not only positionally indexable sequences.
    min_word_len : int, optional
        Words shorter than this are dropped (default 3).
    min_sent_len : int, optional
        Sentences with fewer kept words than this are dropped (default 5).

    Returns
    -------
    list of list of str
        One token list per retained sentence.
    """
    sentences = []
    # Iterate over the items directly: the former content[idx] lookup fails for
    # generators and for a pandas Series whose index is not 0..n-1.
    for post in tqdm(content):
        if post == "":
            continue

        # Sentence tokenization using NLTK library
        for each_sent in sent_tokenize(str(post)):
            # Removing link and punctuation
            cleaned_sent = remove_link_punc(each_sent.lower())

            # Removing stopwords / short words and applying lemmatization
            tokens = [
                lemmatizer.lemmatize(each_word)
                for each_word in cleaned_sent.split()
                if each_word not in stop_words and len(each_word) >= min_word_len
            ]

            # Only keep sentences that still hold enough words
            if len(tokens) >= min_sent_len:
                sentences.append(tokens)

    return sentences


In [12]:
sent_corpus = data_cleaning(posts)

HBox(children=(FloatProgress(value=0.0, max=8675.0), HTML(value='')))




In [17]:
# Sentence length statistics: how many sentences have a given number of words
len_count = [len(tokens) for tokens in sent_corpus]

print("Total number of Sentences : ", len(len_count))
word_sent_df = pd.DataFrame(
    sorted(Counter(len_count).items()),
    columns=["No of Words in each Sentence", "No of sentence"],
)
word_sent_df.head(10)

Total number of Sentences :  395702


Unnamed: 0,No of Words in each Sentence,No of sentence
0,5,51230
1,6,47073
2,7,41657
3,8,36835
4,9,31599
5,10,27083
6,11,22830
7,12,19160
8,13,16286
9,14,14001


In [20]:
# data after cleaning and preprocessing
print(sent_corpus[0])

['enfp', 'intj', 'moment', 'sportscenter', 'top', 'ten', 'play', 'prank', 'life', 'changing', 'experience', 'life', 'repeat', 'today', 'may', 'perc', 'experience', 'immerse', 'last', 'thing', 'infj', 'friend', 'posted', 'facebook', 'committing', 'suicide', 'next', 'day']


## Model training

##### Using Gensim to train a word2vec model

In [13]:
from gensim.models import Word2Vec

In [14]:
# Start timestamp that cal_elapsed_time() measures against
s_time = time.time()
print("Model Training Started...")
# Hyperparameters: 200-dimensional vectors, context window of 4 words,
# min_count=1 (no word is dropped for being rare), 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x names it
# `vector_size` — confirm the installed version before upgrading.
# NOTE(review): no `seed` is passed, so results presumably differ between
# runs — confirm if reproducible vectors are required.
model = Word2Vec(sentences=sent_corpus, size=200, window=4, min_count=1, workers=4)
cal_elapsed_time(s_time)


Model Training Started...
Elapsed time:	 37.59


In [23]:
print("Total number of unique words loaded in Model : ", len(model.wv.vocab))

Total number of unique words loaded in Model :  83717


In [24]:
# Saving the model
model.save("data/model/trainned_model.model")

In [25]:
# Loading the model
model = Word2Vec.load("data/model/trainned_model.model")

In [26]:
# Find similar words for efficiency
model.wv.most_similar("efficiency", topn=10)

[('practicality', 0.8367217779159546),
 ('monetary', 0.8339818716049194),
 ('measurable', 0.8308203220367432),
 ('regulation', 0.8286392688751221),
 ('scope', 0.821826696395874),
 ('strategic', 0.8177478909492493),
 ('eliminating', 0.8165096044540405),
 ('requiring', 0.8137727975845337),
 ('reducing', 0.8120754361152649),
 ('intrinsic', 0.8112998008728027)]

In [27]:
# Get vector form of word king
model.wv.get_vector('king')

array([-0.32107702,  0.02634325,  0.5982603 , -0.6479276 ,  0.3632543 ,
       -0.30940267,  0.11201022, -1.4546527 , -0.6258229 ,  0.06379458,
       -0.5263066 , -0.14645165, -0.08704363, -0.45985267, -0.02743918,
       -0.14499499,  1.0008129 ,  0.00337885, -0.08546805,  0.728745  ,
       -0.5122128 , -0.03007157, -0.04415359,  0.09485263,  0.8781621 ,
       -0.9580598 , -0.7981698 , -0.4311728 ,  0.67374474, -0.8651508 ,
        0.5282616 ,  0.04314229,  0.82111883, -0.93143344,  0.17256826,
        0.3960089 ,  0.15104313,  0.620224  ,  0.08957284, -0.33669117,
       -0.6600646 ,  0.5423459 , -0.41448766, -1.0866758 , -0.41825002,
        0.93874204, -0.4660616 , -0.2024687 ,  0.26992255,  0.10713856,
       -0.47868192,  0.11337369, -0.43584147,  0.28840148, -0.457471  ,
       -1.4268371 , -0.04075855, -0.3103302 , -0.00450425,  0.7783772 ,
        0.50324357, -0.43334493,  0.44846568, -1.1994292 ,  0.12398864,
        0.48461846, -0.6658262 ,  0.77683586, -0.4329836 ,  0.90

In [18]:
# Shape of the vector for the word 'king'
print("Shape of the vector : ", model.wv.get_vector('king').shape)

Shape of the vector :  (200,)


In [41]:
# Another way to get vectorized form of word2vec
model.wv.word_vec('king')

array([-0.32107702,  0.02634325,  0.5982603 , -0.6479276 ,  0.3632543 ,
       -0.30940267,  0.11201022, -1.4546527 , -0.6258229 ,  0.06379458,
       -0.5263066 , -0.14645165, -0.08704363, -0.45985267, -0.02743918,
       -0.14499499,  1.0008129 ,  0.00337885, -0.08546805,  0.728745  ,
       -0.5122128 , -0.03007157, -0.04415359,  0.09485263,  0.8781621 ,
       -0.9580598 , -0.7981698 , -0.4311728 ,  0.67374474, -0.8651508 ,
        0.5282616 ,  0.04314229,  0.82111883, -0.93143344,  0.17256826,
        0.3960089 ,  0.15104313,  0.620224  ,  0.08957284, -0.33669117,
       -0.6600646 ,  0.5423459 , -0.41448766, -1.0866758 , -0.41825002,
        0.93874204, -0.4660616 , -0.2024687 ,  0.26992255,  0.10713856,
       -0.47868192,  0.11337369, -0.43584147,  0.28840148, -0.457471  ,
       -1.4268371 , -0.04075855, -0.3103302 , -0.00450425,  0.7783772 ,
        0.50324357, -0.43334493,  0.44846568, -1.1994292 ,  0.12398864,
        0.48461846, -0.6658262 ,  0.77683586, -0.4329836 ,  0.90

In [28]:
# Vectorized shape of each words in model
model.wv.get_vector('king').shape

(200,)

In [29]:
# Cosine similarity between two sets of words
model.wv.n_similarity(['king','male'],['queen','female'])

0.880749

In [34]:
# Distance between two words
model.wv.distance('king','queen')

0.15679800510406494

In [35]:
# Similar word for king
model.wv.similar_by_word('king')

[('stephen', 0.8628365993499756),
 ('claiborne', 0.8594867587089539),
 ('smith', 0.8592435717582703),
 ('adam', 0.8448503613471985),
 ('donnie', 0.8437778353691101),
 ('queen', 0.8432019948959351),
 ('lion', 0.839264988899231),
 ('scissorhands', 0.8383902311325073),
 ('martin', 0.8365925550460815),
 ('trilogy', 0.8352822661399841)]

In [31]:
# Distance between two words
model.wv.distance("king", "queen")

0.18909430503845215

In [32]:
# Distance between two words
model.wv.distance("king", "male")

1.0550790503621101

In [35]:
# Distance between two words
model.wv.distance("queen", "female")

0.979871017858386

In [36]:
# Odd one out from list of words
model.wv.doesnt_match(["king", "george","stephen","truck"])

'truck'

In [37]:
# word pairs evaluation
model.wv.evaluate_word_pairs('data/SimLex-999/SimLex-999_2.txt',)

((-0.19954722283193066, 0.5133649428804943),
 SpearmanrResult(correlation=-0.19780219780219782, pvalue=0.51713091148483),
 0.0)

In [38]:
# Words analogy from list of words
model.wv.evaluate_word_analogies('data/questions-words.txt')

(0.11880279103929489,
 [{'section': 'capital-common-countries',
   'correct': [('ATHENS', 'GREECE', 'PARIS', 'FRANCE'),
    ('BERLIN', 'GERMANY', 'MADRID', 'SPAIN'),
    ('BERLIN', 'GERMANY', 'PARIS', 'FRANCE'),
    ('OTTAWA', 'CANADA', 'TOKYO', 'JAPAN'),
    ('PARIS', 'FRANCE', 'ROME', 'ITALY'),
    ('PARIS', 'FRANCE', 'ATHENS', 'GREECE'),
    ('STOCKHOLM', 'SWEDEN', 'PARIS', 'FRANCE')],
   'incorrect': [('ATHENS', 'GREECE', 'BAGHDAD', 'IRAQ'),
    ('ATHENS', 'GREECE', 'BANGKOK', 'THAILAND'),
    ('ATHENS', 'GREECE', 'BEIJING', 'CHINA'),
    ('ATHENS', 'GREECE', 'BERLIN', 'GERMANY'),
    ('ATHENS', 'GREECE', 'BERN', 'SWITZERLAND'),
    ('ATHENS', 'GREECE', 'CAIRO', 'EGYPT'),
    ('ATHENS', 'GREECE', 'CANBERRA', 'AUSTRALIA'),
    ('ATHENS', 'GREECE', 'HAVANA', 'CUBA'),
    ('ATHENS', 'GREECE', 'HELSINKI', 'FINLAND'),
    ('ATHENS', 'GREECE', 'ISLAMABAD', 'PAKISTAN'),
    ('ATHENS', 'GREECE', 'LONDON', 'ENGLAND'),
    ('ATHENS', 'GREECE', 'MADRID', 'SPAIN'),
    ('ATHENS', 'GREECE', 'MO

In [None]:
# Loading the pretrained Google News word2vec vectors (binary word2vec format)
from gensim import models

# Bind the loaded vectors to a name so that later cells can query them;
# the bare call loaded the whole file without keeping a usable reference.
google_news_vectors = models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin', binary=True
)


In [44]:
# Words that are closer to 'king' than 'queen' is
model.wv.closer_than('king','queen')

['princess',
 'angel',
 'vampire',
 'robert',
 'smith',
 'fairy',
 'lion',
 'adam',
 'stephen',
 'hunter',
 'trilogy',
 'alice',
 'pirate',
 'anne',
 'martin',
 'johnny',
 'beatles',
 'wonderland',
 'donnie',
 'darko']