# Generating Embeddings

## Importing libraries

In [1]:
#!pip install fasttext

In [2]:
import pandas as pd
import numpy as np
import gensim
import fasttext
from nltk.tokenize import word_tokenize

import sys
from pathlib import Path
import os

In [3]:
path = Path(os.path.dirname(os.getcwd()))
path = str(path)
print(path)
sys.path.insert(1, path)

/Users/saideepbunny/Projects/HuffPost-News-classification


In [4]:
from utils.utils import preprocess_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/saideepbunny/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Reading data

In [5]:
df = pd.read_json(f'{path}/data/train_data.json', orient='records')
df

Unnamed: 0,headline,category,short_description,authors,date,headline_length,short_description_length
0,Stephen Colbert Hits Trump With The Perfect 'S...,ENTERTAINMENT,"""Late Show"" host has a correction for the pres...",Ed Mazza,2018-01-12,63,52
1,Dear Colleagues: We SUCK!,WELLNESS,"I'm not saying we are wrong. In fact, we may a...","David Katz, M.D., ContributorFounder, True Hea...",2015-03-10,25,220
2,Emily Fletcher Shares Guided Meditation Techni...,WELLNESS,Meditation doesn't have to be complicated. In ...,,2013-11-28,57,120
3,HuffPost Rise: What You Need To Know On Februa...,POLITICS,Welcome to the HuffPost Rise Morning Newsbrief...,,2016-02-15,51,103
4,3 Ways To Fight Overwhelm And Add Joy To Your ...,WELLNESS,Working moms are juggling more than ever befor...,"Paula Jenkins, ContributorLife Coach and Host ...",2016-07-09,50,123
...,...,...,...,...,...,...,...
61970,Owning An Assault Weapon Is No Longer A Fundam...,POLITICS,An appeals court wiped out an earlier ruling t...,Cristian Farias,2016-03-06,68,94
61971,Blackfish: Rooting for Killer Whales and Kille...,ENTERTAINMENT,A debate ensued that resonates for yet another...,"Regina Weinreich, Contributor\nAuthor, 'Keroua...",2013-06-22,69,121
61972,Trevor Noah Mockingly Praises Trump's 'Right R...,ENTERTAINMENT,"""Now I know your first instinct is to be disgu...",Lee Moran,2017-11-28,73,125
61973,Elite 'Bundlers' Raise More Than $113 Million ...,POLITICS,"Big backers include Ben Affleck, George Lucas,...","Michael Beckel, Center for Public Integrity",2016-09-23,65,81


## Preprocessing data

In [6]:
df['content'] = df['headline'] + df['short_description']
df['content_preprocessed'] = df['content'].apply(lambda x: preprocess_text(x))

## Creating Embedding models

### CBOW models

In [7]:
def create_cbow(embedding_dim, words, window, epochs, sg):
    '''
    creating word2vec or skipgram.
    '''
    #Creating Word2Vec
    cbow_model = gensim.models.Word2Vec(words , vector_size = embedding_dim , window = window , min_count = 1, epochs = epochs, sg = sg)
    created_model = "WORD2VEC" if sg==0 else "SKIPGRAM"
    
    print(f'{created_model} {embedding_dim} VECTOR EMBEDDING DIMENSIONS:')
    print(f'=========================================')
    print(f'- Vocabulary count: {len(cbow_model.wv)}')
    print(f'''- Similar words for word "great:\n"{cbow_model.wv.most_similar('great')}''')
    print('\n')
    
    cbow_model.save(f"{path}/embeddings/{created_model.lower()}_model_{embedding_dim}.model")


### Fasttext model

In [8]:
#writing the text column to build embeddings
train_content_path = f'{path}/data/text_label.txt'
df.to_csv(train_content_path, columns = ['content_preprocessed'], header = None, index = False)

In [9]:
def create_fasttext(train_content_path, embedding_dim):
    
    model = fasttext.train_unsupervised(train_content_path, dim = embedding_dim)
    
    print(f'FASTTEXT {model.dim} VECTOR EMBEDDING DIMENSIONS:')
    print(f'=========================================')
    
    print('- Fasttext embeddings Created')
    print(f'- Vocabulary count: {len(model.words)}')
    print(f'''- Similar words for word "great:\n"{model.get_nearest_neighbors('great', k=10)}''')
    
    model.save_model(f'{path}/embeddings/fasttext_model_{embedding_dim}.bin')


## Building models

In [10]:
%%time

embedding_dims = [100, 200, 300]
corpus = [word_tokenize(sentence) for sentence in df['content_preprocessed']]

for dim in embedding_dims:

    #Word2Vec
    create_cbow(dim, corpus, 5, 50, 0)

    #Skipgram
    create_cbow(dim, corpus, 5, 50, 1)

    #Fasttext
    create_fasttext(train_content_path, dim)

    print('\n\n**********************************************************************************************************************\n\n')

WORD2VEC 100 VECTOR EMBEDDING DIMENSIONS:
- Vocabulary count: 80421
- Similar words for word "great:
"[('good', 0.4518512785434723), ('wonderful', 0.4399999976158142), ('certainly', 0.4133574366569519), ('medicate', 0.3924970328807831), ('againonly', 0.3861885368824005), ('art', 0.3858899772167206), ('sure', 0.38400423526763916), ('sellout', 0.38033515214920044), ('hurtall', 0.3745027482509613), ('channel', 0.37382563948631287)]


SKIPGRAM 100 VECTOR EMBEDDING DIMENSIONS:
- Vocabulary count: 80421
- Similar words for word "great:
"[('goalsponsors', 0.6539395451545715), ('evernote', 0.6371009945869446), ('challengesnow', 0.6287484765052795), ('finishers', 0.6078079342842102), ('narcissismthe', 0.6032633781433105), ('nationalistmake', 0.5986570715904236), ('againpeople', 0.59730064868927), ('obamathere', 0.5955760478973389), ('establishes', 0.5935031175613403), ('dreamsbeing', 0.5898616909980774)]




Read 1M words
Number of words:  17267
Number of labels: 0
Progress: 100.0% words/sec/thread:   85796 lr:  0.000000 avg.loss:  2.305209 ETA:   0h 0m 0s 98.3% words/sec/thread:   86148 lr:  0.000867 avg.loss:  2.307111 ETA:   0h 0m 0s


FASTTEXT 100 VECTOR EMBEDDING DIMENSIONS:
- Fasttext embeddings Created
- Vocabulary count: 17267
- Similar words for word "great:
"[(0.8244670033454895, 'greats'), (0.7737111449241638, 'greasy'), (0.720535933971405, 'greatly'), (0.7141452431678772, 'greatest'), (0.7092286348342896, 'greatist'), (0.7041301131248474, 'greatness'), (0.6738501787185669, 'create'), (0.6681212782859802, 'greater'), (0.6251865029335022, 'creates'), (0.604482889175415, 'goodwill')]


**********************************************************************************************************************


WORD2VEC 200 VECTOR EMBEDDING DIMENSIONS:
- Vocabulary count: 80421
- Similar words for word "great:
"[('exceptional', 0.34178149700164795), ('good', 0.33802881836891174), ('attractors', 0.318238765001297), ('magical', 0.31763002276420593), ('sacred', 0.2883853018283844), ('channel', 0.2840352952480316), ('learning', 0.2821224331855774), ('surprising', 0.2805941700935364), ('difficult', 0.27612173557281494), ('

Read 1M words
Number of words:  17267
Number of labels: 0
Progress: 100.0% words/sec/thread:   48627 lr:  0.000000 avg.loss:  2.309833 ETA:   0h 0m 0s


FASTTEXT 200 VECTOR EMBEDDING DIMENSIONS:
- Fasttext embeddings Created
- Vocabulary count: 17267
- Similar words for word "great:
"[(0.8348000049591064, 'greats'), (0.7866116166114807, 'greasy'), (0.7409617304801941, 'greatly'), (0.7353630661964417, 'greatest'), (0.7262373566627502, 'greatist'), (0.700736939907074, 'greatness'), (0.6940997838973999, 'greater'), (0.6416086554527283, 'grease'), (0.6340741515159607, 'create'), (0.6006224751472473, 'creates')]


**********************************************************************************************************************


WORD2VEC 300 VECTOR EMBEDDING DIMENSIONS:
- Vocabulary count: 80421
- Similar words for word "great:
"[('exceptional', 0.300202339887619), ('housei', 0.2871813476085663), ('wonderful', 0.27409595251083374), ('attractors', 0.27407634258270264), ('magical', 0.26589444279670715), ('sacred', 0.2656116187572479), ('industrya', 0.2568259537220001), ('honoring', 0.25417360663414), ('stressful', 0.24914678931236267), ('

Read 1M words
Number of words:  17267
Number of labels: 0
Progress: 100.0% words/sec/thread:   29220 lr:  0.000000 avg.loss:  2.307586 ETA:   0h 0m 0s


FASTTEXT 300 VECTOR EMBEDDING DIMENSIONS:
- Fasttext embeddings Created
- Vocabulary count: 17267
- Similar words for word "great:
"[(0.8409676551818848, 'greats'), (0.7794288396835327, 'greasy'), (0.7383869886398315, 'greatly'), (0.7345160841941833, 'greatest'), (0.7273334860801697, 'greatist'), (0.7152729630470276, 'greatness'), (0.6994662284851074, 'greater'), (0.6535100936889648, 'grease'), (0.6266390681266785, 'create'), (0.6094589233398438, 'retreat')]


**********************************************************************************************************************


CPU times: user 33min 23s, sys: 19.9 s, total: 33min 43s
Wall time: 10min 28s
