### Gensim Word2Vec

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import re
import spacy
import logging
import pandas as pd
from time import time
from collections import defaultdict
import io
import requests

#### Dataset preparation

In [3]:
url = requests.get('https://raw.githubusercontent.com/harshildarji/DataScienceLab/master/data/train_data.csv').content
df = pd.read_csv(io.StringIO(url.decode('utf-8')))

In [4]:
df.iloc[143:148]

Unnamed: 0,desc,tweet
143,ahrc esrc project gcrf investigating local res...,argues for greater engagement with faith value...
144,learn how to do things alone,indonesia why unhcr amp iom in indonesia keep ...
145,multiple sclerosis sag film tv medical psych a...,trump s election fraud committee found no vote...
146,l a filmmaker american nationalist i support o...,they need to go home it is not the job of isra...
147,screenplay writer producer nice guy with a big...,ice won t deport the last nazi war criminal in...


In [5]:
# check if null values exist in the dataframe

df.isnull().sum()

desc     27
tweet     0
dtype: int64

In [6]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

desc     0
tweet    0
dtype: int64

In [7]:
# display first few lines of dataframe

df['data'] = df['desc'] + ' ' + df['tweet']
df = df.drop(['desc', 'tweet'], axis = 1)
df.head()

Unnamed: 0,data
0,none mariotti did your ancestors have to apply...
1,usmc veteran love god wife family dog and coun...
2,none rt japanmissionun thank you unicef unhcr ...
3,we ve been referring people to law firms since...
4,i love life i love my family i love people wal...


In [8]:
df.shape

(3929, 1)

#### Cleaning 🧹

In [9]:
# lemmatization and removing stopwords
# in case of OSError, run 'python -m spacy download en' in terminal and try again!

nlp = spacy.load('en', disable=['ner', 'parser'])
def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [10]:
# removing non-alphanumeric characters

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['data'])

In [11]:
%%time
# using NLP pipelines to speed-up cleaning (Read more at: https://spacy.io/usage/processing-pipelines)

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

Wall time: 12.9 s


In [12]:
# create dataframe of cleaned data

df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(3291, 1)

In [13]:
df_clean.to_csv('new.csv', index = False)

#### Common phrases and most frequent words

In [14]:
#  using gensim phrases package to automatically detect common phrases to build a list of sentences

from gensim.models.phrases import Phrases, Phraser

sent = [row.split() for row in df_clean['clean']]
bigram = Phrases(sent, min_count=30)
sentences = bigram[sent]

In [15]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

10046

#### Training

In [16]:
import multiprocessing
from gensim.models import Word2Vec

In [17]:
cores = multiprocessing.cpu_count()
print('Number of cores: {}'.format(cores))

Number of cores: 4


In [18]:
%%capture
# initialize the model (Read more about parameters: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial#The-parameters:)

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [19]:
%%time
# building the vocabulary table

w2v_model.build_vocab(sentences, progress_per=10000)

Wall time: 1.08 s


In [20]:
%%time
# model training

w2v_model.train(sentences, total_examples = w2v_model.corpus_count, epochs = 30, report_delay = 1)

Wall time: 6min 6s


(177557, 1661130)

In [21]:
# precompute L2-normalized vectors
# Note: we cannot continue training after doing a replace. The model becomes read-only, means we can call most_similar, similarity etc., but not train.

w2v_model.init_sims(replace = True)

In [22]:
# save the trained model

w2v_model.wv.save_word2vec_format('model.txt')

In [23]:
w2v_model.wv.similarity('immigration', 'refugee')

0.9998005