<a href="https://colab.research.google.com/github/sivannavis/NLP-for-human-rights/blob/main/WordEmbed_for_10k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Read in 10k dataset demonstration
Previous code in DEI-10k

In [1]:
import pandas as pd
import itertools
import spacy
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
import datetime

# import chart_studio
# import chart_studio.plotly as py
# import chart_studio.tools as tls
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px

from tqdm.notebook import tqdm_notebook
from tqdm import  tqdm
tqdm_notebook.pandas(desc="progress bar")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Connecting to working directory and read in 10k data for a specific year

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# for specific years
selected_year = '2020'

In [4]:
# working directory
%cd "/content/gdrive/MyDrive/DFG Cost of Human Rights Violations/Datasets/10k_clean_text/_0.2 paragraphed"
!pwd

/content/gdrive/.shortcut-targets-by-id/1OPzVNu0CgKsi5tBwlMJVXZTu1EWX9F6C/DFG Cost of Human Rights Violations/Datasets/10k_clean_text/_0.2 paragraphed
/content/gdrive/.shortcut-targets-by-id/1OPzVNu0CgKsi5tBwlMJVXZTu1EWX9F6C/DFG Cost of Human Rights Violations/Datasets/10k_clean_text/_0.2 paragraphed


In [5]:
dir_10ks = '10ks_para_sics/'
file_name_prefix_10ks = dir_10ks + '10ks_para_sics_'

df_selected_year_10ks = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv')

# read with row limits
# df_selected_year_10ks = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv', nrows = 100)


In [6]:
df_selected_year_10ks.shape

(2017623, 4)

In [7]:

df_selected_year_10ks.head()

Unnamed: 0,id,ticker_display,primary_industry_id,text
0,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant:...
1,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
2,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
3,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,This Annual Report on Form 10-K contains forwa...
4,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,In some cases you can identify forward-looking...


## **Demonstration of duplication dropping process (ignored for samples)**

In [8]:
# Create 10k column to drop duplicate paragraphs at the 10k level
df_selected_year_10ks['10K_id'] = df_selected_year_10ks.apply(lambda row: row['id'].rsplit('_', 1)[0], axis=1)

KeyboardInterrupt: ignored

In [None]:
df_selected_year_10ks_deduped = df_selected_year_10ks.drop_duplicates(subset=['10K_id', 'text'], keep='first')

In [None]:
df_selected_year_10ks_deduped.drop(['10K_id'], inplace=True, axis=1)

In [None]:
df_selected_year_10ks_deduped.shape
df_selected_year_10ks_deduped.head()

In [None]:
selected_year_10ks_dir = f'{selected_year}_DEI_10ks'

In [None]:
os.mkdir(selected_year_10ks_dir)

In [None]:
df_selected_year_10ks_deduped.to_csv(f'{selected_year_10ks_dir}/{selected_year}_10ks_deduped.csv.gz', compression='gzip', index=False)

In [None]:
del df_selected_year_10ks_deduped
del df_selected_year_10ks

# Embedding 1.0 - skip gram with unigram tonkenizers

## data preprocessing

### sampling and dropping duplicates

In [9]:
# 1000-sample example
# for specific years
selected_year = '2020'
# read with row limits
sample = 1000
# df_selected_year_10ks_sample = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv', nrows = sample, usecols = [3])
df_selected_year_10ks_sample = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv', nrows = sample)
print(df_selected_year_10ks_sample.shape)


(1000, 4)


In [14]:
# drop duplicates
df_selected_year_10ks_sample['10K_id'] = df_selected_year_10ks_sample.apply(lambda row: row['id'].rsplit('_', 1)[0], axis=1)
df_selected_year_10ks_sample_deduped = df_selected_year_10ks_sample.drop_duplicates(subset=['10K_id', 'text'], keep='first')
df_selected_year_10ks_sample_deduped.drop(['10K_id'], inplace=True, axis=1)
print(df_selected_year_10ks_sample_deduped.shape)

# save to files
selected_year_10ks_sample_dir = f'{selected_year}_DEI_10ks'
# os.mkdir(selected_year_10ks_sample_dir)
df_selected_year_10ks_sample_deduped.to_csv(f'{selected_year_10ks_sample_dir}/{selected_year}_10ks_sample_deduped.csv.gz', compression='gzip', index=False)

(979, 4)


In [16]:
df_selected_year_10ks_sample_deduped.head()

Unnamed: 0,id,ticker_display,primary_industry_id,text
0,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant:...
1,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
2,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
3,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,This Annual Report on Form 10-K contains forwa...
4,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,In some cases you can identify forward-looking...


### tokenizers(unigram)
text to sentense runtime: ~4s

In [15]:


import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np
from gensim.models import Word2Vec
from gensim.models import fasttext


In [16]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def text_to_wordlist( text, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    text = BeautifulSoup(text).get_text()
    #  
    # 2. Remove non-letters
    text = re.sub("[^a-zA-Z]"," ", text)
    #
    # 3. Convert words to lower case and split them
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words(" "))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [17]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
nltk.download("popular")   

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def text_to_sentences( text, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( text_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [21]:
sentences = []  # Initialize an empty list of sentences

# print(df_selected_year_10ks_sample)
print ("Parsing sentences from training set")
for text in df_selected_year_10ks_sample_deduped['text']:
    print(text)
    sentences += text_to_sentences(text, tokenizer)

Parsing sentences from training set
Indicate by check mark whether the registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports) and (2) has been subject to such filing requirements for the past 90 days.    Yes  ☒    No  ☐
Indicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§ 232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files).    Yes  ☒    No  ☐
Indicate by check mark whether the registrant is a large accelerated filer an accelerated filer a non-accelerated filer a smaller reporting company or an emerging growth company. See the definitions of the "large accelerated filer" "accelerated filer" "smaller reporting 

In [22]:
# check sentences
print(len(sentences))
print(sentences[0])

3999
['indicate', 'by', 'check', 'mark', 'whether', 'the', 'registrant', 'has', 'filed', 'all', 'reports', 'required', 'to', 'be', 'filed', 'by', 'section', 'or', 'd', 'of', 'the', 'securities', 'exchange', 'act', 'of', 'during', 'the', 'preceding', 'months', 'or', 'for', 'such', 'shorter', 'period', 'that', 'the', 'registrant', 'was', 'required', 'to', 'file', 'such', 'reports', 'and', 'has', 'been', 'subject', 'to', 'such', 'filing', 'requirements', 'for', 'the', 'past', 'days']


## Training Word2Vec model

In [20]:
# Import the built-in logging module and configure it so that Word2Vec 
# output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")

# Using default architecture: skip-gram
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2022-03-24 03:26:05,781 : INFO : collecting all words and their counts
2022-03-24 03:26:05,783 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-24 03:26:05,810 : INFO : collected 5545 word types from a corpus of 126597 raw words and 4081 sentences
2022-03-24 03:26:05,812 : INFO : Loading a fresh vocabulary
2022-03-24 03:26:05,817 : INFO : effective_min_count=40 retains 524 unique words (9% of original 5545, drops 5021)
2022-03-24 03:26:05,819 : INFO : effective_min_count=40 leaves 95210 word corpus (75% of original 126597, drops 31387)
2022-03-24 03:26:05,823 : INFO : deleting the raw counts dictionary of 5545 items
2022-03-24 03:26:05,825 : INFO : sample=0.001 downsamples 52 most-common words
2022-03-24 03:26:05,827 : INFO : downsampling leaves estimated 59270 word corpus (62.3% of prior 95210)
2022-03-24 03:26:05,830 : INFO : estimated required memory for 524 words and 300 dimensions: 1519600 bytes
2022-03-24 03:26:05,832 : INFO : resetting layer wei

Training model...


2022-03-24 03:26:06,045 : INFO : worker thread finished; awaiting finish of 3 more threads
2022-03-24 03:26:06,047 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 03:26:06,052 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 03:26:06,056 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 03:26:06,060 : INFO : EPOCH - 1 : training on 126597 raw words (59313 effective words) took 0.1s, 501021 effective words/s
2022-03-24 03:26:06,157 : INFO : worker thread finished; awaiting finish of 3 more threads
2022-03-24 03:26:06,175 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 03:26:06,178 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 03:26:06,184 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 03:26:06,187 : INFO : EPOCH - 2 : training on 126597 raw words (59347 effective words) took 0.1s, 513691 effective words/s
2022

In [24]:
# checking model results
# model.doesnt_match("identify statements term such".split())
print(model.wv.doesnt_match("statements term such products cost".split()))
print(model.wv.most_similar("equity"))
print(model.wv.most_similar("patent"))
print(model.wv.similarity("agreement", "statements"))

term
[('exchange', 0.9976292252540588), ('investment', 0.9971957206726074), ('since', 0.9971070289611816), ('grant', 0.9958962798118591), ('loss', 0.9958276748657227), ('paid', 0.9955037832260132), ('iv', 0.9950726628303528), ('through', 0.994901716709137), ('granted', 0.9945908784866333), ('within', 0.9942872524261475)]
[('claims', 0.9892095327377319), ('such', 0.9889212846755981), ('patents', 0.9847432374954224), ('are', 0.9841980338096619), ('it', 0.9839975833892822), ('able', 0.9827666878700256), ('do', 0.9818534255027771), ('party', 0.9811986088752747), ('would', 0.9795234203338623), ('applications', 0.9792795777320862)]
0.21258757


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [25]:
model.wv.most_similar(positive = ['rights', 'patent'], negative = ['properties'], topn = 10)

[('more', 0.9585977792739868),
 ('that', 0.9578635096549988),
 ('would', 0.9536533355712891),
 ('will', 0.9422581195831299),
 ('result', 0.9414169192314148),
 ('do', 0.9412647485733032),
 ('impact', 0.939698338508606),
 ('affect', 0.9393108487129211),
 ('have', 0.9353950619697571),
 ('it', 0.9340128898620605)]

In [26]:
words = model.wv.index2word
print(len(words))
print(words[:100])
print(model['financial'])
# for word in words:
    # print(word,model[word])

524
['the', 'of', 'and', 'to', 'in', 'or', 'a', 'our', 'for', 'we', 'as', 'that', 'with', 'on', 'may', 'be', 'company', 'is', 'are', 'by', 'not', 'other', 'from', 'which', 'any', 'an', 's', 'have', 'such', 'if', 'at', 'properties', 'will', 'product', 'under', 'clinical', 'could', 'lease', 'certain', 'us', 'shares', 'ser', 'including', 'development', 'financial', 'property', 'these', 'december', 'million', 'has', 'its', 'agreement', 'also', 'all', 'subject', 'this', 'products', 'costs', 'approval', 'common', 'future', 'additional', 'business', 'than', 'holdco', 'operating', 'value', 'fda', 'study', 'stock', 'term', 'candidates', 'microbiome', 'based', 'result', 'was', 'it', 'income', 'were', 'operations', 'leases', 'addition', 'rights', 'research', 'expenses', 'market', 'patent', 'statements', 'time', 'ability', 'results', 'related', 'master', 'use', 'more', 'third', 'their', 'over', 'management', 'required']
[-7.18645379e-02 -5.05775474e-02 -7.96697289e-02 -3.70738171e-02
 -4.85505722e

  after removing the cwd from sys.path.


# Embedding 2.0 tri-gram/bi-gram/uni-gram on skip-gram

## data preprocessing

### sampling and dropping duplicates_copy

In [None]:
# 1000-sample example
# for specific years
selected_year = '2020'
# read with row limits
sample = 1000
# df_selected_year_10ks_sample = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv', nrows = sample, usecols = [3])
df_selected_year_10ks_sample = pd.read_csv(file_name_prefix_10ks + selected_year + '.csv', nrows = sample)
print(df_selected_year_10ks_sample.shape)


(1000, 4)


In [None]:
# drop duplicates
df_selected_year_10ks_sample['10K_id'] = df_selected_year_10ks_sample.apply(lambda row: row['id'].rsplit('_', 1)[0], axis=1)
df_selected_year_10ks_sample_deduped = df_selected_year_10ks_sample.drop_duplicates(subset=['10K_id', 'text'], keep='first')
df_selected_year_10ks_sample_deduped.drop(['10K_id'], inplace=True, axis=1)
print(df_selected_year_10ks_sample_deduped.shape)

# save to files
selected_year_10ks_sample_dir = f'{selected_year}_DEI_10ks'
# os.mkdir(selected_year_10ks_sample_dir)
df_selected_year_10ks_sample_deduped.to_csv(f'{selected_year_10ks_sample_dir}/{selected_year}_10ks_sample_deduped.csv.gz', compression='gzip', index=False)

(979, 4)


In [None]:
df_selected_year_10ks_sample_deduped.head()

Unnamed: 0,id,ticker_display,primary_industry_id,text
0,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant:...
1,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
2,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,Indicate by check mark whether the registrant ...
3,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,This Annual Report on Form 10-K contains forwa...
4,2020Q1_MCRB_10-K_20191231_0001564590-20-008002...,MCRB,HC-BP,In some cases you can identify forward-looking...


### tokenizers(unigram)
text to sentense runtime: ~4s

In [None]:


import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np
from gensim.models import Word2Vec
from gensim.models import fasttext


In [None]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def text_to_wordlist( text, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    text = BeautifulSoup(text).get_text()
    #  
    # 2. Remove non-letters
    text = re.sub("[^a-zA-Z]"," ", text)
    #
    # 3. Convert words to lower case and split them
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words(" "))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

In [None]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
nltk.download("popular")   

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def text_to_sentences( text, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( text_to_wordlist( raw_sentence, \
              remove_stopwords ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [65]:
sentences = []  # Initialize an empty list of sentences

# print(df_selected_year_10ks_sample)
print ("Parsing sentences from training set")
for text in df_selected_year_10ks_sample_deduped['text']:
    print(text)
    sentences += text_to_sentences(text, tokenizer)

Parsing sentences from training set
Indicate by check mark whether the registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports) and (2) has been subject to such filing requirements for the past 90 days.    Yes  ☒    No  ☐
Indicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§ 232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files).    Yes  ☒    No  ☐
Indicate by check mark whether the registrant is a large accelerated filer an accelerated filer a non-accelerated filer a smaller reporting company or an emerging growth company. See the definitions of the "large accelerated filer" "accelerated filer" "smaller reporting 

In [66]:
# check sentences
print(len(sentences))
print(sentences[0])
print(sentences)

3999
['indicate', 'by', 'check', 'mark', 'whether', 'the', 'registrant', 'has', 'filed', 'all', 'reports', 'required', 'to', 'be', 'filed', 'by', 'section', 'or', 'd', 'of', 'the', 'securities', 'exchange', 'act', 'of', 'during', 'the', 'preceding', 'months', 'or', 'for', 'such', 'shorter', 'period', 'that', 'the', 'registrant', 'was', 'required', 'to', 'file', 'such', 'reports', 'and', 'has', 'been', 'subject', 'to', 'such', 'filing', 'requirements', 'for', 'the', 'past', 'days']


### [failed] build bi-grams < token-to-phrases-to-token method

In [27]:
from gensim.models.phrases import Phrases, Phraser
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=5,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

In [28]:
# build phrase model
phrases_model = build_phrases(sentences)

# save bi-gram phrase model
phrases_model.save('phrases_model.txt')
phrases_model= Phraser.load('phrases_model.txt')

2022-03-24 03:41:36,212 : INFO : collecting all words and their counts
2022-03-24 03:41:36,213 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-03-24 03:41:36,268 : INFO : PROGRESS: at sentence #1000, processed 28116 words and 17592 word types
2022-03-24 03:41:36,322 : INFO : PROGRESS: at sentence #2000, processed 57395 words and 28465 word types
2022-03-24 03:41:36,383 : INFO : PROGRESS: at sentence #3000, processed 88994 words and 39105 word types
2022-03-24 03:41:36,453 : INFO : collected 45600 word types from a corpus of 123921 words (unigram + bigrams) and 3999 sentences
2022-03-24 03:41:36,454 : INFO : using 45600 counts as vocab in Phrases<0 vocab, min_count=5, threshold=7, max_vocab_size=40000000>
2022-03-24 03:41:36,455 : INFO : source_vocab length 45600
2022-03-24 03:41:36,938 : INFO : Phraser built with 1024 phrasegrams
2022-03-24 03:41:36,943 : INFO : saving Phraser object under phrases_model.txt, separately None
2022-03-24 03:41:37,458 : INFO : sa

In [29]:
# bi-gram extraction
def sentence_to_bi_grams(phrases_model, sentence):
    return ' '.join(phrases_model[sentence])

# bo-gram concatenation
def sentences_to_bi_grams(n_grams, sentences, output_file_name):
  with open(output_file_name, 'w+') as out_file:
    for sentence in sentences:
        # cleaned_sentence = clean_sentence(sentence)
        # tokenized_sentence = tokenize(cleaned_sentence)
        parsed_sentence = sentence_to_bi_grams(n_grams, sentence)
        out_file.write(parsed_sentence + '\n')

sentences_to_bi_grams(phrases_model, sentences, "sample_bigram")


# def sentences_to_bi_grams(n_grams, input_file_name, output_file_name):
#     with open(input_file_name, 'r') as input_file_pointer:
#         with open(output_file_name, 'w+') as out_file:
#             for sentence in get_sentences(input_file_pointer):
#                 cleaned_sentence = clean_sentence(sentence)
#                 tokenized_sentence = tokenize(cleaned_sentence)
#                 parsed_sentence = sentence_to_bi_grams(n_grams, tokenized_sentence)
#                 out_file.write(parsed_sentence + '\n')

#### Tokenize bigram again
runtime: ~ 60s

In [67]:
# check out bi-grams
sample_bigram = open("sample_bigram",'r').read().split()
! pwd
print(sample_bigram)
print(len(sample_bigram))
# sentences=  sample_bigram

/content/gdrive/.shortcut-targets-by-id/1OPzVNu0CgKsi5tBwlMJVXZTu1EWX9F6C/DFG Cost of Human Rights Violations/Datasets/10k_clean_text/_0.2 paragraphed
108295


In [68]:
# Redo "sentences" tokenizers
bigram_sentences = []  # Initialize an empty list of sentences

# print(df_selected_year_10ks_sample)
print ("Parsing sentences from training set")
for text in sample_bigram:
    print(text)
    bigram_sentences += text_to_sentences(text, tokenizer)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
adverse_events
of
unanticipated
severity
or
frequency
problems
with
manufacturers
or
manufacturing_processes
or
failure
to_comply
with
regulatory_requirements
the
regulatory_agency
may
impose
restrictions_on
the
products
or
us
including
requiring
withdrawal
of
the
product
from
the
market
any
failure
to_comply
with
applicable
regulatory_requirements
may
yield
various
results
including
noncompliance
with
similar
eu
requirements
regarding
safety
monitoring
or
pharmacovigilance
can
also
result_in
significant
financial
penalties
similarly
failure
to_comply
with
u_s
and
foreign_regulatory
requirements
regarding
the
development
of
products
for
pediatric
populations
and
the
protection
of
personal
health_information
can
also
lead
to
significant
penalties
and
sanctions
any
government
investigation
of
alleged
violations
of
law
could_require
us
to
expend
significant
time
and
resources
in
response
and
could
generate
negative
publicity

  ' Beautiful Soup.' % markup)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
actively
redeveloping
space_at
our
properties
and
re_leasing
such
space
to
new
diversified
tenants
at
higher
rents
than
those
paid
for
space
currently
or
formerly
occupied_by
sears
or
kmart
prior_to
redevelopment
we
seek
to
optimize
the
mix
of
tenants
at
and
maximize
the
value
of
our
properties
by
focusing
on
growing
national
retailers
and
taking_into
account
customer
demographics
and
the
competitive
environment
of
each
property
s
market
area
we_believe
that
the
superior
real_estate
locations
diversity
of
property
types
and
national
footprint
that
characterize
our_portfolio
make
us
well
positioned
to
meet
the
store
growth
needs
of
retailers
across
a
variety
of
sectors
and
concepts
as
we
lease
space
to
such
retailers
we
aim
to
create
multi_tenant
shopping_centers
that
command
superior
rents
and
valuations
due_to
their
prime
locations
synergies
with
adjoining
retailers
and
proximity
to
productive
malls
and
shopping_centers


In [59]:
# check sentences
sentences = bigram_sentences
print(len(sentences))
print(sentences)

108295


## bigram tokenizers: phrases within tokenizing

## Training Word2Vec model_copied
runtime: ~1s

In [61]:
# Import the built-in logging module and configure it so that Word2Vec 
# output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")

# Using default architecture: skip-gram
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context_bigram"
model.save(model_name)

2022-03-24 04:06:43,596 : INFO : collecting all words and their counts
2022-03-24 04:06:43,601 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-24 04:06:43,621 : INFO : PROGRESS: at sentence #10000, processed 61789 words, keeping 27 word types
2022-03-24 04:06:43,632 : INFO : PROGRESS: at sentence #20000, processed 123422 words, keeping 27 word types
2022-03-24 04:06:43,645 : INFO : PROGRESS: at sentence #30000, processed 188172 words, keeping 27 word types
2022-03-24 04:06:43,660 : INFO : PROGRESS: at sentence #40000, processed 250476 words, keeping 27 word types
2022-03-24 04:06:43,670 : INFO : PROGRESS: at sentence #50000, processed 313343 words, keeping 27 word types
2022-03-24 04:06:43,684 : INFO : PROGRESS: at sentence #60000, processed 375336 words, keeping 27 word types
2022-03-24 04:06:43,695 : INFO : PROGRESS: at sentence #70000, processed 438063 words, keeping 27 word types
2022-03-24 04:06:43,709 : INFO : PROGRESS: at sentence #80000, proce

Training model...


2022-03-24 04:06:44,069 : INFO : worker thread finished; awaiting finish of 3 more threads
2022-03-24 04:06:44,077 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 04:06:44,079 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 04:06:44,086 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 04:06:44,087 : INFO : EPOCH - 1 : training on 677586 raw words (115355 effective words) took 0.3s, 405193 effective words/s
2022-03-24 04:06:44,374 : INFO : worker thread finished; awaiting finish of 3 more threads
2022-03-24 04:06:44,378 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-03-24 04:06:44,388 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-03-24 04:06:44,390 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-03-24 04:06:44,394 : INFO : EPOCH - 2 : training on 677586 raw words (115443 effective words) took 0.3s, 392291 effective words/s
20

## unigram tasks revisit

In [62]:
# checking model results
# model.doesnt_match("identify statements term such".split())
print(model.wv.doesnt_match("statements term such products cost".split()))
print(model.wv.most_similar("equity"))
print(model.wv.most_similar("patent"))
print(model.wv.similarity("agreement", "statements"))



ValueError: ignored

In [55]:
model.wv.most_similar(positive = ['rights', 'patent'], negative = ['properties'], topn = 10)

[('offering', 0.9992719888687134),
 ('voting', 0.9992697238922119),
 ('tax', 0.9992654323577881),
 ('fee', 0.9992598295211792),
 ('capital', 0.9992556571960449),
 ('an', 0.9992542862892151),
 ('rental', 0.9992340803146362),
 ('executive', 0.9992314577102661),
 ('non', 0.999228835105896),
 ('interest', 0.9992247223854065)]

In [63]:
words = model.wv.index2word
print(len(words))
print(words[:100])
print(model['financial'])
# for word in words:
    # print(word,model[word])

27
['e', 't', 'a', 'i', 'o', 'n', 'r', 's', 'c', 'd', 'l', 'h', 'u', 'p', 'm', 'f', '_', 'g', 'y', 'b', 'v', 'w', 'x', 'k', 'q', 'j', 'z']


  after removing the cwd from sys.path.


KeyError: ignored