In [1]:
import pandas as pd
english_frequency = pd.read_csv("data/google-books-common-words.txt", sep="\t", header=None, names=["word", "count"])

In [2]:
english_frequency.word.describe()

count     97563
unique    97563
top         THE
freq          1
Name: word, dtype: object

In [3]:
english_frequency.word = english_frequency.word.str.lower()

In [4]:
english_frequency

Unnamed: 0,word,count
0,the,53097401461
1,of,30966074232
2,and,22632024504
3,to,19347398077
4,in,16891065263
...,...,...
97560,oilmen,100016
97561,evang,100011
97562,xxl,100009
97563,losey,100008


Loading GloVe tokens

In [5]:

with open("data/glove.840B.300d.wordsonly.txt") as f:
    tokens = f.read()
tokens = tokens.split("\n")


Because my intent is to find whether the GloVe 840B 300d file
contains case variations, I match both dataset normalizing the GloVe column.

For each token in the most frequent token list, I match it with the lowercased GloVe tokens
and add the index of the frequent token pd.Series.
Instead of performing an iterative search which would be really inefficient I just merge by matching lower cased text.

In [6]:
import pandas as pd
import numpy as np
glove_df = pd.DataFrame({"words": tokens})
glove_df["lowered"] = glove_df.words.str.lower()

First lets compute a simple insightful statistics.

In [7]:
glove_df["lowered"].duplicated().sum()

493091

In [8]:
lower_merge = pd.merge(english_frequency, glove_df, left_on="word", right_on="lowered", how="inner")

In [9]:
lower_merge.sort_values("lowered")

Unnamed: 0,word,count,words,lowered
28,a,15310087895,a,a
29,a,15310087895,A,a
26847,aa,7570778,AA,aa
26848,aa,7570778,aa,aa
26849,aa,7570778,Aa,aa
...,...,...,...,...
231234,zymogen,123387,zymogen,zymogen
178901,zz,222214,zZ,zz
178900,zz,222214,Zz,zz
178899,zz,222214,zz,zz


In [10]:
lower_merge.word.unique().shape[0], english_frequency.shape[0]

(97065, 97565)

As we can see from upward notebook cell, most words from common words are kept.
To simplify the analysis and computations, we are going to compute
a count plot of a subset of GloVe tokens contained in this merge set keeping
the cased an uncased tokens. Removing stopwords and words smaller than 3 letters

In [11]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
lower_merge = lower_merge.loc[~lower_merge.lowered.isin(stops)]
greater_3_df = lower_merge.loc[lower_merge.lowered.str.len() > 3]

In [13]:
import plotly.graph_objects as go

from plotly.offline import iplot

N = int(lower_merge.shape[0] / 90)
top = greater_3_df.nlargest(N, "count")
print(N)
fig = go.Bar(x=top.words, y=top["count"])
iplot([fig])
# lower_merge.set_index("words")["count"].plot.bar()

2797


In [14]:
word_sample = ["python", "university", "nlp", "ai"]
lower_merge.loc[lower_merge.lowered.isin(word_sample)]

Unnamed: 0,word,count,words,lowered
1356,university,267605414,University,university
1357,university,267605414,university,university
1358,university,267605414,UNIVERSITY,university
1359,university,267605414,UNiversity,university
26169,ai,7884921,ai,ai
26170,ai,7884921,Ai,ai
26171,ai,7884921,AI,ai
26172,ai,7884921,aI,ai
93916,python,882471,Python,python
93917,python,882471,python,python


In [15]:
sub = greater_3_df.loc[~greater_3_df.lowered.duplicated(keep=False)].loc[greater_3_df.words.str.istitle()].sort_values("count")
sub

Unnamed: 0,word,count,words,lowered
252608,westergaard,100005,Westergaard,westergaard
252589,naguib,100022,Naguib,naguib
252563,longmore,100057,Longmore,longmore
252560,ekiti,100063,Ekiti,ekiti
252512,imprimis,100098,Imprimis,imprimis
...,...,...,...,...
59781,microbiol,2051835,Microbiol,microbiol
59449,miiller,2072422,Miiller,miiller
57861,immunol,2170431,Immunol,immunol
57796,obstet,2173466,Obstet,obstet


Lots of those words are family names. It is not really insightful.
I use Named Entity recognition to remove family names.


In [16]:
import spacy
tokenizer = spacy.load("en_core_web_sm")
tokenizer.get_pipe("ner").labels



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

2022-12-14 09:53:14.532191: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 09:53:14.607063: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-14 09:53:14.607242: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [17]:
col = []
for i, doc in enumerate(tokenizer.pipe(sub.words, )):
    token = doc[0]
    if token.ent_type_ in set(("PERSON", "NORP")): # type string only
        col.append(True)
    else:
        col.append(False)

In [18]:
new = sub.copy()
new["NER"] = col

Let's removed the words tagged as PERSON OR NORP

In [19]:
new.loc[new["NER"] == True]

Unnamed: 0,word,count,words,lowered,NER
252347,mitya,100242,Mitya,mitya,True
252331,novatian,100258,Novatian,novatian,True
252326,abinger,100263,Abinger,abinger,True
252039,plevna,100497,Plevna,plevna,True
252024,harrowby,100508,Harrowby,harrowby,True
...,...,...,...,...,...
99146,vespasian,792357,Vespasian,vespasian,True
98854,carthaginians,796272,Carthaginians,carthaginians,True
95948,goffman,846435,Goffman,goffman,True
66608,longmans,1685730,Longmans,longmans,True


In [20]:
sample = new.loc[(new.NER == False) & (new.words.str.istitle())].sort_values("count", ascending=False).nlargest(N, "count")
iplot([go.Bar(x=sample.words, y = sample["count"])])

## What about Word2Vec pretrained embeddings ? 

In [21]:
with open("data/word2vec_text_GloVe_format.wordsonly.txt") as f:
    tokens = f.read()
tokens = tokens.split("\n")
word2vec = pd.DataFrame({"words":tokens})

word2vec["lowered"] = word2vec.words.str.lower()
lower_merge = pd.merge(english_frequency, word2vec, left_on="word", right_on="lowered", how="inner")

In [22]:
lower_merge = lower_merge.loc[~lower_merge.lowered.isin(stops)]
greater_3_df = lower_merge.loc[lower_merge.lowered.str.len() > 3]

In [23]:
sub = greater_3_df.loc[~greater_3_df.lowered.duplicated(keep=False)].loc[greater_3_df.words.str.istitle()].sort_values("count")
sub

Unnamed: 0,word,count,words,lowered
188442,westergaard,100005,Westergaard,westergaard
188441,losey,100008,Losey,losey
188438,evang,100011,Evang,evang
188433,centralising,100018,Centralising,centralising
188430,naguib,100022,Naguib,naguib
...,...,...,...,...
23131,neighbouring,7981652,Neighbouring,neighbouring
23022,practised,8043983,Practised,practised
22302,prussia,8443290,Prussia,prussia
21456,fibres,8996310,Fibres,fibres


In [24]:
col = []
for i, doc in enumerate(tokenizer.pipe(sub.words, )):
    token = doc[0]
    if token.ent_type_ in set(("PERSON", "NORP")): # type string only
        col.append(True)
    else:
        col.append(False)

In [25]:
new = sub.copy()
new["NER"] = col

In [26]:
# sample = sub[["count", "words"]].sample(N, random_state=100).sort_values("count", ascending=False, )
sample = new.loc[(new.NER == False) & (new.words.str.istitle())].sort_values("count", ascending=False).nlargest(N, "count")
iplot([go.Bar(x=sample.words, y = sample["count"])])

In [27]:
greater_3_df.loc[greater_3_df.lowered.isin(["python", "numpy", "university", "ai", "nlp", "ml"])]

Unnamed: 0,word,count,words,lowered
790,university,267605414,University,university
791,university,267605414,university,university
792,university,267605414,UNIVERSITY,university
79597,python,882471,python,python
79598,python,882471,Python,python
79599,python,882471,PYTHON,python


In [28]:
word_sample = ["python", "university", "nlp", "ai"]
lower_merge.loc[lower_merge.lowered.isin(word_sample)]

Unnamed: 0,word,count,words,lowered
790,university,267605414,University,university
791,university,267605414,university,university
792,university,267605414,UNIVERSITY,university
23311,ai,7884921,AI,ai
23312,ai,7884921,Ai,ai
23313,ai,7884921,ai,ai
23314,ai,7884921,aI,ai
79597,python,882471,python,python
79598,python,882471,Python,python
79599,python,882471,PYTHON,python


## Conclusion

As we can see commonly used words do exist in different cases for the selected GloVe and Word2vec word embeddings.
