# __Exploratory data analysis__

Ref:
- [Exploratory Data Analysis for Natural Language Processing: A Complete Guide to Python Tools](https://neptune.ai/blog/exploratory-data-analysis-natural-language-processing-tools)
- [A Beginner’s Guide to Exploratory Data Analysis (EDA) on Text Data (Amazon Case Study)](https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/)
- [NLP Part 3 | Exploratory Data Analysis of Text Data](https://towardsdatascience.com/nlp-part-3-exploratory-data-analysis-of-text-data-1caa8ab3f79d)

Log:
- 10/31/23: started

## ___Setup___

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Project root under the user's home directory.
work_dir = Path.home().joinpath("projects", "plantbert")

# Sub-directories for input data, trained models and vocabulary files.
data_dir, model_dir, vocab_dir = (
    work_dir / sub_dir for sub_dir in ("data", "models", "vocab")
)

# Gzipped, tab-separated corpus table read by the next cell.
corpus_file = data_dir.joinpath("corpus_with_topics.tsv.gz")

In [3]:
# Load the gzip-compressed, tab-separated corpus table; the first column
# becomes the DataFrame index.
corpus_df = pd.read_csv(
    corpus_file,
    sep="\t",
    compression="gzip",
    index_col=0,
)

# Preview the first two rows.
corpus_df.head(2)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Show the (rows, columns) dimensions of the loaded corpus table.
corpus_df.shape

(421307, 12)

## ___Word counts___

### Term frequency

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# Pull the "Corpus" column out as a plain Python list
# (presumably one text document per row — confirm against the data file).
corpus     = corpus_df["Corpus"].tolist()

In [None]:
# Term-count vectorizer; no arguments are passed, so scikit-learn's defaults apply.
vect_count = CountVectorizer()

# Fit the vectorizer on the corpus and transform the corpus in one step.
# CSR matrix with word frequency
X          = vect_count.fit_transform(corpus)

In [None]:
# Show the dimensions of the count matrix: (documents, vocabulary terms).
X.shape

(421307, 563704)

In [None]:
# Terms of the fitted vocabulary, as returned by the count vectorizer.
feat_names = vect_count.get_feature_names_out()

### Check against vocabs

The vocabularies come from different dictionaries: each `*.items` file in `vocab_dir` holds one vocabulary, with one term per line.

In [None]:
def check_overlap_with_vocab(feat_names, vocab_dir):
    """Report and return the overlap between corpus features and each vocab.

    Every ``*.items`` file in ``vocab_dir`` is read as one vocabulary with one
    term per line. For each file a tab-separated summary row is printed: the
    file stem, the number of non-blank vocab lines, and the number of terms
    shared with ``feat_names``.

    Args:
        feat_names: Iterable of feature strings, e.g. the array returned by
            ``get_feature_names_out()`` of a fitted vectorizer.
        vocab_dir: ``pathlib.Path`` of the directory holding the ``*.items``
            vocabulary files.

    Returns:
        dict mapping each vocab file stem to the sorted list of terms present
        in both that vocab and ``feat_names``. Empty if no ``*.items`` file is
        found.

    Example:
        dict_inter = check_overlap_with_vocab(feat_names, vocab_dir)
    """
    feat_set = set(feat_names)

    print("Vocab\t\tN_vocab\tN_intersect")
    dict_inter = {}  # {vocab file stem: terms shared with feat_names}

    # glob() yields files in arbitrary order; sort so the report is stable.
    for vocab_file in sorted(vocab_dir.glob("*.items")):
        # splitlines() copes with both "\n" and "\r\n"; blank lines (such as
        # the one produced by a trailing newline) are not vocabulary entries.
        vocab = [
            line for line in vocab_file.read_text().splitlines() if line.strip()
        ]

        # Sorted so the result is reproducible between runs.
        intersect = sorted(feat_set.intersection(vocab))
        dict_inter[vocab_file.stem] = intersect

        print(f"{vocab_file.stem}\t{len(vocab)}\t{len(intersect)}")

    # Return the mapping: it is local to this function and would otherwise be
    # unreachable from the calling scope.
    return dict_inter

In [None]:
# NOTE(review): `dict_inter` is only ever assigned inside
# check_overlap_with_vocab (as a local), so at notebook scope this raises
# NameError unless a `dict_inter` has been bound beforehand.
dict_inter.keys()

NameError: name 'dict_inter' is not defined

In [None]:
# Features that also appear in the "vocab_mth" vocabulary.
# Author's note: mostly homonyms — words that sound the same, and are often
# spelled the same, but differ in meaning.
dict_inter["vocab_mth"]

['cipher',
 'balanced',
 'translation',
 'joule',
 'chart',
 'singleton',
 'multivariate',
 'trace',
 'intersect',
 'regression',
 'prime',
 'perigee',
 'payoff',
 'octagon',
 'pairwise',
 'mechanical',
 'information',
 'location',
 'ceiling',
 'expectation',
 'bilinear',
 'speed',
 'quartiles',
 'biometrics',
 'rv',
 'matching',
 'permutation',
 'pyramid',
 'rad',
 'particle',
 'intersection',
 'mensuration',
 'composition',
 'outlier',
 'litre',
 'divides',
 'sup',
 'jump',
 'sheet',
 'vanish',
 'autocorrelation',
 'optimal',
 'residual',
 'slope',
 'homomorphism',
 'observation',
 'infeasible',
 'inhomogeneous',
 'statistic',
 'revolve',
 'hexagon',
 'rise',
 'octahedron',
 'del',
 'sign',
 'anticlockwise',
 'height',
 'quantifier',
 'applied',
 'cycle',
 'below',
 'continuum',
 'configuration',
 'cylinder',
 'instance',
 'length',
 'component',
 'retardation',
 'decagon',
 'equivalent',
 'cube',
 'reciprocal',
 'decile',
 'contradiction',
 'trillion',
 'conformable',
 'cylindroid',

## ___Tf-Idf___

In [None]:
# Tf-Idf vectorizer that keeps the original casing, extracts 1- to 3-grams
# and uses scikit-learn's built-in English stop-word list.
# NOTE(review): with lowercase=False, capitalized stop words (e.g. "The")
# presumably will not match the built-in list — confirm this is intended.
vect_tfidf = TfidfVectorizer(lowercase=False,
                             ngram_range=(1,3),
                             stop_words="english")

In [None]:
# Fit the Tf-Idf vectorizer on the corpus and transform the corpus in one step.
X_tfidf = vect_tfidf.fit_transform(corpus)

: 

In [None]:
# Terms (n-grams) of the fitted Tf-Idf vocabulary.
feat_tfidf_names = vect_tfidf.get_feature_names_out()
# The same terms as a set.
feat_tfidf_set   = set(feat_tfidf_names)