<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/NLP_OE_Vector_Space_Representation_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
import pandas as pd
import numpy as np
import scipy
import scipy.spatial.distance

## Useful Functions

In [0]:
def euclidean(u, v): return scipy.spatial.distance.euclidean(u, v) 

## Load DataSet

In [0]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc'#,
    #'comp.graphics',
    #'sci.space',
]

dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

In [35]:
print("Len {0}".format(len(dataset.data)))

Len 857


In [0]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset.data)

In [37]:
X.shape

(857, 18089)

In [0]:
feature_name = vectorizer.get_feature_names()

In [39]:
len(feature_name)

18089

In [42]:
np.array(feature_name[14000:14050])

array(['representing', 'represents', 'repressed', 'repression',
       'reproach', 'reproduce', 'reproduced', 'reproducibility',
       'reproduction', 'repsonses', 'repuations', 'republican',
       'repudiating', 'repugnance', 'repugnant', 'repulsive', 'reputable',
       'reputation', 'repute', 'request', 'requested', 'requesting',
       'requests', 'requiem', 'require', 'required', 'requirement',
       'requirements', 'requires', 'requiring', 'reread', 'resaerch',
       'rescind', 'rescorla', 'rescue', 'research', 'researched',
       'researchers', 'researching', 'resemblance', 'resemble', 'reserve',
       'reserved', 'reside', 'resident', 'residents', 'residing',
       'resigned', 'resilience', 'resist'], dtype='<U15')

In [0]:
newsdata = {}
m, n = X.shape
for i, name in zip(range(0, n), feature_name):
  newsdata.setdefault(name, X[:, i])

In [0]:
news_df = pd.DataFrame(data=newsdata, index=list(range(0, n)))

In [12]:
len(news_df)

14593

## Convert into Observed / Expected

In [0]:
def observed_over_expected(df):
    col_totals = df.sum(axis=0)
    total = col_totals.sum()
    row_totals = df.sum(axis=1)
    expected = np.outer(row_totals, col_totals) / total
    oe = df / expected
    return oe

In [0]:
news_oe = observed_over_expected(news_df)

In [0]:
 col_totals = X.sum(axis=0)

In [0]:
total = col_totals.sum()

In [0]:
row_totals = X.sum(axis=1)

In [0]:
expected = np.outer(row_totals, col_totals) / total

In [0]:
oe = X / expected

## Probe Observed-Expected Vector

* Looks like CountVector is doing far better than Observed Expected Vector

In [48]:
oe.shape

(857, 18089)

In [50]:
feature_name.index("persecution")

12443

In [0]:
def vector_space_distance(vec_space, index_number, dense=False):
  if dense:
        vec_space = vec_space.toarray()
  w = vec_space[:, index_number]
  d_oe = np.apply_along_axis(lambda x: euclidean(x, w), 0 , vec_space )
  ind = np.argsort(d_oe)[:10]
  return np_feature_name[ind]

In [69]:
vector_space_distance(oe, 12443)

array(['persecution', 'enemy', 'prophecy', 'their', 'by', 'who', 'of',
       'to', 'and', 'the'], dtype='<U78')

In [71]:
vector_space_distance(X, 12443, True)

array(['persecution', 'persecute', 'hatred', 'boggs', 'enemy', 'missouri',
       'pose', 'knies', 'casper', 'neo'], dtype='<U78')

## Document Similarity

In [0]:
def t_vector_space_distance(vec_space, index_number, dense=False):
  if dense:
        vec_space = vec_space.toarray()
  w = vec_space[index_number, :]
  d_oe = np.apply_along_axis(lambda x: euclidean(x, w), 1 , vec_space )
  ind = np.argsort(d_oe)[:10]
  return ind

In [81]:
t_vector_space_distance(oe, 15)

array([ 15, 616, 447, 654, 205, 146,  67, 707,   6, 283])

In [82]:
t_vector_space_distance(X, 15, True)

array([ 15, 578, 534, 677, 662, 105, 312, 747, 388, 259])

In [79]:
print(dataset.data[15])

From: jmeritt@mental.mitre.org
Subject: God's promise to the righteous
Organization: UTexas Mail-to-News Gateway
Lines: 3
NNTP-Posting-Host: cs.utexas.edu

Ps.92:12: "The righteous shall flourish like the palm tree."

Isa.57:1: "The righteous perisheth, and no man layeth it to heart."

