<a href="https://colab.research.google.com/github/thomouvic/txtanalytics/blob/main/doc_sim_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preparation

In [None]:
!wget https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/abcnews/abcnews-date-text.csv.gz
!gzip -d abcnews-date-text.csv.gz

In [2]:
import pandas as pd

# Read the decompressed headlines file; the publish_date column is parsed as dates.
headlines = pd.read_csv(
    "abcnews-date-text.csv",
    parse_dates=["publish_date"],
)
# Preview the first rows.
headlines.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


# TF-IDF

In [3]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print('Number of bytes', dt.data.nbytes)
dt



Number of bytes 44856904
CPU times: user 15.3 s, sys: 1.62 s, total: 16.9 s
Wall time: 19.4 s


<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

## Lemmas

In [4]:
# Disabled (optional) step: lemmatize every headline with spaCy.
# If re-enabled, it adds two columns to `headlines`:
#   "lemmas" - the lemma of every token, joined with spaces
#   "nav"    - only the lemmas whose part-of-speech tag is in the list below
#              (the list also contains PROPN and ADV, despite its name)
# NOTE(review): this calls the spaCy pipeline once per row over the whole frame,
# which is presumably slow on the full dataset - confirm before enabling.
# from tqdm.auto import tqdm
# import spacy
# nlp = spacy.load("en_core_web_sm")
# nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
# for i, row in tqdm(headlines.iterrows(), total=len(headlines)):
#     doc = nlp(str(row["headline_text"]))
#     headlines.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
#     headlines.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

In [5]:
# Disabled alternative: build the TF-IDF matrix from the "lemmas" column instead of
# the raw headlines. It needs the (disabled) lemmatization cell above to have run,
# and unlike the active TF-IDF cell it does not set min_df.
# NOTE(review): recent scikit-learn releases expect stop_words to be a list;
# use list(stopwords) if this cell is re-enabled.
# tfidf = TfidfVectorizer(stop_words=stopwords)
# dt = tfidf.fit_transform(headlines["lemmas"].map(str))
# dt

# Finding the most similar documents

## Timing cosine similarity

In [6]:
%%time
from sklearn.metrics.pairwise import cosine_similarity
r = cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)

CPU times: user 32.1 ms, sys: 1.12 ms, total: 33.2 ms
Wall time: 34.2 ms


In [7]:
# Turn the sparse similarity matrix into a long-format edge list with one row
# per stored entry: (docid1, docid2, score).
import pandas as pd

m = r.tocoo()
# Build the frame straight from the COO arrays instead of zipping Python tuples.
pairs = pd.DataFrame({'docid1': m.row, 'docid2': m.col, 'score': m.data})

# Keep only pairs whose similarity reaches the threshold. The threshold is set
# so that each document has on average about 30 neighbors wrt similarity.
threshold = 0.18
is_similar = pairs['score'] >= threshold

# Drop self-similarities by comparing the document ids rather than testing
# score < 1: a floating-point self-similarity is not guaranteed to be exactly 1,
# and two different documents with identical text should remain neighbors.
is_self_pair = pairs['docid1'] == pairs['docid2']

df = pairs[is_similar & ~is_self_pair].reset_index(drop=True)
df

Unnamed: 0,docid1,docid2,score
0,0,3112,0.262105
1,0,9569,0.248576
2,0,3657,0.269154
3,0,3211,0.423668
4,0,6003,0.206512
...,...,...,...
299959,9999,818,0.238302
299960,9999,684,0.239202
299961,9999,435,0.212136
299962,9999,8626,0.363494


In [None]:
# https://stackoverflow.com/questions/74175462/attributeerror-module-scipy-sparse-has-no-attribute-coo-array
!pip install 'scipy>=1.8'

In [8]:
import networkx as nx

# Build an undirected graph with one node per document id and one edge per
# similar pair. The similarity is stored on each edge as the 'score' attribute
# so it can be used as an edge weight by downstream tools.
# (The previous empty nx.Graph() was overwritten immediately and has been removed.)
G = nx.from_pandas_edgelist(df, 'docid1', 'docid2', edge_attr='score')

# To visualize the graph use Gephi, or see: https://nightingaledvs.com/how-to-visualize-a-graph-with-a-million-nodes/ 