# Tagging patterns analysis using TF-IDF, cosine similarity, & Clustering

# Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the tag workbook into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook is run from.
tags = pd.read_excel("../sample_data/other_samples/tag_subgraphs.xlsx")

In [None]:
# Flatten every column of the workbook into one list of tags, skipping
# empty cells. Repeated tags are kept as-is.
tag_list = []
for c in tags.columns:
    tag_list.extend(tags[c].dropna().tolist())

# Draw a random sample of positions without replacement. Sampling without
# replacement cannot return more items than exist, so the requested size
# (50) is clamped to the number of tags actually loaded.
sample_size = min(50, len(tag_list))
tag_list_index = np.random.choice(len(tag_list), replace=False, size=sample_size)

# Tags at the sampled positions, in sampled order.
new_tag_list = [tag_list[t] for t in tag_list_index]

In [None]:
# Number of distinct tags collected from all columns (tag_list itself may
# contain repeats).
len(set(tag_list))

# TF-IDF + Cosine Similarity

In [None]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sparse
from ast import literal_eval
import time
from matplotlib import colors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import seaborn as sns
import matplotlib.pyplot as plt

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
def tfidf_names(names):
    """Fit a character n-gram TF-IDF vectorizer on ``names``.

    No similarity is computed here: the third return value is the TF-IDF
    matrix produced by ``fit_transform``, not a cosine similarity matrix.
    The elapsed wall-clock time is printed before returning.

    Args:
        names: the strings to vectorize (passed straight to
            ``TfidfVectorizer.fit_transform``).

    Returns:
        A tuple ``(vectorizer, preprocessor, tfidf_matrix)`` holding the
        fitted ``TfidfVectorizer``, the callable returned by its
        ``build_preprocessor()``, and the output of ``fit_transform(names)``.
    """
    started = time.time()
    vectorizer = TfidfVectorizer(
        analyzer="char",      # features are character n-grams, not words
        ngram_range=(2, 4),   # n-gram lengths from 2 to 4
        max_df=1.0,           # no upper document-frequency cut-off
        min_df=1,             # no lower document-frequency cut-off
        binary=False,         # keep term counts rather than 0/1 presence
        use_idf=True,         # apply inverse-document-frequency weighting
        smooth_idf=True,
        norm=None,            # no normalization is requested here
    )
    tfidf_matrix = vectorizer.fit_transform(names)
    preprocessor = vectorizer.build_preprocessor()
    elapsed = time.time() - started
    print("Match COMPLETED IN:", elapsed)
    return vectorizer, preprocessor, tfidf_matrix

def handle_tfidf_score(tfidf1, tfidf2):
    """Convert two sets of TF-IDF scores into cosine similarity scores.

    Thin wrapper: the result is exactly ``cosine_similarity(tfidf1, tfidf2)``.
    The original author flagged this step as RAM intensive.
    """
    return cosine_similarity(tfidf1, tfidf2)

In [None]:
# Vectorize the sampled tags; only the TF-IDF matrix is needed here.
_, _, CSM_tags = tfidf_names(new_tag_list)

# Turn the tag-vs-tag similarity into a distance (1 - similarity), then
# clamp every distance below 0.01 to exactly 0.
cosine_score = 1 - handle_tfidf_score(CSM_tags, CSM_tags)
cosine_score = np.where(cosine_score < 0.01, 0, cosine_score)

# Draw Dendrogram

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import seaborn as sns

In [None]:
# Pass the distance matrix through squareform, then build a single-linkage
# hierarchy from the result.
# NOTE: cosine_score is rebound to the linkage output, so re-running this
# cell would feed that output back into squareform.
cosine_score = linkage(squareform(cosine_score), "single")

In [None]:
from pathlib import Path

# savefig does not create missing folders, so make sure the output
# directory exists before plotting.
output_path = Path("tag/dendrogram_tag.png")
output_path.parent.mkdir(parents=True, exist_ok=True)

fig, ax1 = plt.subplots(figsize=(20, 20))
# cosine_score is passed as the linkage input; leaves are labelled with the
# sampled tags and the tree is drawn with orientation "left".
dendrogram(cosine_score, labels=new_tag_list, orientation="left")
plt.yticks(fontsize=20)
plt.title("Dendrogram of Tags with Cosine Similarity as Distance Score", fontsize=25)
plt.savefig(output_path, bbox_inches='tight')

In [None]:
# Display the current value of cosine_score. Once the linkage cell has run,
# this is the linkage output rather than a similarity matrix.
cosine_score