# Silver NEAR Data Profile Tags v01

## Dependencies and Packages

### Install Dependencies

In [None]:
!ls -la

total 3558872
drwxr-xr-x 1 root root       4096 Apr 14 11:20 .
drwxr-xr-x 1 root root       4096 Apr 14 11:15 ..
drwxr-xr-x 4 root root       4096 Apr 12 13:33 .config
drwx------ 5 root root       4096 Apr 14 11:20 drive
-rw-r--r-- 1 root root 3644258522 Apr 14 11:17 GoogleNews-vectors-negative300.bin
drwxr-xr-x 1 root root       4096 Apr 12 13:34 sample_data


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import Packages

In [None]:
import glob
import json
import os
import re

from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np
pd.options.display.max_columns = None

## Preprocessing

In [None]:
import re

import contractions
import demoji
import nltk
import tqdm


def normalize_document(doc):
    """Normalize one text document into a lowercase ASCII-alphanumeric string.

    Steps, in order: newlines, tabs and carriage returns become spaces; the
    text is lower-cased; contractions are expanded with ``contractions.fix``;
    every character other than ASCII letters, digits, ``-`` and whitespace is
    replaced by a space; runs of spaces are collapsed; the result is stripped.

    Args:
        doc: The raw text to normalize (``str``).

    Returns:
        The normalized string.
    """
    doc = doc.translate(str.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = contractions.fix(doc)
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I | re.A` positionally capped the
    # number of replacements at 258 and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9-\s]', ' ', doc, flags=re.I | re.A)
    doc = re.sub(' +', ' ', doc)
    # Raw string keeps the same pattern (a literal `{}"`) without relying on
    # invalid string escape sequences.
    doc = re.sub(r'\{\}\"', '', doc)
    doc = doc.strip()

    return doc

def normalize_corpus(docs):
    """Normalize every document in ``docs`` with ``normalize_document``.

    Prints the type of ``docs`` and shows a tqdm progress bar while iterating.

    Args:
        docs: Iterable of raw text documents.

    Returns:
        A list with the normalized documents, in input order.
    """
    print(type(docs))
    return [normalize_document(raw_doc) for raw_doc in tqdm.tqdm(docs)]


def clean_profile_column(series):
    """Parse a column of JSON-encoded profiles and drop the empty ones.

    Each non-null cell is decoded with ``json.loads`` and its entries whose
    value is ``None`` are removed. Null cells become ``None``. Rows whose
    result is falsy (``None`` or an empty dict) are filtered out.

    Args:
        series: pandas Series of JSON object strings (may contain nulls).

    Returns:
        A Series of dicts restricted to the rows that still hold at least one
        entry; the original index labels of those rows are kept.
    """
    def parse_profile(raw):
        # Missing cells have nothing to decode.
        if pd.isnull(raw):
            return None
        decoded = json.loads(raw)
        return {key: value for key, value in decoded.items() if value is not None}

    parsed = series.apply(parse_profile)
    # None and {} are both falsy, so one truthiness mask removes both.
    has_entries = parsed.apply(bool)

    return parsed[has_entries]

# Working with silver_near_social_txs_parsed table

In [None]:
# Folder holding the exported query results.
path = '/content/drive/MyDrive/DataScienceData/Pagoda-NEAR/'
# CSV export of the tags query; `path` already ends with a slash.
data_path = f"{path}Query_tags_2023_04_13.csv"

In [None]:
# Load the exported tags query into a DataFrame and preview the first rows.
tags = pd.read_csv(data_path)
tags.head()

Unnamed: 0,signer_id,profile,block_timestamp
0,metapool-official.near,"{""liquid-staking"":"""",""near-protocol"":"""",""near""...",1675109895737903584
1,metapool-official.near,"{""aurora"":""""}",1675109939228073527
2,metapool-official.near,"{""liquid-staking"":"""",""near-protocol"":"""",""near""...",1675109895737903584
3,metapool-official.near,"{""aurora"":""""}",1675109939228073527
4,obukhova.near,"{""web3"":"""",""crypto"":"""",""privacy"":"""",""community...",1675124033638930781


In [None]:
# Remove exact duplicate rows, then rows without a profile payload. The
# trailing .copy() yields an independent frame for the later column edits.
df = (
    tags
    .drop_duplicates()
    .dropna(subset="profile")
    .copy()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1955 entries, 0 to 3023
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   signer_id        1955 non-null   object
 1   profile          1955 non-null   object
 2   block_timestamp  1955 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 61.1+ KB


In [None]:
# clean_profile_column returns only the rows with a non-empty profile; index
# alignment leaves the other rows as NaN, and dropna then removes them.
df = (
    df
    .assign(profile=clean_profile_column(df['profile']))
    .dropna(subset="profile")
    .copy()
)

In [None]:
# The tags are the keys of each profile dict; normalize the string keys and
# then discard the raw profile column.
df['tags'] = df['profile'].apply(
    lambda profile: [
        normalize_document(key) if isinstance(key, str) else key
        for key in profile.keys()
    ]
)
df = df.drop(columns=["profile"])

In [None]:
# Display the frame now that the profile column has been replaced by `tags`.
df

Unnamed: 0,signer_id,block_timestamp,tags
0,metapool-official.near,1675109895737903584,"[liquid-staking, near-protocol, near, staking,..."
1,metapool-official.near,1675109939228073527,[aurora]
4,obukhova.near,1675124033638930781,"[web3, crypto, privacy, community, decentraliz..."
5,daoplanet.near,1675199846173309175,"[community, dao, education, onboarding, events]"
6,minorityprogrammers.near,1675167659888643164,"[rust, developer, dao, developer-governance, d..."
...,...,...,...
3012,koustav.near,1667619560148378089,"[learner, near, tester]"
3017,psalm.near,1668633217817337126,"[web3, nft, defi, writer]"
3018,luluca_l.near,1668636583644156265,"[non-artist, chaotic, popcorn, maker]"
3022,scopalaffairs.near,1681424140795202125,"[blockchain, developer]"


In [None]:
# Collect every tag list recorded for a signer into one list per signer
# (a list of lists at this point).
grouped = (
    df
    .groupby('signer_id')['tags']
    .apply(list)
    .reset_index(name='aggregated_tags')
)
grouped

Unnamed: 0,signer_id,aggregated_tags
0,0180e9e024d226a21664d2a010b9525187aca997c863d8...,"[[near, web3, crypto, nft-artist]]"
1,0328daf040bd979a99ad3cf31a28e94ec6ffa9b62d7e04...,"[[learner, crypto, web-3]]"
2,04fcb9f7ca9e47866352164ca456b0bd0cf759449188ee...,"[[web3, founder, nft, near]]"
3,0652837f7b4cfc28460b45d7352dfe400c504b506557f4...,[[developer]]
4,089f197a5d6ae1d7d06d2e21853016f3536d629cbce2ee...,"[[ai-art, bitcoin, blockchain, crypto, cryptog..."
...,...,...
1659,zhasik.near,"[[web3, near]]"
1660,zjunior.near,[[learner]]
1661,zpoken.near,"[[near, rust, zk, cryptography, engineer, deve..."
1662,zubairansari.near,"[[aurora, auroraisnear, auroraonelove, lovedac..."


In [None]:
def dissolve_double_list(lst):
    """Flatten one level of nesting from ``lst``.

    Strings and bytes are treated as atomic elements and kept whole, so
    calling the function on an already flat list of tags returns it unchanged
    instead of splitting every tag into single characters. This makes the
    function safe to apply more than once.

    Args:
        lst: Iterable whose items are iterables (e.g. a list of tag lists)
            and/or strings.

    Returns:
        A new flat list with the elements in their original order.

    Raises:
        TypeError: If an item is neither a string/bytes nor iterable.
    """
    flat = []
    for item in lst:
        if isinstance(item, (str, bytes)):
            # Already a leaf value: do not iterate over its characters.
            flat.append(item)
        else:
            flat.extend(item)
    return flat

# groupby(...).apply(list) left each cell as a list of per-row tag lists;
# flatten it so that every signer has a single list of tags.
grouped["aggregated_tags"] = grouped["aggregated_tags"].apply(dissolve_double_list)

In [None]:
# Spot-check the aggregated tags of a single signer.
grouped.loc[grouped["signer_id"].eq("scopalaffairs.near")]

Unnamed: 0,signer_id,aggregated_tags
1362,scopalaffairs.near,"[artist, datascientist, research, decentraliza..."


In [None]:
# Series of per-signer tag lists, used as the corpus for vectorization.
# NOTE(review): this rebinds `tags`, which earlier held the raw CSV DataFrame,
# so the cells that derive `df` from `tags` cannot be re-run after this point
# without reloading the CSV first.
tags = grouped["aggregated_tags"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Every "document" is already a list of tags, so the analyzer hands the list
# back unchanged and each tag becomes one vocabulary term.
vectorizer = CountVectorizer(analyzer=lambda tag_list: tag_list)
# Rows follow the order of `tags`; columns are the distinct tags.
tag_matrix = vectorizer.fit_transform(tags)

In [None]:
# Pairwise cosine similarity between all rows of tag_matrix (one row per
# entry of the tag corpus).
cosine_sim = cosine_similarity(tag_matrix)

In [None]:
# Resolve the row position of the user of interest from its signer id rather
# than hard-coding the row number, which silently points at a different user
# as soon as the input data changes.
target_signer = "scopalaffairs.near"
target_positions = np.flatnonzero((grouped["signer_id"] == target_signer).to_numpy())
if target_positions.size == 0:
    raise KeyError(f"signer_id {target_signer!r} not found in grouped")
# Positional index: it is used to index cosine_sim rows and grouped.iloc.
idx = int(target_positions[0])
grouped["signer_id"].iloc[idx]

'scopalaffairs.near'

In [None]:
# Positions of the six highest similarity scores in row `idx`, best first
# (ascending argsort, reversed, then truncated).
similar_users = np.argsort(cosine_sim[idx])[::-1][:6]
print(similar_users)

[1362 1206 1145  717  702  953]


In [None]:
# Print the similarity score and the record of each neighbour, skipping the
# query user itself. The scores are read from the precomputed `cosine_sim`
# matrix instead of recomputing the full pairwise matrix on every iteration.
for i in similar_users:
    if i == idx:
        continue
    score = cosine_sim[idx][i]
    print(score)
    print(grouped.iloc[i])

0.5345224838248487
signer_id          petersalomonsen.near
aggregated_tags     [artist, developer]
Name: 1206, dtype: object
0.4364357804719848
signer_id                          ntphat2634.near
aggregated_tags    [crypto, developer, blockchain]
Name: 1145, dtype: object
0.4364357804719848
signer_id           hasan31060628.near
aggregated_tags    [artist, nft, near]
Name: 717, dtype: object
0.4364357804719848
signer_id                      hackachain.near
aggregated_tags    [developer, blockchain, zk]
Name: 702, dtype: object
0.4364357804719848
signer_id                        luda.near
aggregated_tags    [nft, web3, blockchain]
Name: 953, dtype: object
