In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

import spacy
import nltk
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
data = pd.read_csv("data/data.csv")
abb = pd.read_csv("data/term_abb.csv")
term_def = pd.read_csv("data/term_def.csv")
data

Unnamed: 0,id,content
0,321712,Hey 👋 \n\nWe re using our bot:\n\nhttps://t.me...
1,321713,Good stuff \n\nI am surprised I took so long t...
2,321717,you are using a non-official one
3,321718,use the one that uniswap uses: https://thegrap...
4,321719,keep in mind this is a hot subgraph so it can ...
...,...,...
44131,374466,Can find it in many places\nAlso on Santiment:...
44132,374467,"guys, does anyone know if there is an applicat..."
44133,374468,Any Lobsters going to Kyiv Web3 Hackathon Sept...
44134,374469,whats funny is that no one complains about the...


## Cleaning the data 
Removing the stopwords, punctuations

In [3]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    text = str(text)
    text = text.replace("/", " ")
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
data['content'] = data['content'].apply(lambda text: cleaning_stopwords(text))
data['content'].head()

0    Hey 👋 We using bot: https: t.me lobster_watche...
1    Good stuff I surprised I took long find commun...
2                               using non-official one
3    use one uniswap uses: https: thegraph.com host...
4    keep mind hot subgraph change anytime without ...
Name: content, dtype: object

In [4]:
import string

english_punctuations = string.punctuation
punctuations_list = english_punctuations

def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
data['content']= data['content'].apply(lambda x: cleaning_punctuations(x))
data['content'].tail()

44131    Can find many places Also Santiment historical...
44132    guys anyone know application tools able check ...
44133    Any Lobsters going Kyiv Web3 Hackathon Septemb...
44134    whats funny no one complains txs rejected cens...
44135    By way think onchain analytics best user exper...
Name: content, dtype: object

In [5]:
data['content'][0]

'Hey 👋 We using bot https tme lobsterwatcher And also filtering recommendations select topics worth attention 5 people duty every day'

In [6]:
def cleaning_pipeline(text):
    tmp = cleaning_stopwords(text)
    tmp = cleaning_punctuations(tmp)
    return tmp

In [7]:
from difflib import SequenceMatcher

s = SequenceMatcher(None, 'apple', 'pineapple')
s.ratio() # This shows how much the two words are related to each other

0.7142857142857143

In [8]:
def tokenize(text):
    words = nltk.word_tokenize(text)
    return words

In [9]:
crypto_words = list(abb.terms.apply(lambda x:x.lower())) + list(term_def.terms.apply(lambda x:x.lower()))
crypto_words.append("crypto")

def get_crypto_related_words(text):
    words = tokenize(text.lower())
    
    set_of_words = set()
    for word in words:
        for i in crypto_words:
            s = SequenceMatcher(None, word, i)
            if s.ratio() >= 0.8 or i in word:
                set_of_words.add(word)
    return list(set_of_words)

In [10]:
string = cleaning_pipeline("""
We need to extract crypto related words / phrases from the conversational messages of different crypto related channels
Eg -> The phrases marked in [bold] are the ones which needs to be extracted
This is super exciting. Using deep reinforcement learning to **analyze Blockchain security** and find even better selfish **mining techniques**
By the way, which do you think is the **on-chain analytics** with the best user experience?
Preferably the ones that are self-served and that everyone on the team can use
We're launching **incentivized testnet** on **polygon** today at **tokensoft**
""")
string

'We need extract crypto related words phrases conversational messages different crypto related channels Eg  The phrases marked bold ones needs extracted This super exciting Using deep reinforcement learning analyze Blockchain security find even better selfish mining techniques By way think onchain analytics best user experience Preferably ones selfserved everyone team use Were launching incentivized testnet polygon today tokensoft'

In [11]:
get_crypto_related_words(string)

['blockchain', 'onchain', 'tokensoft', 'mining', 'crypto', 'testnet']

In [12]:
start, end = 100, 200
result, result2 = [], []
for string in data.content[start:end]:
    s = str(string).lower()
    tmp = get_crypto_related_words(s)
    result.append((s, tmp))
    if tmp:
        result2.append((s, tmp))

In [13]:
result

[('https tme nftavenue people like post without fear getting banned',
  ['nftavenue']),
 ('joke shared already but agree', []),
 ('nice update gearbox announcements thank ser ivangbi', []),
 ('https twittercom aaveaave status 1488277654315102208s21', []),
 ('https etherscanio address 0x5dd596c901987a2b28c38a9c1dfbf86fffc15d77',
  ['etherscanio', 'address']),
 ('tornado', []),
 ('guess he’s next handle be', ['next']),
 ('btw tornado cash really safe term privacy fund really can’t tracked',
  ['cash', 'fund']),
 ('8k eth withdrawal noticeable even split across wallets', ['wallets']),
 ('splitting hundred wallets around half year nobody notice i think',
  ['wallets']),
 ('reasonably safe done correctly if impatient pull much put it’s obvious there’s deniability nice but it’s nothing like privacy first tech aztec secret etc',
  []),
 ('you probability stuff but guess work protocol ensures 100 privacy', []),
 ('insane guys built generational wealth scamming people like that…', []),
 ('on te

In [14]:
result[19]

('in case investigation irs i hypothesizing condone anyone i not even sure method would work don’t no sarcasm 1 you supposedly anonymously create shitcoin call time buy actual clean wallet worth 10k usd start 2 you proceed buy dirty wallets create volumes make community bribes etc creates legitimacy around coin 3 along way people buy sell might buying someone else’s bags consider expense like 10 washing ofc stop buying point this much harder think need legitimacy around project but tldr option you might end losing much case but one options growing market even make way notional had',
 ['case', 'wallets', 'shitcoin', 'no', 'wallet'])

In [15]:
result[20]

('isn’t easier lowquality bluecryptoshark nft project',
 ['bluecryptoshark', 'nft'])

In [16]:
result[24]

('this continuation cex world pre 2017 chinese exchanges billions fake volume coinbase 20 mm',
 [])

In [17]:
result2

[('https tme nftavenue people like post without fear getting banned',
  ['nftavenue']),
 ('https etherscanio address 0x5dd596c901987a2b28c38a9c1dfbf86fffc15d77',
  ['etherscanio', 'address']),
 ('guess he’s next handle be', ['next']),
 ('btw tornado cash really safe term privacy fund really can’t tracked',
  ['cash', 'fund']),
 ('8k eth withdrawal noticeable even split across wallets', ['wallets']),
 ('splitting hundred wallets around half year nobody notice i think',
  ['wallets']),
 ('on testnet i get like thousand eth easy', ['testnet']),
 ('in case investigation irs i hypothesizing condone anyone i not even sure method would work don’t no sarcasm 1 you supposedly anonymously create shitcoin call time buy actual clean wallet worth 10k usd start 2 you proceed buy dirty wallets create volumes make community bribes etc creates legitimacy around coin 3 along way people buy sell might buying someone else’s bags consider expense like 10 washing ofc stop buying point this much harder think