In [17]:
import json
import re
import operator
import string


# Class to append value in a dictionary
class MyDictionary(dict):

    # __init__ function
    def __init__(self):
        self = dict()

    # Function to add key:value
    def add(self, key, value):
        self[key] = value


class Relevancy(object):
    def __init__(self, file_name, tag):
        self.tag = tag
        self.file_name = file_name
        with open(self.file_name, "r") as f:
            self.data = json.load(f)

    # Python3 code to pre-processing the string
    # Intending to remove all punctuation and common words
    def pre_process(self, str):
        # removing punctuation ---> using string module
        translator = str.maketrans('', '', string.punctuation)
        str = str.translate(translator)

        # converting all into lower cases
        str = str.lower()
        
        # removing numbers and digits
        # str = ''.join([i for i in str if not i.isdigit()])
        regex = re.compile('[^a-zA-Z]')
        #First parameter is the replacement, second parameter is your input string
        str = regex.sub(' ', str)

        # removing prep, conj, articles ---> using re module
        str = re.sub('(\s+)(a|an|and|the|this|that|these|those|i|would|could|should|m|ve)(\s+)', ' ', str)
        str = re.sub('(\s+)(to|for|from|in|into|under|with|within|below|up|down|of|on|s|t)(\s+)', ' ', str)
        str = re.sub('(\s+)(are|may|by|as|we|or|it|be|which|the|when|make|no|set|your|its|it\'s)(\s+)', ' ', str)
        str = re.sub('(\s+)(if|any|used|all|has|have|new|data|at|code|node|state|-|they|our)(\s+)', ' ', str)
        str = re.sub('(\s+)(you|must|every|each|not|what|one|then|way|so|will|also|is|can|"|")(\s+)', ' ', str)
        str = re.sub('(\s+)(their|was|more|other|use|do|need|my|some|get|out|many|had|here|over)(\s+)', ' ', str)
        return str

    # Python3 code to find frequency of each word
    # function for calculating the frequency
    def freq(self, str):
        str = self.pre_process(str)

        # break the string into list of words
        str_list = str.split()

        # gives set of unique words
        unique_words = set(str_list)
        frequency = MyDictionary()
        for word in unique_words:
            frequency.add(word, str_list.count(word))

        # sort by value (downwards) ---> using operator module
        sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)

        # collect first 10 items in dictionary
        # first_few_items = {k: sorted_freq[k] for k in list(sorted_freq)[:10]}
        # btw, sorted_freq is a list
        return sorted_freq[0:20]  # , len(sorted_freq)

    # merge two lists toether without appending same word
    # if same word found, increases count
    def merge_two_lists(self, a, b):
        new_list = {}
        for pair in a + b:
            key, value = pair
            new_list[key] = new_list.get(key, 0) + value
        new_list = [[key, value] for key, value in new_list.items()]
        return new_list

    # convert a tuple into a list
    def tuple_to_list(self, listname):
        a = []
        for i in range(0, len(listname)):
            a.append(list(listname[i]))
        return a

    # For sorting a list of lists using 2nd item
    def second_item(self, item):
        return item[1]

    # for getting a list of words for a particular tag
    def list_of_most_used_words(self):
        freq_list = []
        for key in self.data:
            if self.tag in key['title'].lower():
                str = key['content']
                freq_list = self.merge_two_lists(freq_list, self.tuple_to_list(self.freq(str)))

        freq_list.sort(key=self.second_item, reverse=True)
        reduced_list = []
        for i in freq_list:
            if i[1] > 1000:
                reduced_list.append(i)
        return reduced_list

    # Return number of Relevant posts
    def relevant_post(self):
        relevant = 0
        for key in self.data:
            if self.tag in key['title'].lower():
                relevant += 1
        return relevant

In [25]:
# application code
if __name__ == "__main__":
    file_name = "ethereum.json"
    tag = 'ethereum'
    keyword = Relevancy(file_name, tag)
    print("Number of Relevant post is:" + str(keyword.relevant_post()))
    print(keyword.list_of_most_used_words())

Number of Relevant post is:373
[['ethereum', 4558], ['blockchain', 1641], ['contract', 1259], ['but', 1057]]


## Relevancy Search in percentage

In [14]:
import json
from math import floor
count = 0
file_name = "data/ethereum.json"
with open(file_name, "r") as f:
    data = json.load(f)
relevant = 0
for key in data:
    if 'ethereum' in key['title'].lower():
        relevant += 1
for key in data:
    count += 1      
p = floor(relevant/count * 100)
print(relevant)
print(count)

475
485


## Better relevancy search using finding tags

In [13]:
import json
from math import floor
count = 0
file_name = "data/ethereum.json"
with open(file_name, "r") as f:
    data = json.load(f)
relevant = 0
for key in data:
    count += 1
    tags = key['tags']
    for item in tags: 
        if 'ethereum' in item.lower():
            relevant += 1
print(relevant)
print(count)

482
485


In [None]:
# input all relevant posts into a single file

In [44]:
import json
final_count = 0
final_json_data = []

file_name_list = ["ethereum.json", "blockchain.json", "smart-contract.json",
                      "solidity.json", "vyper.json", "ripple.json",
                      "remix.json", "metamask.json", "bitcoin.json"]
tag_list = ["ethereum", "blockchain", "smart contract", "solidity", "vyper", "ripple",
                "remix", "metamask", "bitcoin"]

for file_name, tag in zip(file_name_list, tag_list):

    json_data  = json.load(open(filename))

    for key in json_data:
        tags = key['tags']
        for item in tags: 
            if tag in item.lower():
                final_json_data.append(key)
                count += 1
    print("Number of total post for ", tag, "is =", count)

    # Output the updated file with pretty JSON                                      
open("final_all_post_data.json", "w").write(
        json.dumps(final_json_data, sort_keys=True, indent=4, separators=(',', ': '))
    )
print("The number of total post is: ", count)

In [20]:
import json
data  = json.load(open("_test_.json"))
new = []

for key in data:
    count += 1
    tags = key['ename']
    for item in tags: 
        if 'egg' in item.lower():
            new.append(key)

new = json.dumps(new)
new

'[{"ename": ["mark", "egg"], "url": "Lennon.com"}, {"ename": ["egg"], "url": "Lennon.com"}]'

In [None]:
file_tag = {"ethereum-390.json":"ethereum", "vyper-272.json":"vyper",
                "bitcoin-281.json":"bitcoin", "smart-contract-369":"smart contract",
                "blockchain-351.json":"blockchain", "solidity-374":"solidity"}
for x,y in file_tag.items():
    print(x, y)

## Posts over time

In [156]:
#year_list = ['2019', '2018', '2017', '2016', '2015', '2014']
#month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
            # 'August', 'September', 'October', 'November', 'December']
freq = {"2019":0, "2018":0, "2017":0, "2016":0, "2015":0, "2014":0, "missing":0}
date_list = ['27 April, 2018', '17 December', '12 May, 2017', '17 January',
            '16 June, 2018', '16 April', '', '16 April', '11 May, 2016', '', '17 June, 2018',
             '11 May, 2016', '27 April, 2015', '16 April']
for date in date_list:
    if ',' in date:
        y = date.split(',')[1]
        y = y.lstrip()
        freq[y] += 1
    elif date=='':
        freq["missing"] += 1
    else:
        freq["2019"] += 1
        
freq

{'2019': 5,
 '2018': 3,
 '2017': 1,
 '2016': 2,
 '2015': 1,
 '2014': 0,
 'missing': 2}

In [23]:
a = '2016-11-27'
a = a.split("-")
a

['2016', '11', '27']

In [None]:
# Panda DataFrame for Medium Posts
file_name = "metamask-350.json"
dates = []
with open(file_name, "r") as f:
    data = json.load(f)
import pandas as pd
df = pd.DataFrame.from_dict(data, orient='columns')
#df[['post_date', 'upvotes']]
for key in data:
    dates.append(key['post_date'])
if dates[26]=='':
    print(True)

In [29]:
te = [
      {
        "Name": "Bala",
        "phone": "None"
      },
      {
        "Name": "Bala",
        "phone": "None"
      },
      {
        "Name": "Bala",
        "phone": "None"
      },
      {
        "Name": "Bala",
        "phone": "None"
      },
      {
          "Name": "Bala1",
          "phone": "None"
      }      
    ]

unique = { each['Name'] : each for each in te }
a = list(unique.values())
with open("test_3.json","w") as fp:
    json.dump(a, fp)
print(a)

[{'Name': 'Bala', 'phone': 'None'}, {'Name': 'Bala1', 'phone': 'None'}]


In [32]:
with open("final_all_post_data.json","r") as f:
    data = json.load(f)
unique_post = {each['title'] : each for each in data}
listed_dictionary_items = list(unique_post.values())
with open("final_data_removing_duplicacy.json","w") as f:
    json.dump(listed_dictionary_items, f)
print("# of post after removing duplicay = ", len(listed_dictionary_items))

# of post after removing duplicay =  2156


## Associative Tag analysis

In [36]:
import json
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json","r") as f:
    data = json.load(f)
count = 0
tag_list = []
for key in data:
    tag_list.append([i for i in key['tags']])
    count += 1

# 2 ways to remove nested lists
from itertools import chain
break_nested_list = list(chain(*tag_list))
print(len(break_nested_list))
# flattened  = [val for sublist in list_of_lists for val in sublist]
unique_list_of_tags = list(set(break_nested_list))
print(len(unique_list_of_tags))

from collections import Counter
associated_with_tag = list(set([item for items in tag_list if "Solidity" in items for item in items]))
top_tags_for_tag = Counter([item for items in tag_list if "Solidity" in items for item in items])

top_tags_only = Counter([item for items in tag_list for item in items])

associated_with_smart_contract = list(set([item for items in tag_list if "Smart Contracts"\
                                           in items for item in items or "Smart Contract" in items for item in items]))
top_tags_for_smart_contracts = Counter(([item for items in tag_list if "Smart Contracts"\
                                           in items for item in items or "Smart Contract" in items for item in items]))
#print(associated_with_tag)
print(top_tags_for_smart_contracts)

5755
980
Counter({'Smart Contracts': 1783, 'Ethereum': 1417, 'Blockchain': 1128, 'Solidity': 769, 'Cryptocurrency': 229, 'Truffle': 211, 'Bitcoin': 154, 'Tutorial': 144, 'Programming': 138, 'Dapps': 118, 'ICO': 109, 'Security': 99, 'Web3': 90, 'Erc20': 69, 'Development': 65, 'Technology': 55, 'Crypto': 53, 'Blockchain Development': 50, 'Neo': 44, 'Vyper': 44, 'JavaScript': 43, 'Token': 34, 'Tech': 30, 'Smart Contracts Tutorial': 30, 'Open Source': 25, 'Decentralization': 25, 'Blockchain Technology': 25, '이더리움': 25, '솔리디티': 25, 'Testing': 23, 'Token Sale': 20, 'Hacking': 20, 'Openzeppelin': 20, 'Gochain': 20, 'Software Development': 20, 'Audit': 19, 'Metamask': 18, 'Programming Languages': 15, 'Ethereum Blockchain': 15, 'Partnerships': 15, 'Developer Tools': 15, 'Guide': 15, 'Startup': 15, 'Altcoins': 15, 'Real Estate': 15, 'Solidity Tutorial': 15, 'Vulnerability': 15, 'Smart Contract Security': 14, 'Rsk': 14, 'Dispute Resolutions': 14, 'Remix': 14, 'Oracle': 10, 'Coding': 10, 'Python':

### Tag frequency count

In [9]:
import json
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json","r") as f:
    data = json.load(f)
    
tag_list = ["solidity", "ethereum", "blockchain", "web3", "metamask", "truffle", "remix", "token",\
            "tokens", "erc20", "vyper", "smart contract", "smart contracts", "ethereum blockchain",\
            "myetherwallet", "wallet", "dapp", "dapps",]

for i in tag_list:
    count = 0
    for key in data:
        if i in [item.lower() for item in key['tags']]:
            count += 1
            # print(key['title'])
    print(i, "=", count)

solidity = 419
ethereum = 856
blockchain = 708
web3 = 217
metamask = 192
truffle = 140
remix = 5
token = 32
tokens = 1
erc20 = 138
vyper = 26
smart contract = 3
smart contracts = 377
ethereum blockchain = 30
myetherwallet = 11
wallet = 18
dapp = 6
dapps = 100


In [45]:
import json
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json","r") as f:
    data = json.load(f)
response_list = []
for key in data:
    response_list.append(key['responses'])
    count += 1
response_freq = Counter([item for item in response_list])
print(sum(response_list))
print("Average= ", sum(response_list)/len(response_list))
print(response_freq)
# print(sorted(response_list, reverse=True))

4682
Average=  3.7819063004846525
Counter({0: 619, 1: 205, 2: 105, 3: 68, 4: 42, 5: 41, 7: 25, 8: 21, 6: 19, 9: 18, 11: 13, 10: 8, 12: 7, 14: 5, 21: 4, 17: 3, 23: 3, 13: 3, 16: 3, 18: 2, 39: 2, 153: 1, 68: 1, 24: 1, 19: 1, 22: 1, 79: 1, 20: 1, 29: 1, 40: 1, 72: 1, 165: 1, 105: 1, 63: 1, 131: 1, 684: 1, 57: 1, 26: 1, 136: 1, 15: 1, 124: 1, 128: 1, 129: 1})


### Avg claps and voters

In [26]:
import json
from collections import Counter
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json","r") as f:
    data = json.load(f)
clap_list = []
voter_list = []
data_dict = dict()
for key in data:
    clap_list.append(key['claps'])
    voter_list.append(key['voters'])
for i,j in zip(clap_list,voter_list):
    data_dict[i] = j

new_clap = []
new_voter = []
for key in data:
    for i in key['tags']:
        if "Blockchain" in i or "Tokens" in i:
            new_clap.append(key['claps'])
            new_voter.append(key['voters'])
            
print("max clap:", max(new_clap))
print("avg clap:", sum(new_clap)/len(new_clap))
print("max voter:", max(new_voter))
print("avg voter:", sum(new_voter)/len(new_voter))
print("claps per voter:", sum(new_clap)/sum(new_voter))
print("total claps:", sum(new_clap))
print("total voters:", sum(new_voter))
# print(sorted(new_clap, reverse=True))
# print(sorted(new_voter, reverse=True))
# response_freq = Counter([item for item in response_list])
# print(voter_list)
# print("Average= ", sum(response_list)/len(response_list))
# print(clap_list)
# print(data_dict)

max clap: 35690
avg clap: 823.3447432762836
max voter: 5027
avg voter: 73.19070904645477
claps per voter: 11.24930683146818
total claps: 673496
total voters: 59870


## Vulnerability Search

In [54]:
import json
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json") as f:
    post_data = json.load(f)

word_list = ["security", "vulnerability", "vulnerabilities", "reentrancy", "re entrancy", "re-entrancy", "race condition",\
             "denial of service", "DoS", "transaction order", "transactions order", "trasaction order depend",\
             "transaction_ordering_depend", "timestamp dependence", "integer overflow", "integer underflow"]
# word_list = ["transaction order"]
for i in word_list:
    count = 0
    for key in post_data:
        if i in key['content'] or i in key['title'].lower() or i in key['tags']:
            count += 1
            # print(key['title'])
    print(i, "=", count)

security = 328
vulnerability = 49
vulnerabilities = 73
reentrancy = 15
re entrancy = 0
re-entrancy = 7
race condition = 6
denial of service = 7
DoS = 13
transaction order = 8
transactions order = 0
trasaction order depend = 0
transaction_ordering_depend = 0
timestamp dependence = 0
integer overflow = 14
integer underflow = 2


In [36]:
sec_tool_list = ["mythril", "mythx", "mythos", "oyente", "solhint", "solium", "ethlint",\
                 "securify", "teether", "smartcheck", "manticore", "sonarsolidity", "ethir",\
                 "maian", "solcheck", "solgraph", "solint", "vandal", "contractfuzzer",\
                 "rattle", "sasc", "zeus", "contractlarva", "echinda", "ethertrust", "fsolidm",\
                 "octopus", "osiris", "reguard", "scompile", "slither", "surya", "sūrya", "verisolid",\
                 "verx", "vultron"]
# sec_tool_list = ["sūrya"]
for i in sec_tool_list:
    count = 0
    for key in post_data:
        if (i in key['content'].lower() or i in key['title'].lower() or i in key['tags']) \
                            and ("transaction ordering" in key['content'].lower() or i in key['title'].lower()):
                                 #or ("timestamp-depend" in key['content'].lower() or i in key['title'].lower())):
            count += 1
            # print(key['title'])
    print(i, "=", count)

mythril = 0
mythx = 4
mythos = 0
oyente = 0
solhint = 1
solium = 0
ethlint = 0
securify = 0
teether = 0
smartcheck = 1
manticore = 0
sonarsolidity = 0
ethir = 0
maian = 0
solcheck = 0
solgraph = 0
solint = 0
vandal = 0
contractfuzzer = 0
rattle = 0
sasc = 0
zeus = 0
contractlarva = 0
echinda = 0
ethertrust = 0
fsolidm = 0
octopus = 0
osiris = 0
reguard = 0
scompile = 0
slither = 1
surya = 0
sūrya = 0
verisolid = 0
verx = 0
vultron = 0


## Number of mentions of popular tags

In [73]:
tags = ["solidity", "goethereum", "go-ethereum", "web3", "web3js", "contract development", "blockchain", "ethereum", "truffle",\
        "transaction", "transactions", "remix", "contract design", "token", "tokens", "ether", "erc20", "erc-20",\
        "metamask", "mining", "mine", "javascript", "private blockchain", "wallet", "wallets", "gas", "parity", "parities"]

for i in tags:
    count = 0
    for key in post_data:
        counter = Counter(key['content'].lower().split(" "))
        count += counter[i]
            # print(key['title'])
    print(i, "=", count)

solidity = 1697
goethereum = 0
go-ethereum = 10
web3 = 886
web3js = 17
contract development = 0
blockchain = 2303
ethereum = 3389
truffle = 1790
transaction = 1460
transactions = 725
remix = 194
contract design = 0
token = 2578
tokens = 1894
ether = 835
erc20 = 693
erc-20 = 195
metamask = 1524
mining = 133
mine = 53
javascript = 346
private blockchain = 0
wallet = 1057
wallets = 188
gas = 1079
parity = 156
parities = 0


### Reading time

In [52]:
import json
with open("../conference_medium_data/author_data/related_data_rm_duplicacy.json") as f:
    post_data = json.load(f)
    
read_list = []
for key in post_data:
    read_list.append(key['readtime'])

_0_2 = 0
_2_5 = 0
_5_10 = 0
_10_15 = 0
_15 = 0
for i in read_list:
    i = float(i)
    if i<2:
        _0_2 += 1
    elif i>=2 and i<5:
        _2_5 += 1
    elif i>=5 and i<10:
        _5_10 += 1
    elif i>=10 and i<15:
        _10_15 += 1
    else:
        _15 += 1

read_count = [_0_2, _2_5, _5_10, _10_15, _15]
read_count

[300, 556, 295, 58, 29]

In [45]:
import math
round(1.4)

1

In [51]:
a = '1.54'
a = float(a)
a

1.54