In [None]:
####################### CONTRACT ADDRESSES IN USER_TXS DATASET ######################

# Read the transaction log and the list of known contract addresses.
user_transactions = pd.read_csv('dataset/user_transactions.csv')
contract_addresses = pd.read_csv('dataset/contract_addresses.csv')

# Stack the sender and receiver columns, then keep each address once.
endpoints = pd.concat([user_transactions[side] for side in ('from', 'to')])
unique_addresses = endpoints.unique()

# Keep only the contract rows whose address shows up on either side of a transaction.
is_seen_in_txs = contract_addresses['address'].isin(unique_addresses)
matches = contract_addresses[is_seen_in_txs]

# Persist the overlap for later cells / external inspection.
matches.to_csv('contract_addresses_in_user_txs.csv', index=False)

In [None]:
###################### UNIQUE CONTRACTS IN CONTRACTS_CODES.JSON ##################
# Count how many distinct source codes the combined dump contains.
# The file is parsed as JSON Lines: one JSON object per line, each with the
# keys 'source_code' and 'address'.
json_file = 'contract_parser/verified-smart-contracts/data/combined.json'

contracts = set()
counter = 0
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (source code may contain non-ASCII characters).
with open(json_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        counter += 1
        data = json.loads(line)
        contracts.add((data['source_code'], data['address']))

# Several addresses can share the same source; collapse the pairs down to the source text only.
unique_source_codes = {
    source_code for source_code, _address in tqdm(contracts, desc="Processing")
}

print(f"The number of unique source codes is {len(unique_source_codes)}")
# Number of records read from the file.
print(counter)

In [None]:
#################### LOAD PARSED CONTRACT HUGE JSON ###################
# Load the whole dump into memory and compare the number of records with the
# number of distinct contract names.
# NOTE(review): this cell parses the file as ONE JSON document (json.load),
# while the "UNIQUE CONTRACTS" cell reads the same path line by line as JSON
# Lines. Unless the file is a single-line document, only one of the two can
# succeed -- confirm the actual on-disk format.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('contract_parser/verified-smart-contracts/data/combined.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

total_contracts = len(data)
unique_contracts = len({contract['contract_name'] for contract in data})

print(f"Total number of contracts: {total_contracts}")
print(f"Number of unique contract names: {unique_contracts}")

In [None]:
############################ CREATE DF OF PARSED CONTRACTS ##########################
# Concatenate every parquet shard in the parsed-contracts directory into one DataFrame.
dir_path = 'contract_parser/verified-smart-contracts/data/parsed'
# os.listdir() returns entries in arbitrary order; sort so the row order of the
# combined frame (and tie-breaking in value_counts) is the same on every run.
files = sorted(
    os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith('.parquet')
)
if not files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No .parquet files found in {dir_path!r}")

dfs = [pd.read_parquet(f) for f in files]

parsed_contracts = pd.concat(dfs, ignore_index=True)
# Each class of a contract is stored as its own row, so contract_name is
# presumably the only key that ties the rows of one contract together.
print('columns of parsed contracts df:', parsed_contracts.columns)
class_counts = parsed_contracts['class_name'].value_counts()
# value_counts() sorts by frequency (descending), so the head is the top 10.
common_libraries = class_counts.head(10).index.tolist()
print('10 most common class_names in parsed contracts df:', common_libraries)

In [None]:
##################################### TOPIC MODELING ON CONTRACTS COMMENTS ##################################

'''
Concatenate all class_documentation strings associated with one contract,
remove common function comments, so that each document contains all comments of one contract,
then run LDA.
Common_libraries = ['ERC721A', 'SafeMath', 'IUniswapV2Pair', 'ERC20', 'IERC20', 'ERC721', 'Ownable']
'''

# NOTE(review): resetting the list to empty here means no class is ever
# filtered out below -- confirm this is intentional.
common_libraries = []

def concat_non_common_docs(docs, classes):
    """Join the documentation strings of one contract into a single document.

    Called once per contract_name group. ``docs`` and ``classes`` are parallel
    iterables; an entry is dropped when its class name is listed in the
    module-level ``common_libraries`` or when its documentation is null.
    The surviving strings are joined with a single space.
    """
    kept = []
    for text, owner in zip(docs, classes):
        if owner in common_libraries:
            continue
        if pd.notnull(text):
            kept.append(text)
    return ' '.join(kept)

# Build one document per contract by joining the documentation of all its classes.
# Only the two columns the helper reads are selected, so the grouping column is
# never handed to apply() (avoids the pandas >= 2.2 deprecation warning).
contract_docs = (
    parsed_contracts
    .groupby('contract_name')[['class_documentation', 'class_name']]
    .apply(lambda x: concat_non_common_docs(x['class_documentation'], x['class_name']))
    .reset_index(name='class_documentation')
)

print(len(contract_docs))

# Tokenisation pipeline: lower-case, strip markup tags, strip punctuation.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]

processed_contracts = [preprocess_string(doc, CUSTOM_FILTERS) for doc in contract_docs['class_documentation']]

# Create the dictionary and corpus required for LDA
dictionary = corpora.Dictionary(processed_contracts)
# Drop tokens found in fewer than 10 documents or in more than 10% of all documents.
dictionary.filter_extremes(no_below=10, no_above=0.1)  # adjust these parameters

corpus = [dictionary.doc2bow(contract) for contract in processed_contracts]

num_topics = 15  # Choose an appropriate number
# LDA training is stochastic; fix the seed so topics are reproducible across runs.
# The output recorded in the string below this code was produced without a fixed
# seed, so a fresh run will not match it exactly.
LDA_RANDOM_STATE = 42
lda = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=LDA_RANDOM_STATE,
)

for i in range(num_topics):
    print(f"Topic #{i}: ", lda.print_topic(i))

'''
29488
Topic #0:  0.067*"assumes" + 0.038*"support" + 0.038*"burning" + 0.038*"uint128" + 0.038*"mints" + 0.037*"e" + 0.036*"g" + 0.036*"at" + 0.036*"gas" + 0.036*"minted"
Topic #1:  0.071*"enumerable" + 0.067*"but" + 0.060*"erc721enumerable" + 0.059*"separately" + 0.047*"type" + 0.043*"collection" + 0.041*"related" + 0.032*"compliant" + 0.031*"optional" + 0.025*"ids"
Topic #2:  0.058*"note" + 0.054*"over" + 0.053*"8" + 0.048*"compiler" + 0.048*"now" + 0.048*"needed" + 0.046*"checking" + 0.043*"overflow" + 0.043*"wrappers" + 0.043*"arithmetic"
Topic #3:  0.031*"notice" + 0.028*"author" + 0.010*"mint" + 0.009*"7" + 0.009*"minting" + 0.009*"nft" + 0.008*"time" + 0.008*"we" + 0.007*"at" + 0.007*"users"
Topic #4:  0.098*"erc721a" + 0.069*"order" + 0.065*"optimized" + 0.062*"ids" + 0.062*"sequential" + 0.061*"assumptions" + 0.026*"hashing" + 0.023*"merkle" + 0.021*"proofs" + 0.019*"test"
Topic #5:  0.027*"events" + 0.026*"t" + 0.026*"added" + 0.026*"applications" + 0.026*"how" + 0.026*"mechanisms" + 0.025*"ierc20" + 0.014*"openzeppelin" + 0.014*"allowance" + 0.014*"instead"
Topic #6:  0.040*"overflow" + 0.029*"bugs" + 0.028*"o" + 0.025*"arithmetic" + 0.021*"added" + 0.020*"type" + 0.015*"reverting" + 0.015*"result" + 0.015*"overflows" + 0.015*"recommended"
Topic #7:  0.122*"pragma" + 0.079*"experimental" + 0.078*"abiencoderv2" + 0.077*"sol" + 0.070*"import" + 0.068*"openzeppelin" + 0.047*"utils" + 0.046*"math" + 0.041*"lib" + 0.030*"8"
Topic #8:  0.064*"role" + 0.045*"admin" + 0.028*"roles" + 0.026*"function" + 0.023*"proxy" + 0.018*"call" + 0.014*"accounts" + 0.013*"public" + 0.013*"my" + 0.010*"or"
Topic #9:  0.074*"assumes" + 0.074*"max" + 0.073*"cannot" + 0.072*"value" + 0.038*"than" + 0.035*"maximum" + 0.034*"id" + 0.034*"exceed" + 0.034*"64" + 0.034*"uint64"
Topic #10:  0.114*"███" + 0.080*"pool" + 0.068*"░" + 0.045*"██" + 0.041*"eth" + 0.039*"staking" + 0.027*"reward" + 0.027*"rewards" + 0.023*"▓▓" + 0.022*"▒"
Topic #11:  0.055*"you" + 0.023*"safeerc20" + 0.023*"throw" + 0.020*"call" + 0.020*"failure" + 0.019*"returns" + 0.018*"or" + 0.018*"name" + 0.018*"value" + 0.018*"return"
Topic #12:  0.031*"asset" + 0.023*"each" + 0.022*"proxy" + 0.021*"ether" + 0.020*"caller" + 0.016*"t" + 0.016*"swap" + 0.013*"then" + 0.013*"split" + 0.013*"made"
Topic #13:  0.061*"or" + 0.045*"license" + 0.039*"io" + 0.028*"software" + 0.024*"synthetix" + 0.024*"source" + 0.022*"interfaces" + 0.021*"any" + 0.019*"c" + 0.017*"copyright"
Topic #14:  0.071*"com" + 0.061*"github" + 0.048*"author" + 0.031*"blob" + 0.028*"sol" + 0.027*"master" + 0.024*"20" + 0.019*"code" + 0.019*"smart" + 0.016*"erc"
'''

In [None]:
# One row per parsed class, so the row count is the number of classes.
print('number of contracts classes:', parsed_contracts.shape[0])

# Distinct contract names, in order of first appearance.
name_column = parsed_contracts['contract_name']
unique_contract_names = name_column.unique().tolist()
print('number of unique contracts by their names:', len(unique_contract_names))

sample_names = unique_contract_names[:10]
print('some sample of contract_names:', sample_names)

In [None]:
######################## TAG CONTRACTS BY TRAINED LDA #######################
#TODO: Fix this bug
 
def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Trained topic model. Must support ``ldamodel[corpus]``
            (yielding, per document, a list of ``(topic_id, probability)``
            pairs) and ``ldamodel.show_topic(topic_id)`` (yielding
            ``(word, weight)`` pairs).
        corpus: Bag-of-words documents handed to ``ldamodel[...]``.
        texts: One entry per document, in corpus order.

    Returns:
        pandas.DataFrame with one row per document and the columns
        ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 decimals) and
        ``Topic_Keywords``, followed by one extra column (label ``0``) holding
        ``texts``. A document for which the model reports no topic gets
        NaN / NaN / '' so that rows stay aligned with ``texts``.
    """
    # gensim models trained with per_word_topics=True yield a
    # (doc_topics, word_topics, phi_values) tuple per document instead of the
    # plain doc_topics list; only the first element is needed here.
    per_word_topics = getattr(ldamodel, 'per_word_topics', False)

    keywords_by_topic = {}  # topic_id -> "w1, w2, ..." so show_topic runs once per topic
    records = []
    for row in ldamodel[corpus]:
        if per_word_topics:
            row = row[0]
        if not row:
            # Keep a placeholder so later rows do not shift against `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first maximal pair, matching a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # row-by-row growth is quadratic anyway.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

# Tag every contract document with its dominant LDA topic.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda,
    corpus=corpus,
    texts=processed_contracts,
)

# reset_index() turns the positional row index into the first column, then all
# five columns are relabelled.
# NOTE(review): the column labelled 'Contract_Name' therefore holds that row
# index, not a name; the real names come from contract_docs in the merge below.
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    ['Contract_Name', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'],
    axis=1,
)

# Put the contract names side by side with their topic rows (aligned on row position).
contract_name_to_topic_df = pd.concat(
    [contract_docs['contract_name'], df_dominant_topic],
    axis=1,
)

# Export the tagging.
contract_name_to_topic_df.to_csv('contract_name_to_topic.csv', index=False)


In [None]:
#TODO: Create a new adj_matrix from the topic-modeling tags, re-run the CF recommender, and compare the results with the K-means tagging model