In [30]:
import os
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import numpy as np
import datetime
import sqlalchemy as db

def get_tweets(collection_name: str):
    """Fetch the hashtag-bearing tweets of one collection.

    Reads rows from the social db table 'nft_tweets_demo' whose `collection`
    equals `collection_name`, then keeps only the tweets whose text contains
    a '#'.

    Args:
        collection_name: value matched against the `collection` column.

    Returns:
        DataFrame with columns (collection, text), re-indexed from 0.
        Empty when the collection has no tweets containing '#'.
    """
    engine = db.create_engine(os.getenv("PG_HOST"))  # postgres social db

    # Bind the collection name as a parameter instead of formatting it into
    # the SQL string: names containing quotes no longer break the query and
    # cannot inject SQL.
    query = db.text(
        """
        SELECT collection, text
        FROM nft_tweets_demo
        WHERE collection = :collection_name
        """
    )
    try:
        # The context manager returns the connection to the pool; dispose()
        # then closes the pool, since a fresh engine is created on each call.
        with engine.connect() as connection:
            tweets = pd.read_sql_query(
                query, connection, params={"collection_name": collection_name}
            )
    finally:
        engine.dispose()

    # The filtered frame must be returned (the result used to be discarded).
    # na=False treats NULL text as "no hashtag" so the mask stays boolean.
    has_hashtag = tweets['text'].str.contains("#", regex=False, na=False)
    return tweets[has_hashtag].reset_index(drop=True)

def get_hashtag_counts(collection_name: str):
    """Count how often each hashtag appears in a collection's tweets.

    Loads the tweets via `get_tweets` and delegates the counting to
    `count_hashtag_counts`, so both entry points share one implementation.

    Args:
        collection_name: collection whose tweets are counted.

    Returns:
        DataFrame with columns (hashtag, count, total, percent), where
        `total` is the sum of all hashtag occurrences and `percent` is
        each hashtag's share of that total (0-100).
    """
    return count_hashtag_counts(get_tweets(collection_name))

def count_hashtag_counts(df):
    """Tally hashtag occurrences in the `text` column of `df`.

    Args:
        df: DataFrame exposing a string column named `text`.

    Returns:
        DataFrame with columns (hashtag, count, total, percent); `total`
        repeats the sum of all occurrences on every row and `percent` is
        count / total * 100.
    """
    # Every '#word' token across all rows, one match per row of the result.
    found = df.text.str.extractall(r'(#\w+)')[0]
    table = pd.DataFrame(found.value_counts())
    table.columns = ['count']

    # Move the hashtag out of the index and give that first column a
    # stable name (its default name differs between pandas versions).
    table = table.reset_index()
    table = table.rename(columns={table.columns[0]: "hashtag"})

    occurrences = table['count'].sum()
    table['total'] = occurrences
    table['percent'] = (table['count'] / occurrences) * 100
    return table


def get_sample_tweets(collection_name: str, random_state=None):
    """Draw a random sample of hashtag-bearing tweets from all collections.

    The sample is taken from every tweet in the social db table
    'nft_tweets_demo' whose text contains '#', and has as many rows as the
    given collection has hashtag-bearing tweets.

    Args:
        collection_name: collection whose tweet count sets the sample size.
        random_state: optional seed / RandomState forwarded to
            `DataFrame.sample` for reproducible draws. The default (None)
            keeps the draw unseeded.

    Returns:
        DataFrame with columns (collection, text).
    """
    engine = db.create_engine(os.getenv("PG_HOST"))  # postgres social db
    query = db.text(
        """
        SELECT collection, text
        FROM nft_tweets_demo
        """
    )
    try:
        # Release the connection and the per-call engine pool when done.
        with engine.connect() as connection:
            all_tweets = pd.read_sql_query(query, connection)
    finally:
        engine.dispose()

    # Keep the filtered frame (the result used to be discarded); na=False
    # treats NULL text as "no hashtag" so the mask stays boolean.
    pool = all_tweets[
        all_tweets['text'].str.contains("#", regex=False, na=False)
    ].reset_index(drop=True)

    # Size the sample by the collection's hashtag-bearing tweets. Counting
    # them here keeps the size consistent with the filtered pool, and the
    # clamp prevents `sample` from raising when asked for more rows than exist.
    collection_tweets = get_tweets(collection_name)
    n_collection = int(
        collection_tweets['text'].str.contains("#", regex=False, na=False).sum()
    )
    sample_size = min(n_collection, len(pool))
    return pool.sample(n=sample_size, random_state=random_state)


In [42]:
def get_relevant_hashtags(collection_name: str, n_trials: int = 2):
    """Score a collection's hashtags against random tweet samples.

    Process:
        1) count every hashtag that appears in the collection's tweets
        2) repeatedly draw a random tweet sample of the same size as the
           collection's tweets
        3) for each hashtag, count the trials in which its share of the
           random sample is strictly greater than its share in the collection

    `trial_counts / n_trials` is therefore the simulated p-value of the
    hashtag being over-represented in the collection: a low `trial_counts`
    means the hashtag is more specific to this collection.

    Args:
        collection_name: collection to analyse.
        n_trials: number of random samples to draw (default 2, the
            previously hard-coded value).

    Returns:
        DataFrame with columns (hashtag, count_x, total_x, percent_x,
        trial_counts, count_y, total_y, percent_y). The `_y` columns
        duplicate the `_x` columns and are kept so existing consumers of
        this layout keep working.
    """
    # step 1: count hashtags for the specific collection (queried once)
    baseline = get_hashtag_counts(collection_name)
    counts = baseline.copy()
    counts['trial_counts'] = 0

    # steps 2 & 3: sample, count, and compare shares per hashtag
    for _ in range(n_trials):
        sample = get_sample_tweets(collection_name)
        trial = count_hashtag_counts(sample)

        # Align the sample's share to the collection's hashtags. Hashtags
        # absent from the sample map to NaN, and NaN > x is False, so they
        # are not counted -- the same as skipping them.
        trial_percent = counts['hashtag'].map(trial.set_index('hashtag')['percent'])
        exceeded = trial_percent > counts['percent']
        counts.loc[exceeded, 'trial_counts'] += 1

    # Same output layout as before, without a second database round trip.
    return pd.merge(counts, baseline, left_on='hashtag', right_on='hashtag')


In [45]:
get_relevant_hashtags('parallel')  # list the valid collection names with: SELECT DISTINCT collection FROM nft_tweets_demo

Unnamed: 0,hashtag,count_x,total_x,percent_x,trial_counts,count_y,total_y,percent_y
0,#nft,285,1248,22.836538,0,285,1248,22.836538
1,#parallelalpha,272,1248,21.794872,0,272,1248,21.794872
2,#blesstiger,271,1248,21.714744,0,271,1248,21.714744
3,#NFT,13,1248,1.041667,2,13,1248,1.041667
4,#10860,13,1248,1.041667,0,13,1248,1.041667
...,...,...,...,...,...,...,...,...
177,#10919,1,1248,0.080128,0,1,1248,0.080128
178,#digitalasset,1,1248,0.080128,2,1,1248,0.080128
179,#nonfungible,1,1248,0.080128,1,1,1248,0.080128
180,#immutablex,1,1248,0.080128,0,1,1248,0.080128


The last step is to insert these hashtags into the `nft_collection_hashtags` table.