In [61]:
import numpy as np
import pandas as pd
import pickle
import clip
import requests
import PIL
from PIL import Image as PILImage
from io import BytesIO
import torch
import IPython
from IPython.display import Image, display
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
import math
import easyocr

In [5]:
print(np.__version__)
print(pd.__version__)
print(requests.__version__)
print(PIL.__version__)
print(torch.__version__)
print(IPython.__version__)
print(sklearn.__version__)

1.26.4
2.2.3
2.32.3
10.3.0
2.6.0
8.32.0
1.3.1


In [6]:
!pip install git+https://github.com/openai/CLIP.git --quiet

In [8]:
import torch
from transformers import pipeline

torch.backends.mps.is_available = lambda: False  # Disable MPS explicitly

In [9]:
import torch
print(torch.device("cpu"))
print(torch.cuda.is_available())  # Should be False
print(torch.backends.mps.is_available())  # Should be False

cpu
False
False


In [10]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import pickle

text_embedding_pipe = pipeline("feature-extraction", model="BAAI/bge-small-en-v1.5", device = "cpu")
# with open('sentiment_pipeline.pk', 'rb') as f:
#     sentiment_pipe = pickle.load(f)
sentiment_pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment", device = "cpu", truncation = True, max_length = 512)
ocr_pipe = easyocr.Reader(['en'], gpu = False)

model, preprocess = clip.load("ViT-B/32", device = "cpu")

Device set to use cpu
Device set to use cpu
Using CPU. Note: This module is much faster with a GPU.


In [23]:
def ocr_img_from_url(img_url):
    try:
        response = requests.get(img_url, stream=True)
        if response.status_code != 200:
            print(f"Failed to fetch: {img_url} (Status: {response.status_code})")
            return ""

        # Read image as bytes
        img_bytes = response.content
        
        # Use EasyOCR directly with bytes
        result = ocr_pipe.readtext(img_bytes)

        # Extract text with high confidence
        text_result = " ".join(text for _, text, prob in result if prob > 0.5)

        return text_result if text_result else ""

    except Exception as e:
        print(f"Error processing {img_url}: {e}")
        return ""
        
def img_embed_from_url(image_url):
    # Sample image and text data
    response = requests.get(image_url)
    # Preprocess image
    image = preprocess(
        PILImage.open(
            BytesIO(response.content)
        )
    ).unsqueeze(0).to("cpu")

    with torch.no_grad():
        image_features = model.encode_image(image)
    image_features /= image_features.norm(dim = -1, keepdim = True)
    return image_features[0]

def text_embed_from_str(text):
    text = str(text)
    embed = text_embedding_pipe(text)[0][0]
    return embed

def get_sentiment(text):
    text = str(text)
    res = sentiment_pipe(text)[0]['label']
    if res == 'LABEL_2':
        return 1
    elif res == 'LABEL_1':
        return 0
    else:
        return -1

In [24]:
def preprocess_obj (obj):
    blank_text_embed = text_embed_from_str('')
    blank_img_embed = img_embed_from_url('https://static.vecteezy.com/system/resources/thumbnails/012/680/916/small/blank-black-cement-wall-texture-for-background-with-copy-space-for-design-free-photo.jpg')
    
    columns_to_normalise = ['totalVolume', 'volume24h', 'marketCap', 'uniqueHolders', 'transferCount']
    obj[columns_to_normalise] = scaler.transform(pd.DataFrame([obj[columns_to_normalise]]))[0]
    
    if obj['name'] is not np.nan:
        obj['name_sentiment'] = get_sentiment(obj['name'])
        obj['name_embed'] = text_embed_from_str(obj['name'])
    elif obj['name'] is np.nan:
        obj['name_sentiment'] = 0
        obj['name_embed'] = blank_text_embed
    
    if obj['description'] is not np.nan:
        obj['description_sentiment'] = get_sentiment(obj['description'])
        obj['description_embed'] = text_embed_from_str(obj['description'])
    elif obj['description'] is np.nan:
        obj['description_sentiment'] = 0
        obj['description_embed'] = blank_text_embed

    # IMPORTANT!!!
    # Change the name to the corresponding image cdn column
    if obj['previewImageUrl'] is not np.nan:
        obj['img_embed'] = img_embed_from_url(obj['previewImageUrl'])
        obj['img_ocr'] = ocr_img_from_url(obj['previewImageUrl'])
    elif obj['previewImageUrl']is np.nan:
        obj['img_embed'] = blank_img_embed
        obj['img_ocr'] = ''

    if obj['img_ocr'] == '':
        obj['img_text_embed'] = blank_text_embed
        obj['img_text_sentiment'] = 0
    
    return obj
    
def preprocess_df (df):
    blank_text_embed = text_embed_from_str('')
    
    blank_img_embed = img_embed_from_url('https://static.vecteezy.com/system/resources/thumbnails/012/680/916/small/blank-black-cement-wall-texture-for-background-with-copy-space-for-design-free-photo.jpg')
    
    columns_to_normalise = ['totalVolume', 'volume24h', 'marketCap', 'uniqueHolders', 'transferCount']
    df[columns_to_normalise] = scaler.fit_transform(df[columns_to_normalise])
    
    df[['name', 'description']] = df[['name', 'description']].fillna('')

    df['name_sentiment'] = df['name'].apply(lambda x: get_sentiment(x) if x != '' else 0 )
    df['name_embed'] = df['name'].apply(lambda x: text_embed_from_str(x) if x != '' else blank_text_embed)
    
    df['description_sentiment'] = df['description'].apply(lambda x: get_sentiment(x) if x != '' else 0 )
    df['description_embed'] = df['description'].apply(lambda x: text_embed_from_str(x) if x != '' else blank_text_embed)

    df['img_embed'] = df['mediaPreviewUrl'].apply(lambda x: img_embed_from_url(x) if x is not np.nan else blank_img_embed)
    df['img_ocr'] = df['mediaPreviewUrl'].apply(lambda x: ocr_img_from_url(x) if x is not np.nan else '')
    df['img_text_sentiment'] = df['img_ocr'].apply(lambda x: get_sentiment(x) if x != '' else 0 )
    df['img_text_embed'] = df['img_ocr'].apply(lambda x: text_embed_from_str(x) if x != '' else blank_text_embed)

    # Convert timestamps to numerical values
    df['createdAt'] = pd.to_datetime(df['createdAt'])
    T = df['createdAt'].max()  # Latest timestamp
    
    time_diff = (T - df['createdAt']).dt.total_seconds()
    df['time_weight'] = np.exp(-0.001 * (time_diff))

    return df
    
        

In [31]:
df_big = pd.read_csv('Coin.csv')
df_coin = pd.read_csv('Coin_new.csv')
df_big.address.loc[424]

'0x170f7ed23b6cf5d5ac69ba9bd1ea094febfd66ee'

In [32]:
'0x170f7ed23b6cf5d5ac69ba9bd1ea094febfd66ee' in df_coin.address.values

False

In [37]:
import json
example_post = df_big.loc[424].copy()
example_post_json = example_post.to_dict()
with open("example_post.json", "w") as file:
    json.dump(example_post_json, file, indent=4)

In [38]:
df_coin_preprocess = preprocess_df(df_coin)

Error processing https://media.decentralized-content.com/-/rs:fit:600:600/f:best/aHR0cHM6Ly9tYWdpYy5kZWNlbnRyYWxpemVkLWNvbnRlbnQuY29tL2lwZnMvYmFmeWJlaWVtNmx1eTRwbHNwY3c1YW82bnJ0ZWIyN2t4eTdyazVzYXBnYjVlczQ3eG42d2ltaHZrZ3U=: OpenCV(4.11.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'

Error processing https://media.decentralized-content.com/-/rs:fit:600:600/f:best/aHR0cHM6Ly9tYWdpYy5kZWNlbnRyYWxpemVkLWNvbnRlbnQuY29tL2lwZnMvYmFmeWJlaWd1NTcycno1dXo3dnRpNGJtZDJsdHBzcG5ydGh0NHo2emk3cXd1ZGp1NGZpMmJldWx0emE=: OpenCV(4.11.0) /Users/xperience/GHA-Actions-OpenCV/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'

Error processing https://media.decentralized-content.com/-/rs:fit:600:600/f:best/aHR0cHM6Ly9tYWdpYy5kZWNlbnRyYWxpemVkLWNvbnRlbnQuY29tL2lwZnMvYmFmeWJlaWh4dzZicWdtZ

In [39]:
%%time 
example_post_preprocess = preprocess_obj(example_post)

CPU times: user 4.81 s, sys: 1.2 s, total: 6.01 s
Wall time: 2.03 s


In [41]:
df_coin_preprocess.columns

Index(['id', 'name', 'symbol', 'description', 'createdAt', 'creatorAddress',
       'uniqueHolders', 'mediaMimeType', 'totalSupply', 'totalVolume',
       'volume24h', 'marketCap', 'address', 'mediaPreviewUrl',
       'marketCapDelta24h', 'mediaOriginalUri', 'scrapedAt', 'transferCount',
       'updatedAt', 'name_sentiment', 'name_embed', 'description_sentiment',
       'description_embed', 'img_embed', 'img_ocr', 'img_text_sentiment',
       'img_text_embed', 'time_weight'],
      dtype='object')

In [43]:
def sentiment_calculator(sen1, sen2):
    distance = abs(sen1 - sen2)
    if distance == 0:
        return 1
    elif distance == 1:
        return 0.5
    else:
        return 0
        
def calculate_sim(obj1, obj2):
    financial_columns = ['totalVolume', 'volume24h', 'marketCap', 'uniqueHolders', 'transferCount']
    w_sentiment = 0.15
    w_financial = 0.15
    w_embed = 0.1
    
    sen_ocr = sentiment_calculator(obj1['img_text_sentiment'], obj2['img_text_sentiment']) 
    sen_name = sentiment_calculator(obj1['name_sentiment'],obj2['name_sentiment'])
    sen_description = sentiment_calculator(obj1['description_sentiment'],obj2['description_sentiment']) 

    img_embed = cosine_similarity([obj1['img_embed']], [obj2['img_embed']])[0][0]
    name_embed = cosine_similarity([obj1['name_embed']], [obj2['name_embed']])[0][0]
    description_embed = cosine_similarity([obj1['description_embed']], [obj2['description_embed']])[0][0]
    img_text_embed = cosine_similarity([obj1['description_embed']], [obj2['description_embed']])[0][0]
    
    financial_sim = cosine_similarity([obj1[financial_columns]], [obj2[financial_columns]])[0][0]
    sentiment_sim = sen_ocr + sen_name + sen_description
    embed_sim = img_embed + name_embed + description_embed + img_text_embed
    
    total_distance = w_sentiment * (sentiment_sim) + w_embed * (embed_sim)  + w_financial * (financial_sim)
    return total_distance 

In [44]:
calculate_sim (example_post_preprocess, df_coin_preprocess.loc[0])

0.8768203870989897

In [45]:
df_coin_preprocess['similarity'] = df_coin_preprocess.apply(lambda x: calculate_sim(example_post_preprocess, x), axis = 1)

In [46]:
weighted_mean_similarity = np.average(df_coin_preprocess['similarity'], weights=df_coin_preprocess['time_weight'])

In [47]:
weighted_mean_similarity

0.6694290906793772

In [48]:
df_coin_preprocess['similarity'].describe()

count    163.000000
mean       0.759401
std        0.106842
min        0.421587
25%        0.702574
50%        0.769419
75%        0.831577
max        0.916695
Name: similarity, dtype: float64

In [49]:
import pickle
with open ('sentiment_pipeline.pk', 'wb') as f:
    pickle.dump(sentiment_pipe, f)
with open('text_embedding_pipeline.pk', 'wb') as g:
    pickle.dump(text_embedding_pipe, g)
with open('img_embedding_model.pk', 'wb') as h:
    pickle.dump(model, h)
with open('img_embedding_preprocess.pk', 'wb') as i:
    pickle.dump(preprocess, i)

In [62]:
import flask

In [63]:
flask.__version__

  flask.__version__


'3.1.0'

In [33]:
ocr_img_from_url("https://media.decentralized-content.com/-/rs:fit:600:600/f:best/aHR0cHM6Ly9tYWdpYy5kZWNlbnRyYWxpemVkLWNvbnRlbnQuY29tL2lwZnMvYmFmeWJlaWViejdnN2EyeWFhZXZobDIzY3M2dnV2bXczaXFjaWxsZmtlaWNnYnVjZTNuN2FkeDQ2Ym0=")

'bit about coinsagains'