In [1]:
import pandas as pd
import numpy as np

import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
kalshi_data = pd.read_csv('../kalshi/data_events.csv')
poly_data = pd.read_csv('../polymarket/markets_data.csv')

In [3]:
kalshi_titles = kalshi_data['title'].values
kalshi_titles

array(['Initial jobless claims from Aug 22-28, 2021?',
       'EU meets its 2030 climate goals?',
       'India meets its 2030 climate goals?', ...,
       'New home sales in July 2021?',
       'COVID daily cases on July 26, 2021?',
       'NYC subway riders from Jul 26-Aug 1, 2021?'], dtype=object)

In [4]:
poly_questions = poly_data['question'].values
poly_questions

array(['NFL Saturday: Chiefs vs. Raiders',
       'archWill Kamala Harris win the 2024 US Presidential Election?',
       'Will the Nuggets be the 2022-23 NBA Champion?', ...,
       'Will Jake Paul fight Person A next?', 'Cowboys vs. Commanders',
       'Will $POPCAT be listed first on Binance?'], dtype=object)

# question相似度计算

In [5]:
# 对文本分词后，再去除停用词
def wordCutter(text, stop_file_path):
    # 自定义词典
    jieba.load_userdict(stop_file_path) # 加载停用词
    # 将 jieba 进行初始化
    jieba.initialize()
    # 文本预处理：去除标点符号
    textSymbolDeleted = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', text)
    # 文本分词
    listExactSeparated = textSymbolDeleted.split(' ')
    listExactSeparated = [x.lower() for x in listExactSeparated]  # 换成小写
    listResult = []
    # 读取停用词库
    with open(stop_file_path, encoding='utf-8') as f:
        cons = f.readlines()
        setStopwords = set()
        for stopword in cons:
            # 去掉读取每一行数据的\n
            stopword = stopword.replace("\n", "")
            stopword = stopword.replace(" ", "")
            setStopwords.add(stopword)
    
    # 去除停用词
    for word in listExactSeparated:
        if word not in setStopwords:
            listResult.append(word)
    return ','.join(listResult)

In [6]:
stop_file_path='常用英文停用词(NLP处理英文必备)stopwords.txt'
# poly_questions 
poly_question_words=[wordCutter(x,stop_file_path) for x in poly_questions]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\PuLin\AppData\Local\Temp\jieba.cache
Loading model cost 0.918 seconds.
Prefix dict has been built successfully.


In [7]:
poly_question_words[:3]

['nfl,saturday,chiefs,raiders',
 'archwill,kamala,harris,win,2024,presidential,election',
 'nuggets,202223,nba,champion']

In [8]:
# kalshi_titles
kalshi_title_words=[wordCutter(x,stop_file_path)for x in kalshi_titles]

In [9]:
kalshi_title_words[:3]

['initial,jobless,claims,aug,2228,2021',
 'eu,meets,2030,climate,goals',
 'india,meets,2030,climate,goals']

In [10]:
# 提取TFIDF
tfidf = TfidfVectorizer()
tfidf.fit(poly_question_words + kalshi_title_words)
poly_feat = tfidf.transform(poly_question_words) # 转化为矩阵
kalshi_feat = tfidf.transform(kalshi_title_words)

In [11]:
# 进行归一化，为后面进行相似度计算做准备
poly_feat = normalize(poly_feat)
kalshi_feat = normalize(kalshi_feat)

In [12]:
# 检索进行排序
poly_kalshi_scores = pd.DataFrame()
for query_idx, feat in enumerate(poly_feat):
    # 对poly的每个question 与 每kalshi的每个question计算TFIDF相似度
    score = feat @ kalshi_feat.T
    score = score.toarray()[0]
    max_score_page_idx = score.argmax()  # 找到TFIDF值最大的index
    poly_kalshi_scores.loc[query_idx,'poly_index'] = query_idx
    poly_kalshi_scores.loc[query_idx,'poly_question'] = poly_data.loc[query_idx,'question']
    poly_kalshi_scores.loc[query_idx,'poly_question_id'] = poly_data.loc[query_idx,'question_id']

    poly_kalshi_scores.loc[query_idx,'kalshi_index'] = max_score_page_idx
    poly_kalshi_scores.loc[query_idx,'kalshi_title'] = kalshi_data.loc[max_score_page_idx,'title']
    poly_kalshi_scores.loc[query_idx,'kalshi_event_ticker'] = kalshi_data.loc[max_score_page_idx,'event_ticker']
    poly_kalshi_scores.loc[query_idx,'kalshi_series_ticker'] = kalshi_data.loc[max_score_page_idx,'series_ticker']

    poly_kalshi_scores.loc[query_idx,'max_score'] = max(score)

    # 再加每个事件的开始和结束事件
    

In [13]:
poly_kalshi_scores = poly_kalshi_scores.sort_values(by='max_score',ascending=False)

In [14]:
poly_kalshi_scores.to_csv('poly_kalshi_scores.csv',index=False)

In [15]:
# 保留得分大于0.98的数据
remain_poly_kalshi = poly_kalshi_scores[poly_kalshi_scores['max_score']>=0.98]

# 排除关闭的事件

In [16]:
poly_data_closed = poly_data[['question','closed']]
poly_data_closed.rename(columns={'question':'poly_question'},inplace=True)

In [17]:
remain_poly_kalshi=remain_poly_kalshi.merge(poly_data_closed,on='poly_question',how='left')

In [18]:
remain_poly_kalshi=remain_poly_kalshi[remain_poly_kalshi['closed']==True]

# 查看市场是否一致

In [19]:
# 获取 polymarket 上的市场
poly_data_token_outcome = poly_data[['question','token_outcome']]
poly_data_token_outcome.rename(columns={'question':'poly_question'}, inplace=True)

In [20]:
remain_poly_kalshi.merge(poly_data_token_outcome,on='poly_question',how='left')

Unnamed: 0,poly_index,poly_question,poly_question_id,kalshi_index,kalshi_title,kalshi_event_ticker,kalshi_series_ticker,max_score,closed,token_outcome
0,16474.0,Will Trump win 55% of voters without a college...,0x020acbf152845aa74ae0a7091e60a2f905dcb1f4e4c4...,1184.0,Will Trump win more than 55% of voters without...,KXDJTNOGRAD,KXDJTNOGRAD,1.0,True,"Yes, No"
1,13503.0,Will there be a 269-269 tie in Electoral College?,0x5d32895475ceb959a1981e97493af5a63216a0fdca96...,759.0,Will there be a 269-269 electoral college tie?,KXECTIE-24,KXECTIE,1.0,True,"Yes, No"
2,17981.0,AP calls Georgia by 8:00 AM Nov 6?,0x8be8f10c8f5661580967036cadbe624bcb5e5e9d16ec...,1237.0,AP calls Georgia before 8:00 AM Nov 6?,KXELECTIONGACALL,KXELECTIONGACALL,1.0,True,"Yes, No"
3,7736.0,Trump + Biden debate on June 27 as planned?,0x5a6f8264e78442450e7152d10c23293dfbd0a37dcca9...,4756.0,Trump and Biden debate on June 27 as planned?,DEBATEHAPPEN-24JUN27,KXDEBATEHAPPEN,1.0,True,"Yes, No"
4,15405.0,Will Trump win 5 swing states?,0x1fbc4adde00c6f107af3c1b1770010c9d5f64ffd1277...,480.0,How many swing states will Trump win?,KXSWINGSTATES24DJT,KXSWINGSTATES24DJT,1.0,True,"Yes, No"
5,15298.0,Will Trump win 2 swing states?,0x1fbc4adde00c6f107af3c1b1770010c9d5f64ffd1277...,480.0,How many swing states will Trump win?,KXSWINGSTATES24DJT,KXSWINGSTATES24DJT,1.0,True,"Yes, No"
6,15090.0,Will Trump win 7 swing states?,0x1fbc4adde00c6f107af3c1b1770010c9d5f64ffd1277...,480.0,How many swing states will Trump win?,KXSWINGSTATES24DJT,KXSWINGSTATES24DJT,1.0,True,"Yes, No"
7,19024.0,Coinbase #1 finance app on Friday?,0x43d6ccf102b25516cd3eb9bd8da0b6c72c9bf4b0e241...,1030.0,Will Coinbase be the #1 finance app on Friday?,KXAPPRANKFINANCE-24NOV15,KXAPPRANKFINANCE,1.0,True,"Yes, No"
8,15148.0,Will Trump win 3 swing states?,0x1fbc4adde00c6f107af3c1b1770010c9d5f64ffd1277...,480.0,How many swing states will Trump win?,KXSWINGSTATES24DJT,KXSWINGSTATES24DJT,1.0,True,"Yes, No"
9,15181.0,Will Trump win 1 swing state?,0x1fbc4adde00c6f107af3c1b1770010c9d5f64ffd1277...,480.0,How many swing states will Trump win?,KXSWINGSTATES24DJT,KXSWINGSTATES24DJT,1.0,True,"Yes, No"
