In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns
from scipy import stats
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import make_interp_spline

In [88]:
def workflow(df):

    # 使用 np.where() 根据 Profit 的正负值添加 result 列
    df['result'] = np.where(df['Profit'] > 0, 'won', 'lost')

    df  = df.copy(deep= True)
    rows = df[df[['CashedOut','Bonus', 'Profit']].isna().any(axis= 1)]
    df['Profit'].fillna(value = -1*df['Bet'],inplace  =True)
    df['CashedOut'].fillna(value = df['BustedAt'],inplace = True)
    df['Bonus'].fillna(value = 0.0,inplace = True)
    df['PlayDate'] = pd.to_datetime(df['PlayDate'])
    for col_type in ['Bet']:
        df['Next {}'.format(col_type)] = (df.sort_values(by=['PlayDate'], ascending=True)
                       .groupby('Username')[col_type].shift(-1))  
        
    df['Cumulative Profit'] = (df.sort_values(by=['PlayDate'], ascending=True)
                        .groupby('Username')['Profit'].agg(np.cumsum))
    # 假设 'Win' 是赢得的金额，'Bet' 是下注金额

    # 计算每个用户的期望 Profit
    expected_profit = df.groupby('Username')['Profit'].mean().reset_index()
    expected_profit.rename(columns={'Profit': 'Expected_Profit'}, inplace=True)
    # 将期望利润合并到原始数据
    df = df.merge(expected_profit, on='Username', how='left')


    df_sorted = df.sort_values(by =['Username','PlayDate'],ascending = [True,True])
    return df_sorted

def signal_analyze(df):
    df['Bet_Signal'] = df['Next Bet']-df['Bet']
    df['Bet_percent'] = (df['Next Bet']-df['Bet'])/df['Bet']
    df['encoded_Bet_Signal'] = '0'
    df.loc[df['Bet_Signal'] < 0 , 'encoded_Bet_Signal'] = '-'
    df.loc[df['Bet_Signal'] > 0 ,'encoded_Bet_Signal' ] = '+'

    df['encoded_Bet_percent'] = '0'
    df.loc[df['Bet_percent'] < 0 , 'encoded_Bet_percent'] = '-'
    df.loc[df['Bet_percent'] > 0 , 'encoded_Bet_percent'] = '+'
    return df


In [89]:
# 基于数据集研究风险承受水平对于决策收益的影响

data = pd.read_csv('./bustabit.csv').copy()

data.drop(['Id', 'GameID'], axis=1, inplace=True)


data = workflow(data)

data['Bet_Counts'] = data.groupby('Username')['Username'].transform('size')

data = signal_analyze(data)

data

# bool_series = pd.notnull(data['Bet_Signal'])
# res = stats.probplot(data[bool_series]['Bet_Signal'],dist = stats.norm ,sparams=(2,5))

# # 绘制 QQ 图
# plt.figure(figsize=(8, 6))
# plt.plot(res[0][0], res[0][1], 'o', label='Observed Data')  # 数据点
# plt.plot(res[0][0], res[0][0], 'r--', label='Theoretical Line')  # 理论分布线

# # 添加标题和标签
# plt.title('QQ Plot for Bet_Signal', fontsize=16)
# plt.xlabel('Theoretical Quantiles', fontsize=14)
# plt.ylabel('Sample Quantiles', fontsize=14)
# plt.legend(fontsize=12)
# plt.grid()

# # 显示图表
# plt.show()
# sns.scatterplot(data =data, x= 'Bet_Counts',y= 'Expected_Profit',color = 'blue',s=12)
# plt.title('Bet Counts VS. Expected Profit')
# plt.xlabel('Bet Counts')
# plt.ylabel('Expected Profit')
# plt.show()

# clipped_profit = [x for x in data['Expected_Profit'] if np.abs(x)<5000]
# plt.hist(clipped_profit,bins = 1000,color = 'red',edgecolor = 'black',alpha = 0.85)
# plt.title('Histogram of Expected Profit')
# plt.xlabel('Expected Profit')
# plt.ylabel('Frequence')
# plt.show()

# # 绘制带 KDE 曲线的直方图  每名玩家下注次数的直方图
# plt.figure(figsize=(10, 6))
# sns.histplot(data['Bet_Counts'], bins=30, kde=True, color='blue', edgecolor='black')

# # 添加标题和轴标签
# plt.title('Histogram with KDE Fit of Bet Counts', fontsize=16)
# plt.xlabel('Bet Counts', fontsize=14)
# plt.ylabel('Frequency / Density', fontsize=14)

# # 显示图表
# plt.show()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Profit'].fillna(value = -1*df['Bet'],inplace  =True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CashedOut'].fillna(value = df['BustedAt'],inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh

Unnamed: 0,Username,Bet,CashedOut,Bonus,Profit,BustedAt,PlayDate,result,Next Bet,Cumulative Profit,Expected_Profit,Bet_Counts,Bet_Signal,Bet_percent,encoded_Bet_Signal,encoded_Bet_percent
25389,----------------,11,1.01,2.27,0.36,1.03,2016-11-03 06:14:27+00:00,won,12.0,0.36,-2.433333,3,1.0,0.090909,+,+
37757,----------------,12,1.01,1.83,0.34,3.25,2016-11-03 15:05:40+00:00,won,8.0,0.70,-2.433333,3,-4.0,-0.333333,-,-
47386,----------------,8,1.09,0.00,-8.00,1.09,2016-11-03 20:26:24+00:00,lost,,-7.30,-2.433333,3,,,0,0
19471,--dilib--,349,1.60,1.87,215.91,4.37,2016-11-01 22:57:26+00:00,won,98.0,215.91,-108.453750,8,-251.0,-0.719198,-,-
33955,--dilib--,98,2.54,4.63,155.46,11.95,2016-11-02 00:56:16+00:00,won,311.0,371.37,-108.453750,8,213.0,2.173469,+,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40631,zzanggubank,1024,4.18,0.00,-1024.00,4.18,2016-11-02 05:47:31+00:00,lost,123.0,144641.43,15837.936667,9,-901.0,-0.879883,-,-
19296,zzanggubank,123,1.83,0.00,-123.00,1.83,2016-11-02 11:05:36+00:00,lost,201.0,144518.43,15837.936667,9,78.0,0.634146,+,+
16334,zzanggubank,201,1.13,0.00,-201.00,1.13,2016-11-02 15:30:35+00:00,lost,1068.0,144317.43,15837.936667,9,867.0,4.313433,+,+
48673,zzanggubank,1068,2.20,0.00,-1068.00,2.20,2016-11-02 17:16:34+00:00,lost,708.0,143249.43,15837.936667,9,-360.0,-0.337079,-,-
