In [None]:
"""Shannon entropy H of a discrete random variable"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm 

In [17]:
data=r"/Users/macbookpro/Documents/RNG_testing/df_Small.csv"

In [18]:
df=pd.read_csv(data)
df=df.iloc[:1000]

## Cards

In [None]:
"""Row Entropy : uncertainty of different shuffles
Column Entropy : how often each card appears across all shuffles

For a perfectly uniform distribution (where each card has an equal chance of appearing), the entropy will be maximized.

Maximum Entropy for a Deck of Cards:   H(X)=log2(52)≈5.7 bits. 
This is the theoretical maximum entropy when all cards are equally likely to appear.

"""

In [19]:
df_card=df[["card"]]

In [20]:
#seperating cards
split_cards= df_card["card"].str.split("," , expand=True)
#Rename columns
#split_cards.columns= [f"{i+1}" for i in range(split_cards.shape[1])]

In [21]:
#value_counts
melted = split_cards.melt(var_name='position', value_name='card')
melted=melted.groupby(['card', 'position'] , sort=True).size().unstack()

In [22]:
#step1: calculate probabilities
prob_df= melted/1000

In [23]:
prob_df.head()

position,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
card,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2C,0.026,0.028,0.018,0.016,0.026,0.017,0.014,0.016,0.026,0.02,...,0.015,0.018,0.017,0.019,0.013,0.023,0.024,0.021,0.021,0.017
2D,0.014,0.02,0.019,0.017,0.024,0.023,0.023,0.023,0.023,0.027,...,0.021,0.021,0.015,0.009,0.014,0.018,0.016,0.023,0.015,0.02
2H,0.023,0.024,0.022,0.027,0.014,0.009,0.017,0.025,0.022,0.009,...,0.017,0.016,0.016,0.014,0.02,0.02,0.018,0.019,0.026,0.018
2S,0.019,0.017,0.023,0.023,0.014,0.017,0.022,0.016,0.025,0.017,...,0.015,0.019,0.023,0.019,0.014,0.016,0.023,0.02,0.019,0.02
3C,0.017,0.022,0.022,0.017,0.014,0.021,0.026,0.025,0.02,0.02,...,0.015,0.016,0.022,0.01,0.019,0.021,0.022,0.016,0.017,0.02


In [24]:
def calculate_entropy(probabilities):
    # Filter out zero probabilities to avoid log(0)
    probabilities = probabilities[probabilities > 0]
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

# Calculate row entropies
row_entropy_values = []
for index, row in prob_df.iterrows():
    total = row.sum()
    if total > 0:
        probabilities = row / total
        entropy = calculate_entropy(probabilities)
        row_entropy_values.append(entropy)
    else:
        row_entropy_values.append(0)
        
#Calculate column entropies
column_entropy_values = []
for column in prob_df.columns:
    total = prob_df[column].sum()
    if total > 0:
        probabilities = prob_df[column] / total
        entropy = calculate_entropy(probabilities)
        column_entropy_values.append(entropy)
    else:
        column_entropy_values.append(0)    

In [25]:
entropy_df = pd.DataFrame({
    'Row': range(1, 53),                # Row indices
    'Row Entropy': row_entropy_values,  # Row entropy values
    'Column Entropy': column_entropy_values  # Column entropy values
})

In [27]:
entropy_df.set_index('Row', inplace=True)

In [28]:
entropy_df

Unnamed: 0_level_0,Row Entropy,Column Entropy
Row,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.67293,5.668259
2,5.662709,5.668599
3,5.653616,5.667333
4,5.675076,5.659199
5,5.669232,5.658387
6,5.660424,5.670657
7,5.658447,5.659961
8,5.662969,5.645316
9,5.650859,5.664878
10,5.665864,5.65717


In [29]:
#creating summery df
mean_row_entropy = entropy_df['Row Entropy'].mean()
std_row_entropy = entropy_df['Row Entropy'].std()
mean_column_entropy = entropy_df['Column Entropy'].mean()
std_column_entropy = entropy_df['Column Entropy'].std()

#Calculate the margin of error for 99% confidence
Z = 2.576  # Z-score for 99% confidence
margin_of_error_row = Z * (std_row_entropy / np.sqrt(len(entropy_df)))
margin_of_error_column = Z * (std_column_entropy / np.sqrt(len(entropy_df)))

row_entropy_ci = (mean_row_entropy - margin_of_error_row, mean_row_entropy + margin_of_error_row)
column_entropy_ci = (mean_column_entropy - margin_of_error_column, mean_column_entropy + margin_of_error_column)

summary_df = pd.DataFrame({
    'Entropy Type': ['Row Entropy', 'Column Entropy'],
    'Mean Entropy': [mean_row_entropy, mean_column_entropy],
    'Std Entropy': [std_row_entropy , std_column_entropy],
    'Lower CI': [row_entropy_ci[0], column_entropy_ci[0]],
    'Upper CI': [row_entropy_ci[1], column_entropy_ci[1]],
})

In [30]:
summary_df

Unnamed: 0,Entropy Type,Mean Entropy,Std Entropy,Lower CI,Upper CI
0,Row Entropy,5.66126,0.006982,5.658766,5.663754
1,Column Entropy,5.66126,0.007647,5.658528,5.663992


## Lotto

In [None]:
"""For 59 unique numbers, the maximum entropy can be calculated as:
Hmax=log2(59)≈ 5.88 bits 

Column Entropy : represent the entropy of the frequency distribution of each number across all draws

"""

In [31]:
df_lotto=df[["lotoo"]]

In [32]:
#seprating values
df_lotto= df_lotto["lotoo"].str.split("," , expand=True)

In [38]:
#creating Frequency matrix
df_lotto=df_lotto.astype(int)
melted = df_lotto.melt(var_name='position', value_name='Number')
melted=melted.groupby(['Number', 'position'] , sort=True).size().reset_index(name='Count')
frequency_matrix = melted.pivot(index='Number', columns='position', values='Count')

In [40]:
prob_df_Lotto = frequency_matrix / 1000

In [42]:
def calculate_entropy(probabilities):
    # Filter out zero probabilities to avoid log(0)
    probabilities = probabilities[probabilities > 0]
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

# Calculate row entropies
row_entropy_values = []
for index, row in prob_df_Lotto.iterrows():
    total = row.sum()
    if total > 0:
        probabilities = row / total
        entropy = calculate_entropy(probabilities)
        row_entropy_values.append(entropy)
    else:
        row_entropy_values.append(0)
        
#Calculate column entropies
column_entropy_values = []
for column in prob_df_Lotto.columns:
    total = prob_df[column].sum()
    if total > 0:
        probabilities = prob_df_Lotto[column] / total
        entropy = calculate_entropy(probabilities)
        column_entropy_values.append(entropy)
    else:
        column_entropy_values.append(0)   

In [53]:
#creating summery df
column_entropy_array = np.array(column_entropy_values)
mean_column_entropy = column_entropy_array.mean()
std_column_entropy = column_entropy_array.std()

#Calculate the margin of error for 99% confidence
Z = 2.576  # Z-score for 99% confidence

margin_of_error_column = Z * (std_column_entropy / np.sqrt(len(column_entropy_array)))

column_entropy_ci = (mean_column_entropy - margin_of_error_column, mean_column_entropy + margin_of_error_column)

In [54]:
summary_df_Lotto = pd.DataFrame({
    'Entropy Type': ['Column Entropy'],
    'Mean Entropy': [mean_column_entropy],
    'Std Entropy': [std_column_entropy],
    'Lower CI': [column_entropy_ci[0]],
    'Upper CI': [column_entropy_ci[1]],
})
summary_df_Lotto

Unnamed: 0,Entropy Type,Mean Entropy,Std Entropy,Lower CI,Upper CI
0,Column Entropy,5.837523,0.007342,5.829802,5.845245


## Dice

In [None]:
#For a fair six-sided die : H(X)=log2(6)≈2.585 bits

In [55]:
df_Dice= df[["dice"]]

In [56]:
#Seprating columns
df_Dice.loc[:,'Die1'] = df_Dice['dice'].apply(lambda x: x[0])
df_Dice.loc[:,'Die2'] = df_Dice['dice'].apply(lambda x: x[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [72]:
value_counts= df_Dice['Die1'].value_counts().sort_index()

In [76]:
df_valueCount_lotto= pd.DataFrame({ "face" : value_counts.index , 
               "freq" : value_counts.values 
})
df_valueCount_lotto.set_index("face" , inplace=True)

In [77]:
Dice_prob= df_valueCount_lotto / 1000

In [78]:
Dice_prob

Unnamed: 0_level_0,freq
face,Unnamed: 1_level_1
1,0.157
2,0.169
3,0.165
4,0.154
5,0.165
6,0.19


In [80]:
# Calculate entropy
def calculate_entropy(probabilities):
    # Filter out zero probabilities to avoid log(0)
    probabilities = probabilities[probabilities > 0]
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

# Calculate entropy from the Probability column
die_entropy = calculate_entropy(Dice_prob['freq'])


In [81]:
die_entropy

2.581536178803398