In [122]:
import pandas as pd
import scipy.stats as sc

In [123]:
# Load the semicolon-separated golf dataset and preview its first rows.
golf_csv_path = './golf.csv'
df = pd.read_csv(golf_csv_path, sep=';')
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,sunny,85.0,85.0,False,no
1,sunny,80.0,90.0,True,no
2,overcast,83.0,78.0,False,yes
3,rain,70.0,96.0,False,yes
4,rain,68.0,80.0,False,yes


* Categorical attribute: Outlook
* Numerical attribute: Temperature

Create two DataFrames based on whether golf was played:

In [124]:
# Split the rows by the value of the 'Play' column.
played = df['Play']
play_no = df[played == 'no']
play_yes = df[played == 'yes']

### Outlook

Create value counts for outlook on both play outcomes:

In [125]:
# Count how often each Outlook value occurs within each Play outcome.
outlook_no, outlook_yes = (
    subset['Outlook'].value_counts() for subset in (play_no, play_yes)
)

Create a table for play vs outlook:

In [126]:
def create_table(col_indexes, yes_counts=None, no_counts=None):
    """Build the Play-vs-Outlook contingency table.

    Parameters
    ----------
    col_indexes : list
        Outlook categories to use as the table's columns, in display order.
    yes_counts : pandas.Series, optional
        Counts per Outlook category for Play == 'yes'. Defaults to the
        notebook-level ``outlook_yes``.
    no_counts : pandas.Series, optional
        Counts per Outlook category for Play == 'no'. Defaults to the
        notebook-level ``outlook_no``.

    Returns
    -------
    pandas.DataFrame
        Two rows ('yes', 'no') indexed by 'Play' and one column per entry of
        ``col_indexes`` (columns named 'Outlook'). A category that is missing
        from one of the count series is reported as 0.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if yes_counts is None:
        yes_counts = outlook_yes
    if no_counts is None:
        no_counts = outlook_no

    row_index = pd.Index(['yes', 'no'], name='Play')
    # Use the categories the caller asked for rather than a hard-wired global.
    col_index = pd.Index(list(col_indexes), name='Outlook')
    table = pd.DataFrame(data=[yes_counts, no_counts], index=row_index, columns=col_index)
    # Categories absent from a count series come out as NaN; treat them as zero counts.
    return table.fillna(0)

# Pass the categories in the order they appear in the 'yes' value counts.
outlook_categories = outlook_yes.index.tolist()
df1 = create_table(outlook_categories)
df1

Outlook,overcast,rain,sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yes,4.0,3.0,2.0
no,0.0,2.0,3.0


Bayes Probability Table:

In [127]:
# Normalise each Play row by its own total so that every row sums to 1.
row_totals = df1.sum(axis=1)
df2 = df1.div(row_totals, axis=0)
df2

Outlook,overcast,rain,sunny
Play,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yes,0.444444,0.333333,0.222222
no,0.0,0.4,0.6


### Temperature

Calculate the mean and standard deviation of Temperature for each Play outcome, and build the corresponding normal distributions:

In [128]:
# Temperature readings for each Play outcome.
temps_no = play_no['Temperature']
temps_yes = play_yes['Temperature']

# Mean and standard deviation per outcome (pandas' default std settings).
no_mean, no_sd = temps_no.mean(), temps_no.std()
yes_mean, yes_sd = temps_yes.mean(), temps_yes.std()

# Density functions of the normal distributions parameterised above.
pdf_no = sc.norm(loc=no_mean, scale=no_sd).pdf
pdf_yes = sc.norm(loc=yes_mean, scale=yes_sd).pdf

Assuming the temperature values have a normal distribution, calculate their probabilities using the Gaussian PDF:
 
$$f(x) = \frac1{\sigma\sqrt{2\pi}}e^{\frac{-1}{2}\left(\frac{x-\mu}{\sigma}\right)^2}$$

In [129]:
# Density of each 'no' temperature under the 'no' normal distribution.
play_no['Temperature'].map(pdf_no)

0     0.021216
1     0.039997
5     0.024124
7     0.047874
13    0.045551
Name: Temperature, dtype: float64

In [130]:
# Density of each 'yes' temperature under the 'yes' normal distribution.
play_yes['Temperature'].map(pdf_yes)

2     0.017361
3     0.057490
4     0.046576
6     0.022292
8     0.052431
9     0.061399
10    0.061399
11    0.063871
12    0.027880
Name: Temperature, dtype: float64

### Laplace Correction for Outlook

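Verify that, after the Laplace correction, the conditional probability of an unseen Outlook value ("unknown") given the Play outcome is not zero. The corrected estimate used below is

$$P(\text{Outlook}=\text{unknown} \mid \text{Play}=j) = \frac{N \cdot n_{ij} + k}{N \cdot n_j + k \cdot |X_i|}$$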

In [131]:
# smoothing parameter
k = 1
# number of Outlook categories, i.e. overcast, rain, sunny, unknown
Xi = 4
# number of cases where Outlook is unknown
nij = 0
# total number of rows in the dataset
N = df.shape[0]

In [132]:
# number of cases where Play is 'no'
nj = play_no.shape[0]
# Laplace-corrected estimate for an unknown Outlook given Play == 'no'
smoothed_count = N * nij + k
smoothed_total = N * nj + k * Xi
p_unknown_no = smoothed_count / smoothed_total
p_unknown_no

0.013513513513513514

In [133]:
# number of cases where Play is 'yes'
nj = play_yes.shape[0]
# Laplace-corrected estimate for an unknown Outlook given Play == 'yes'
smoothed_count = N * nij + k
smoothed_total = N * nj + k * Xi
p_unknown_yes = smoothed_count / smoothed_total
p_unknown_yes

0.007692307692307693