In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

#### My Implementation

In [2]:
# Read the dataset from 'data.csv' (path relative to the working directory)
# and preview its first rows.
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,Outlook,Play
0,Rainy,Yes
1,Sunny,Yes
2,Overcast,Yes
3,Overcast,Yes
4,Sunny,No


In [3]:
# Distinct values of the feature column and of the target column
# (unique() lists them in order of first appearance).
inp_classes, out_classes = (
    df[column].unique() for column in ('Outlook', 'Play')
)

print('Input Classes: ', inp_classes)
print('Output Classes: ', out_classes)

Input Classes:  ['Rainy' 'Sunny' 'Overcast']
Output Classes:  ['Yes' 'No']


In [4]:
# Everything except 'Play' is a predictor; 'Play' itself is the target.
X, Y = df.drop(columns='Play'), df['Play']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

In [5]:
# Class priors P(Play).
# They are estimated from the training labels only: using the full label column
# would let the held-out test rows leak into the model, and the per-class
# likelihoods are likewise built from X_train / Y_train.
# Reindexing on out_classes keeps an explicit 0.0 prior for any label that does
# not occur in the training split, so later lookups by label never miss a key.
play_prob = (
    Y_train.value_counts(normalize=True)
    .reindex(out_classes, fill_value=0.0)
    .to_dict()
)

In [6]:
# Likelihood tables P(Outlook | Play), one inner dict per target label.
# For each label, take the training rows carrying that label, turn the Outlook
# frequencies into relative frequencies, and give every known outlook an entry
# (0.0 when that outlook never occurs together with the label).
outlook_given_play = {
    play: (
        X_train[Y_train == play]['Outlook']
        .value_counts(normalize=True)
        .reindex(inp_classes, fill_value=0.0)
        .to_dict()
    )
    for play in out_classes
}

In [7]:
def get_play(condition):
    """Predict 'Yes' or 'No' for an outlook value using Bayes' rule.

    The posterior is
    P(Yes | condition) = P(Yes) * P(condition | Yes) / P(condition),
    where P(condition) is the sum of prior * likelihood over every class in
    ``outlook_given_play``. Priors are read from ``play_prob``.

    Args:
        condition: Outlook value to classify. It must be a key of every
            per-class table in ``outlook_given_play``; otherwise ``KeyError``
            is raised.

    Returns:
        'Yes' if the posterior probability of 'Yes' is greater than 0.5,
        otherwise 'No'.
    """
    # Total evidence P(condition), accumulated over all classes.
    evidence = sum(
        play_prob[play] * probs[condition]
        for play, probs in outlook_given_play.items()
    )
    if evidence == 0:
        # The outlook has zero likelihood under every class, so the posterior
        # would be 0/0. The observation carries no information in that case;
        # fall back to the prior instead of dividing by zero.
        posterior = play_prob['Yes']
    else:
        posterior = play_prob['Yes'] * outlook_given_play['Yes'][condition] / evidence
    return 'Yes' if posterior > 0.5 else 'No'

In [8]:
# Score the hand-written classifier on the held-out rows: pair each test
# outlook with its true label and count how many predictions agree.
hits = [
    get_play(outlook) == actual
    for outlook, actual in zip(X_test['Outlook'], Y_test)
]
correct = sum(hits)
acc = correct / len(Y_test)

In [9]:
# Report the accuracy of the hand-written classifier, rounded to two decimals.
print("Accuracy: %.2f" % acc)

Accuracy: 0.67


In [10]:
# Interactive demo: list the known outlooks with 1-based numbers, read the
# user's choice, and print the prediction for it.
print("Input Classes: ")
for i, outlook in enumerate(inp_classes):
    print(f"{i+1}. {outlook.capitalize()}")

try:
    condition = int(input("Enter the outlook condition: "))
except ValueError:
    # Non-numeric input: treat it as an invalid choice rather than crashing.
    condition = None

# Validate the range explicitly. Without this, 0 or a negative number would be
# turned into a negative index and silently select a class counted from the
# end, and a number that is too large would raise IndexError.
if condition is not None and 1 <= condition <= len(inp_classes):
    print("Play: %s" % get_play(inp_classes[condition-1]))
else:
    print("Invalid choice: enter a number between 1 and %d" % len(inp_classes))

Input Classes: 
1. Rainy
2. Sunny
3. Overcast
Enter the outlook condition: 2
Play: Yes


#### Sklearn Implementation

In [11]:
# Re-read 'data.csv' into df (replacing the frame loaded earlier in the
# notebook) and preview its first rows.
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,Outlook,Play
0,Rainy,Yes
1,Sunny,Yes
2,Overcast,Yes
3,Overcast,Yes
4,Sunny,No


In [12]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Outlook  14 non-null     object
 1   Play     14 non-null     object
dtypes: object(2)
memory usage: 352.0+ bytes


In [13]:
# One-hot encode the categorical Outlook column (one indicator column per value).
# dtype=int pins the indicators to 0/1 integers: the default dtype depends on
# the pandas version (bool from pandas 2.0 on, uint8 before), so without it the
# preview and the features handed to the model vary between environments.
pre_df = pd.get_dummies(df, columns=['Outlook'], dtype=int)
pre_df.head()

Unnamed: 0,Play,Outlook_Overcast,Outlook_Rainy,Outlook_Sunny
0,Yes,0,1,0
1,Yes,0,0,1
2,Yes,1,0,0
3,Yes,1,0,0
4,No,0,0,1


In [14]:
# The one-hot indicator columns are the predictors; 'Play' is the target.
X, Y = pre_df.drop(columns='Play'), pre_df['Play']

# Same test_size and random_state as the split used for the hand-written model.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

In [15]:
# Build and fit the Gaussian naive Bayes classifier on the training split.
# fit() returns the estimator itself, so it can be bound to `model` directly.
model = GaussianNB().fit(X_train, Y_train)

# Show the fitted estimator as the cell output.
model

GaussianNB()

In [16]:
# Predict a label for every held-out test row with the fitted model.
Y_pred = model.predict(X_test)

In [17]:
# Evaluate the scikit-learn model on the held-out rows: overall accuracy plus
# the per-class precision / recall / F1 table.
report = classification_report(y_true=Y_test, y_pred=Y_pred)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)

print("Accuracy: %.2f" % accuracy)
print(report)

Accuracy: 0.67
              precision    recall  f1-score   support

          No       0.50      1.00      0.67         1
         Yes       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

