In [23]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [45]:
import warnings

# Install a catch-all "ignore" filter: with no category or message given,
# every warning raised from here on is silenced for the rest of the session.
# NOTE(review): this also hides library warnings about questionable inputs.
warnings.simplefilter("ignore")

In [48]:
def write_answer(filename, answer):
    """Write ``str(answer)`` to ``filename``, replacing any existing content.

    Parameters
    ----------
    filename : path-like
        Destination file; opened in text write mode, so it is truncated first.
    answer : object
        Value to store. Its ``str()`` form is written with no trailing newline.
    """
    with open(filename, mode="w") as answer_file:
        # end="" keeps the file content exactly equal to str(answer).
        print(answer, end="", file=answer_file)

In [2]:
# Load the two scikit-learn toy datasets compared in this notebook.
digits = load_digits()
breast_cancer = load_breast_cancer()

In [32]:
# Put the feature matrix and the label vector side by side so that the
# label ends up as the last column of the frame.
digits_data = np.column_stack((digits.data, digits.target))
# Features are labelled 1..n_features; the final column is 'target'.
digits_columns = [*range(1, digits.data.shape[1] + 1), 'target']
digits_df = pd.DataFrame(digits_data, columns=digits_columns)
# Ad-hoc attribute used as a human-readable label for this frame.
digits_df.name = 'digits'

digits_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,56,57,58,59,60,61,62,63,64,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4.0


In [33]:
# Put the feature matrix and the label vector side by side so that the
# label ends up as the last column of the frame.
breast_cancer_data = np.column_stack((breast_cancer.data, breast_cancer.target))
# Column labels: the dataset's own feature names followed by 'target'.
breast_cancer_columns = np.concatenate([breast_cancer.feature_names, ['target']])
breast_cancer_df = pd.DataFrame(breast_cancer_data, columns=breast_cancer_columns)
# Ad-hoc attribute used as a human-readable label for this frame.
breast_cancer_df.name = 'breast_cancer'

breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [53]:
def mean_cv_score(df, estimator):
    """Return the mean cross-validation score of ``estimator`` on ``df``.

    Every column except the last is used as the feature matrix; the last
    column is used as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column holds the labels.
    estimator : estimator object
        Passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score``, which is
        called with its default ``cv`` and ``scoring`` settings.
    """
    X = df.iloc[:, :-1]
    # Select the target with ``-1`` (a 1-D Series), not ``-1:`` (a one-column
    # DataFrame): scikit-learn expects ``y`` of shape (n_samples,) and
    # otherwise emits a DataConversionWarning on every fit.
    y = df.iloc[:, -1]

    return cross_val_score(estimator, X, y).mean()

In [54]:
# Naive Bayes variants to compare, each with default hyper-parameters.
estimators = (BernoulliNB(), GaussianNB(), MultinomialNB())

# Report the mean CV score of every estimator on every dataset.
for nb in estimators:
    nb_name = nb.__class__.__name__
    for df in (digits_df, breast_cancer_df):
        cv_score = mean_cv_score(df, nb)
        print('%s mean_cv_score for %s: %s' % (nb_name, df.name, cv_score))

BernoulliNB mean_cv_score for digits: 0.8258236507780582
BernoulliNB mean_cv_score for breast_cancer: 0.6274204028589994
GaussianNB mean_cv_score for digits: 0.8186003803550138
GaussianNB mean_cv_score for breast_cancer: 0.9367492806089297
MultinomialNB mean_cv_score for digits: 0.8708771489735053
MultinomialNB mean_cv_score for breast_cancer: 0.8945790401930752


In [51]:
# Distinct label values present in each dataset (breast cancer first, then digits).
(
    breast_cancer_df['target'].unique(),
    digits_df['target'].unique(),
)

(array([0., 1.]), array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]))

In [55]:
# Answers 1 and 2 are derived from the models instead of being pasted in, so
# the files always agree with the scores this notebook computes. The
# previously hard-coded second value (0.8714703025475334) matched none of the
# printed scores.
# Presumably answer 1 is the best mean CV score on breast_cancer and answer 2
# the best on digits -- confirm against the assignment statement.
best_breast_cancer_score = float(max(mean_cv_score(breast_cancer_df, nb) for nb in estimators))
best_digits_score = float(max(mean_cv_score(digits_df, nb) for nb in estimators))

# Answer 3 is a manually chosen option list, not a computed value.
answers = (best_breast_cancer_score, best_digits_score, '3 4')

write_answer('answer_1.txt', answers[0])
write_answer('answer_2.txt', answers[1])
write_answer('answer_3.txt', answers[2])