In [38]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set(style="white",color_codes=True)
plt.rcParams['figure.figsize'] = (15,9.27)
matplotlib.rcParams['mathtext.fontset'] = "cm"
from scipy.stats import norm

In [39]:
iris = sns.load_dataset('iris')

In [74]:
class GNB(object):
    
    def __init__(self,df,y):
        self.df = df
        self.y = y
        self.mu_sigma_df = df.groupby(y).agg([np.mean,np.std])
        self.pclass = df[y].value_counts()/df[y].count()
    
    def __main(self,x):
        prob_df = pd.DataFrame(index=self.mu_sigma_df.index,
                               columns=self.mu_sigma_df.columns.levels[0])
        for r in prob_df.index:
            num = 0
            for c in prob_df.columns:
                mu = self.mu_sigma_df.loc[r,c][0]
                sigma = self.mu_sigma_df.loc[r,c][1]
                prob_df.loc[r,c] = norm.pdf(x[num],mu,sigma)
                num += 1
        prob_df['pclass'] = self.pclass
        prob_df['res'] = np.prod(prob_df,axis=1)
        return prob_df['res'].idxmax(),prob_df
    
    def predict(self,x):
        return self.__main(x)[0]
    
    def predict_prob_df(self,x):
        return self.__main(x)[1] 

In [41]:
def cross_accuracy_score(df,y,model_class,k=10):
    from sklearn.cross_validation import train_test_split
    X = df.drop(y,axis=1)
    Y = df[y]
    accuracy = []
    for i in range(k):
        xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=1/k)
        df_train = pd.concat([xtrain,ytrain],axis=1)
        model = model_class(df_train,y)
        prediction = []
        for j in range(len(xtest)):
            prediction.append(model.predict(xtest.iloc[j,:]))
        df_test = pd.concat([xtest,ytest],axis=1)
        df_test['prediction'] = prediction
        df_test['isright'] = (df_test[y] == df_test.prediction)
        accuracy.append(sum(df_test.isright)/len(df_test.isright))
    return accuracy

In [42]:
dating = pd.read_excel('data/dating.xlsx')

In [43]:
dating.head()

Unnamed: 0,fd,gp,ic,feeling
0,40920,8.326976,0.953952,largeDoses
1,14488,7.153469,1.673904,smallDoses
2,26052,1.441871,0.805124,didntLike
3,75136,13.147394,0.428964,didntLike
4,38344,1.669788,0.134296,didntLike


In [76]:
n = GNB(dating,'feeling')

In [78]:
n.predict_prob_df([61732,8.325167,0.028479])

Unnamed: 0_level_0,fd,gp,ic,pclass,res
feeling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
didntLike,2.2351e-05,0.0832149,0.213718,0.342,1.359459e-07
largeDoses,7.74327e-07,0.135276,0.195413,0.327,6.693417e-09
smallDoses,3.57447e-25,0.0234418,0.243541,0.331,6.754658000000001e-28


In [46]:
X = dating.drop('feeling',axis=1)
Y = dating['feeling']
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X, Y)

GaussianNB(priors=None)

In [47]:
from sklearn.cross_validation import cross_val_score

In [48]:
words = [['my','dog','has','flea','problems','help','please'],['maybe','not','take','him','to',
         'dog','park','stupid'],['my','dalmation','is','so','cute','I','love','him'],['stop',
         'posting','stupid','worthless','garbage'],['mr','licks','ate','my','steak',
         'how','to','stop','him'],['quit','buying','worthless','dog','food','stupid']]
classvec = [0,1,0,1,0,1]

In [49]:
words_df = pd.DataFrame({'words':words,'class':classvec})

In [50]:
words_df

Unnamed: 0,words,class
0,"[my, dog, has, flea, problems, help, please]",0
1,"[maybe, not, take, him, to, dog, park, stupid]",1
2,"[my, dalmation, is, so, cute, I, love, him]",0
3,"[stop, posting, stupid, worthless, garbage]",1
4,"[mr, licks, ate, my, steak, how, to, stop, him]",0
5,"[quit, buying, worthless, dog, food, stupid]",1


In [51]:
from functools import reduce
words_set = set(reduce(lambda x,y:x+y,words))

In [52]:
words_occurancy_df = pd.DataFrame(columns=list(words_set),index=range(6))

In [53]:
words_occurancy_df

Unnamed: 0,help,steak,buying,stop,posting,dog,ate,so,not,how,...,licks,flea,worthless,food,has,quit,problems,is,love,mr
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [54]:
for r in words_occurancy_df.index:
    for c in words_occurancy_df.columns:
        words_occurancy_df.loc[r,c] = (c in words_df.loc[r,'words'])
words_occurancy_df['class'] = classvec

In [55]:
words_occurancy_df

Unnamed: 0,help,steak,buying,stop,posting,dog,ate,so,not,how,...,flea,worthless,food,has,quit,problems,is,love,mr,class
0,True,False,False,False,False,True,False,False,False,False,...,True,False,False,True,False,True,False,False,False,0
1,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,1
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,True,False,0
3,False,False,False,True,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,1
4,False,True,False,True,False,False,True,False,False,True,...,False,False,False,False,False,False,False,False,True,0
5,False,False,True,False,False,True,False,False,False,False,...,False,True,True,False,True,False,False,False,False,1


In [56]:
c0 = words_occurancy_df[words_occurancy_df['class'] == 0]
c1 = words_occurancy_df[words_occurancy_df['class'] == 1]

In [57]:
def p_word_c(wd):
    p_word_c0 = sum(c0[wd])/len(c0)
    p_word_c1 = sum(c1[wd])/len(c1)
    return p_word_c0,p_word_c1

In [58]:
def bnb(sentence):
    p_c0 = len(c0)/len(words_occurancy_df)
    p_c1 = len(c1)/len(words_occurancy_df)
    p_c0_word = np.prod([p_word_c(x)[0] for x in sentence])*p_c0
    p_c1_word = np.prod([p_word_c(x)[1] for x in sentence])*p_c1
    if p_c0_word > p_c1_word:
        return 'class0'
    else:
        return 'class1'  

In [59]:
s = ['stupid','garbage']
bnb(s)

'class1'

In [64]:
clf.predict_proba([61732,8.325167,0.028479])

array([[9.51489635e-01, 4.85103646e-02, 4.06602879e-21]])

In [79]:
clf.classes_

array(['didntLike', 'largeDoses', 'smallDoses'], dtype='<U10')