In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Data
coffee_df = pd.read_csv("coffee.csv")
coffee_df

Unnamed: 0,Document,Class
0,Coffee Tea Soup Coffee Coffee,Hot
1,Coffee is hot and so is Soup and Tea,Hot
2,Espresso is a hot Coffee and not a Tea,Hot
3,Coffee is neither Tea nor Soup,Hot
4,Sprite Pepsi Cold Coffee and cold Tea,Cold


In [3]:
coffee_df['Class'] = coffee_df.Class.map({'Hot':0, 'Cold':1})
coffee_df

Unnamed: 0,Document,Class
0,Coffee Tea Soup Coffee Coffee,0
1,Coffee is hot and so is Soup and Tea,0
2,Espresso is a hot Coffee and not a Tea,0
3,Coffee is neither Tea nor Soup,0
4,Sprite Pepsi Cold Coffee and cold Tea,1


In [4]:
# convert the df to a numpy array 
train_array = coffee_df.values

# split X and y
X_train = train_array[:,0]
y_train = train_array[:,1]
y_train = y_train.astype('int') # sklearn needs y as integers

print("X_train")
print(X_train)
print("y_train")
print(y_train)

X_train
['Coffee Tea  Soup Coffee Coffee' 'Coffee is hot and so is Soup  and Tea'
 'Espresso is a hot Coffee  and not a Tea'
 'Coffee is neither Tea nor Soup' 'Sprite Pepsi  Cold Coffee and cold Tea']
y_train
[0 0 0 0 1]


In [5]:
# create an object of CountVectorizer() class 
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

In [6]:
# fit the vectorizer on training data 
vec.fit(X_train)
vec.vocabulary_

{'coffee': 1,
 'tea': 13,
 'soup': 11,
 'is': 5,
 'hot': 4,
 'and': 0,
 'so': 10,
 'espresso': 3,
 'not': 8,
 'neither': 6,
 'nor': 7,
 'sprite': 12,
 'pepsi': 9,
 'cold': 2}

In [7]:
# fitting the vectorizer on training data again
# removing the stop words this time
vec = CountVectorizer(stop_words='english')
vec.fit(X_train)
vec.vocabulary_

{'coffee': 0,
 'tea': 7,
 'soup': 5,
 'hot': 3,
 'espresso': 2,
 'sprite': 6,
 'pepsi': 4,
 'cold': 1}

In [8]:
# printing feature names
print(vec.get_feature_names())
print(len(vec.get_feature_names()))

['coffee', 'cold', 'espresso', 'hot', 'pepsi', 'soup', 'sprite', 'tea']
8


In [9]:
# another way of representing the features
X_transformed = vec.transform(X_train)
X_transformed

<5x8 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [10]:
# converting transformed matrix back to an array
# note the high number of zeros
X_transformed.toarray()

array([[3, 0, 0, 0, 0, 1, 0, 1],
       [1, 0, 0, 1, 0, 1, 0, 1],
       [1, 0, 1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1],
       [1, 2, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [11]:
# test data
test_coffee = pd.read_csv('testcoffee.csv') 
test_coffee

Unnamed: 0,Document,Class
0,I hate cold Coffee but love Tea and hot Coffee,Hot


In [21]:
test_coffee['Class'] =test_coffee.Class.map({'Hot':0, 'Cold':1})

In [22]:
# convert to numpy array
test_numpy_array = test_coffee.values

# split into X and y
X_test = test_numpy_array[:,0]
y_test = test_numpy_array[:,1]
print("X_test")
print(X_test)
print("y_test")
print(y_test)

X_test
['I hate cold Coffee but love Tea and hot Coffee']
y_test
[0]


In [23]:
# transform the test data
# note that you *never* fit on test data, only on training data
# and only transform the test data
X_test_transformed = vec.transform(X_test)
X_test_transformed

<1x8 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [24]:
# convert to non-sparse array
X_test=X_test_transformed.toarray()
X_test


array([[2, 1, 0, 1, 0, 0, 0, 1]], dtype=int64)

In [25]:
# building a multinomial NB model
from sklearn.naive_bayes import MultinomialNB

# instantiate NB class
mnb=MultinomialNB()

# fitting the model on training data
mnb.fit(X_transformed, y_train)

# note that we are using the sparse matrix X_transformed, 
# though you can also use the non-sparse version
# mnb.fit(X_transformed.toarray(), y_train) 

# predicting probabilities of test data
proba = mnb.predict_proba(X_test)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
y_pred_proba = mnb.predict_proba(X_test_transformed)


In [28]:
# probability of each class (test data)
print("probability of test document belonging to class Hot" , proba[:,0])
print("probability of test document belonging to class Cold" , proba[:,1])

probability of test document belonging to class Hot [0.89217267]
probability of test document belonging to class Cold [0.10782733]


In [27]:
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

ValueError: Classification metrics can't handle a mix of unknown and binary targets