## Machine Learning - Naive Bayes Classifier Algorithm 

In [1]:
# importing the Naive Bayes classifier

from sklearn.naive_bayes import GaussianNB

In [2]:
nb = GaussianNB() # Use the Iris dataset for the classification
nb.fit()

TypeError: GaussianNB.fit() missing 1 required positional argument: 'estimator'

In [None]:
nb.predict_proba #this tells the probability of the result

### Spam mail classifier

In [3]:
import pandas as pd
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#grouping the data by category
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
#creating a numberic column called spam
df["spam"] = df["Category"].apply(lambda x: 1 if x=="spam" else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
#splitting the datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.25)

In [18]:
#Converting the textual column to a numeric one.
#using Counvectorizer.
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train)
X_test_count = v.fit_transform(X_test)
X_train_count.toarray()[:3]
X_test_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count, y_train)

In [20]:
emails = ["Ok lar... Joking wif u oni..."
          ,"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
]


In [21]:
emails_count = v.transform(emails)
emails_count

<2x4056 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [17]:
model.score(X_test_count, y_test)

ValueError: X has 4056 features, but MultinomialNB is expecting 7457 features as input.

#### Using SKlearn Pipeline

In [26]:
#this is use to directly convert your textual column to numeric in a simplified way
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB())
])

In [27]:
clf.fit(X_train, y_train)

In [28]:
clf.score(X_test, y_test)

0.9870782483847811

In [29]:
clf.predict(emails)

array([0, 1], dtype=int64)

<b>Exercise:</b>
<br>
<br>
Use wine dataset from sklearn.datasets to classify wines into 3 categories. Load the dataset and split it into test and train. After that train the model using <b>Gaussian</b> and <b>Multinominal</b> classifier and post which model performs better. Use the trained model to perform some predictions on test data.

In [31]:
from sklearn.datasets import load_wine
wine = load_wine()
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [32]:
import pandas as pd

In [33]:
df = pd.DataFrame(wine.data, columns=wine.feature_names)

In [34]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [39]:
df["type"] = wine.target

In [None]:
#Using Gaussian classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split


In [54]:
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25)

In [55]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [56]:
gnb.score(X_test, y_test)

0.9777777777777777

In [44]:
#Using Multinominal classifier
from sklearn.naive_bayes import MultinomialNB

In [57]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

In [58]:
mnb.score(X_test, y_test)

0.7777777777777778