In [19]:
import pandas as pd

# header=None makes pandas read the first line of the file as data, so the
# two column names are supplied explicitly through `names`.
df = pd.read_csv(
    'dataset/Naive_Classifier.csv',
    names=['label', 'value'],
    header=None,
)

# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,label,value
0,address,"s/o saminathan28/3 nagar 3rd , street, korattu..."
1,address,"flat 6 ird main 4h nagar, mookordapalli, 635126"
2,address,1169 kaviyarasu kannadasan nagar kodungaiyur ...
3,address,1568 bhagavath 21. nedumkadu thiruvananthapur...
4,address,22 nehru colony kumarapalnvam coimbatore ici ...


In [20]:
# Check the shape of the dataset; the displayed tuple is (rows, columns)
df.shape

(180741, 2)

In [21]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

label    0
value    1
dtype: int64

In [22]:
# Discard every row that has a missing value in any column, then display the
# new (rows, columns) size of the frame.
df = df.dropna(axis=0, how='any')
df.shape

(180740, 2)

In [23]:
# Preview the first five rows of df again.
df.head()

Unnamed: 0,label,value
0,address,"s/o saminathan28/3 nagar 3rd , street, korattu..."
1,address,"flat 6 ird main 4h nagar, mookordapalli, 635126"
2,address,1169 kaviyarasu kannadasan nagar kodungaiyur ...
3,address,1568 bhagavath 21. nedumkadu thiruvananthapur...
4,address,22 nehru colony kumarapalnvam coimbatore ici ...


The `label` field consists of categorical data, so it needs to be converted to numerical data: <br>

0 -----------> Address <br>
1 -----------> Name

In [26]:
# Encode the categorical labels as integers: 'address' -> 0, 'name' -> 1.
# The dtype guard keeps this cell re-runnable: mapping a column that is
# already encoded would turn every label into NaN, because 0 and 1 are not
# keys of the mapping.
if not pd.api.types.is_integer_dtype(df['label']):
    encoded_label = df['label'].map({'address': 0, 'name': 1})
    # Series.map yields NaN for values missing from the mapping; stop here
    # and report them instead of handing NaN labels to the later cells.
    if encoded_label.isna().any():
        raise ValueError('Unexpected label values: {}'.format(
            df.loc[encoded_label.isna(), 'label'].unique().tolist()))
    df['label'] = encoded_label.astype('int64')
df.head()  # preview the first five rows with the encoded labels


Unnamed: 0,label,value
0,0,"s/o saminathan28/3 nagar 3rd , street, korattu..."
1,0,"flat 6 ird main 4h nagar, mookordapalli, 635126"
2,0,1169 kaviyarasu kannadasan nagar kodungaiyur ...
3,0,1568 bhagavath 21. nedumkadu thiruvananthapur...
4,0,22 nehru colony kumarapalnvam coimbatore ici ...


In [27]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. The text column is the input and
# the encoded label column is the target; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['value'],
    df['label'],
    random_state=1,
)

# Report how many rows ended up in each set.
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 180740
Number of rows in the training set: 135555
Number of rows in the test set: 45185


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count encoder with default settings.
count_vector = CountVectorizer()

# Learn the vocabulary from the training texts and encode them in one step.
training_data = count_vector.fit_transform(X_train)

# Encode the test texts with the vocabulary learned above. The vectorizer is
# deliberately not re-fitted on the test data.
testing_data = count_vector.transform(X_test)

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the token-count matrix.
# fit() returns the estimator itself, so the call can be chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Predict a label for every row of the vectorized test set.
predictions = naive_bayes.predict(testing_data)

In [31]:

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Headline scores of the test-set predictions against the true labels.
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

# Confusion matrix followed by the per-class report.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))


Accuracy score:  0.9743277636383756
Precision score:  0.9929864253393665
Recall score:  0.9702204256931725
F1 score:  0.9814714244641088
[[13302   217]
 [  943 30723]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     13519
           1       0.99      0.97      0.98     31666

    accuracy                           0.97     45185
   macro avg       0.96      0.98      0.97     45185
weighted avg       0.98      0.97      0.97     45185



In [32]:
# Classify a single hand-written sample.
name = 'Ram Kumar'

# transform() takes an iterable of documents, so the string is wrapped in a
# one-element list.
names = count_vector.transform([name])
value = naive_bayes.predict(names)

# Label 0 was mapped from 'address'; anything else is reported as a name.
print("address" if value[0] == 0 else "name")

name


In [33]:
# Sample address strings used to try out the trained classifier.
address_list = [
    "Brockton Avenue",
    "30 Memorial Drive, Avon MA 2322",
    "250 Hartford Avenue, Bellingham MA 2019",
    "700 Oak Street, Brockton MA 2301",
]

In [37]:
# Encode the sample addresses with the already-fitted vectorizer.
address = count_vector.transform(address_list)

# Per-class probabilities and the predicted label for every sample.
prob = naive_bayes.predict_proba(address)
prediction_value = naive_bayes.predict(address)


In [38]:
from prettytable import PrettyTable

# Tabulate every sample address with its predicted class and the
# probabilities returned by predict_proba.
table = PrettyTable(["Input", "Prediction", "Probability"])
count_address = 0

# The per-row probability gets its own name: reusing `prob` as the loop
# target would overwrite the notebook-level probability matrix, so re-running
# this cell would iterate over a single row instead of all of them.
for addr, pred, class_probs in zip(address_list, prediction_value, prob):
    if pred == 0:
        count_address += 1
    table.add_row([addr, "address" if pred == 0 else "name", class_probs])

print("Output of the Classifier")
print("Count of address classified", count_address)
print(table)

Output of the Classifier
Count of address classified 4
+-----------------------------------------+------------+---------------------------------+
|                  Input                  | Prediction |           Probability           |
+-----------------------------------------+------------+---------------------------------+
|             Brockton Avenue             |  address   |     [0.90955321 0.09044679]     |
|     30 Memorial Drive, Avon MA 2322     |  address   | [9.99260216e-01 7.39784309e-04] |
| 250 Hartford Avenue, Bellingham MA 2019 |  address   | [9.99899853e-01 1.00146942e-04] |
|     700 Oak Street, Brockton MA 2301    |  address   |     [0.95835025 0.04164975]     |
+-----------------------------------------+------------+---------------------------------+


In [39]:
# Sample first names used to try out the trained classifier.
indian_names = [
    'Pranav',
    'Manali',
    'Sakshi',
    'Darshan',
]

In [41]:
# Encode the sample names with the already-fitted vectorizer.
indian_names_list = count_vector.transform(indian_names)

# Per-class probabilities and the predicted label for every sample.
prob = naive_bayes.predict_proba(indian_names_list)
prediction_value = naive_bayes.predict(indian_names_list)


In [42]:
from prettytable import PrettyTable

# Tabulate every sample name with its predicted class and the probabilities
# returned by predict_proba.
table = PrettyTable(["Input", "Prediction", "Probability"])
count_name = 0

# Loop targets are named so they do not overwrite the notebook-level `name`
# string or the `prob` matrix; overwriting `prob` would make a re-run of this
# cell iterate over a single row instead of all of them.
for person_name, pred, class_probs in zip(indian_names, prediction_value, prob):
    if pred == 1:
        count_name += 1
    table.add_row([person_name, "address" if pred == 0 else "name", class_probs])

print("Output of the Classifier")
# The counter holds samples predicted as names, so the label says so.
print("Count of name classified", count_name)
print(table)

Output of the Classifier
Count of address classified 4
+---------+------------+-------------------------+
|  Input  | Prediction |       Probability       |
+---------+------------+-------------------------+
|  Pranav |    name    | [0.02481993 0.97518007] |
|  Manali |    name    |  [0.4161081 0.5838919]  |
|  Sakshi |    name    | [0.03615171 0.96384829] |
| Darshan |    name    | [0.41898625 0.58101375] |
+---------+------------+-------------------------+
