<a href="https://colab.research.google.com/github/smbrmoyo/Semantic_Search/blob/main/Semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Newsgroup identifier -> human-readable label used when printing results
category_map = {
    'talk.politics.misc': 'Politics',
    'rec.autos': 'Autos',
    'rec.sport.hockey': 'Hockey',
    'sci.electronics': 'Electronics',
    'sci.med': 'Medicine',
}

# Training split of 20 Newsgroups, restricted to the groups listed above
training_data = fetch_20newsgroups(
    subset='train',
    categories=category_map.keys(),
    shuffle=True,
    random_state=5,
)

# Term counts per document; the fitted vocabulary is reused for the inputs below
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", train_tc.shape)

# Re-weight the raw counts with tf-idf
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Fit a Multinomial Naive Bayes model on the tf-idf matrix
classifier = MultinomialNB()
classifier.fit(train_tfidf, training_data.target)

# Sentences to classify
input_data = [
    'You need to be careful with cars when you are driving on slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides',
]

# Push the sentences through the same count -> tf-idf steps fitted above
input_tc = count_vectorizer.transform(input_data)
input_tfidf = tfidf.transform(input_tc)

# Predicted values are indices into training_data.target_names
predictions = classifier.predict(input_tfidf)

# Report each sentence with the readable label of its predicted newsgroup
for sentence, target_index in zip(input_data, predictions):
    newsgroup = training_data.target_names[target_index]
    print(f"\nInput: {sentence} \nPredicted category: {category_map[newsgroup]}")



Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)



Dimensions of training data: (2844, 40321)

Input: You need to be careful with cars when you are driving on slippery roads 
Predicted category: Autos

Input: A lot of devices can be operated wirelessly 
Predicted category: Electronics

Input: Players need to be careful when they are close to goal posts 
Predicted category: Hockey

Input: Political debates help us understand the perspectives of both sides 
Predicted category: Politics


# Optimized **Chatbot**

In [None]:
!pip install tflearn
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
import json
import pickle
import random
from collections import Counter
from statistics import mode

import nltk
import numpy
import tensorflow
import tflearn
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import wordnet
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB



class VoteClassifier(ClassifierI):
    """Ensemble that labels a feature set by plurality vote.

    Every wrapped classifier is asked for a label through its
    ``classify(features)`` method and the label named most often wins.
    When several labels are tied, the one that received its first vote
    earliest (i.e. from the classifier listed first) is chosen.
    """

    def __init__(self, *classifiers):
        """Store the classifiers that take part in the vote.

        Args:
            *classifiers: objects exposing ``classify(features)``.

        Raises:
            ValueError: if no classifier is supplied; an empty ensemble
                cannot produce a vote.
        """
        if not classifiers:
            raise ValueError("VoteClassifier needs at least one classifier")
        self._classifiers = classifiers

    def _winner(self, features):
        """Return ``(label, votes_for_label, total_votes)`` for *features*."""
        tally = Counter(c.classify(features) for c in self._classifiers)
        # most_common(1) keeps the first-seen label on ties, so the result
        # does not depend on how statistics.mode treats multimodal data.
        label, count = tally.most_common(1)[0]
        return label, count, len(self._classifiers)

    def classify(self, features):
        """Return the label that received the most votes for *features*."""
        return self._winner(features)[0]

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        _, count, total = self._winner(features)
        return count / total

def _read_text(path):
    """Return the whole content of the text file at *path*.

    The file is opened in a ``with`` block so its handle is closed as soon
    as the content has been read.
    """
    with open(path, "r") as handle:
        return handle.read()


# Raw training phrases, one file per intent
doc_greeting = _read_text("drive/MyDrive/Greeting.txt")
doc_goodbye = _read_text("drive/MyDrive/Goodbye.txt")
doc_appointment = _read_text("drive/MyDrive/Appointment.txt")
doc_emergency = _read_text("drive/MyDrive/Emergency.txt")
doc_find = _read_text("drive/MyDrive/Find_hospital.txt")

# (phrase, intent) pairs: every line of a file is one phrase. Blank lines
# (such as the empty string left behind by a trailing newline) are skipped
# so that no empty phrase ends up labelled with an intent.
documents = [
    (phrase, intent)
    for text, intent in (
        (doc_greeting, "Greeting"),
        (doc_goodbye, "Goodbye"),
        (doc_appointment, "Appointment"),
        (doc_emergency, "Emergency"),
        (doc_find, "Find"),
    )
    for phrase in text.split('\n')
    if phrase.strip()
]

# Token lists per intent file
doc_greeting_words = word_tokenize(doc_greeting)
doc_goodbye_words = word_tokenize(doc_goodbye)
doc_appointment_words = word_tokenize(doc_appointment)
doc_emergency_words = word_tokenize(doc_emergency)
doc_find_words = word_tokenize(doc_find)

# Frequency of every lower-cased token across the five files
all_words = nltk.FreqDist(
    token.lower()
    for tokens in (
        doc_greeting_words,
        doc_goodbye_words,
        doc_appointment_words,
        doc_emergency_words,
        doc_find_words,
    )
    for token in tokens
)

# Every distinct (lower-cased) token counted in all_words is used as a feature.
word_features = list(all_words.keys())


def find_features(document):
    """Map each entry of ``word_features`` to whether it occurs in *document*.

    Args:
        document: the phrase to featurize, as a single string.

    Returns:
        dict: ``{word: bool}`` with one key per entry of ``word_features``.
    """
    # word_features holds lower-cased tokens, so the document tokens are
    # lower-cased as well; otherwise capitalised words such as "Find" would
    # never match. A set keeps each membership test O(1).
    present = {token.lower() for token in word_tokenize(document)}
    return {w: (w in present) for w in word_features}

# Featurize every labelled phrase, then shuffle so the held-out slice is random
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)

# Keep the last five examples out of training. Evaluating on examples the
# models were fitted on would report an accuracy that says nothing about
# unseen phrases.
testing_set = featuresets[-5:]
training_set = featuresets[:-5]

print("length training set" , len(training_set))
print("length testing set" , len(testing_set))
print("length documents" , len(documents))
print("length word_features" , len(word_features))
print("length all_words" , len(all_words))

# NLTK's own Naive Bayes, reported as a baseline
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

# scikit-learn estimators wrapped so they accept NLTK feature dicts
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

# Three distinct voters. Listing the same model twice would hand it two of
# the three votes, so it would decide every outcome on its own.
voted_classifier = VoteClassifier(
    MNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

length training set 55
length testing set 5
length documents 55
length word_features 80
length all_words 80
Original Naive Bayes Algo accuracy percent: 100.0
MNB_classifier accuracy percent: 100.0
LogisticRegression_classifier accuracy percent: 100.0
SGDClassifier_classifier accuracy percent: 100.0
voted_classifier accuracy percent: 100.0


**FUNCTIONS**

In [None]:
#FUNCTIONS

def func_emergency():
    """Print the reply used for the 'Emergency' intent."""
    reply = "We are sending an ambulance. Could you give us your location?"
    print(reply)

def func_appointment():
    """Print the reply used for the 'Appointment' intent."""
    reply = "What date best suits you?"
    print(reply)

def func_find():
    """Print the reply used for the 'Find' intent."""
    reply = "Looking for the closest hospital "
    print(reply)

**TAGGING**

In [None]:
def tagging(text):
    """Classify *text* with the voting ensemble.

    Returns:
        tuple: ``(label, confidence)`` as produced by
        ``voted_classifier.classify`` and ``voted_classifier.confidence``
        on the features extracted from *text*.
    """
    features = find_features(text)
    label = voted_classifier.classify(features)
    certainty = voted_classifier.confidence(features)
    return label, certainty

**MAIN**

In [None]:
answer, confiance = tagging("I am hurt. I need help")

# Intent label -> function that prints the matching reply
handlers = {
    'Emergency': func_emergency,
    'Appointment': func_appointment,
    'Find': func_find,
}

# Only act on a prediction the ensemble is confident about. A
# low-confidence prediction and a label without a handler both get the
# fallback reply, so the user always receives some response.
handler = handlers.get(answer) if confiance >= 0.8 else None
if handler is not None:
    handler()
else:
    print("che ne compwen pa")

We are sending an ambulance. Could you give us your location?


ADDING **VOCABULARY**

In [None]:
from nltk.corpus import stopwords

# The stop-word list is a separate NLTK resource; without this download
# stopwords.words() raises LookupError on a runtime that does not have it yet.
nltk.download('stopwords')

ignored_words = set(stopwords.words('english'))
print(ignored_words)


def filterstops(word):
    """Return True when *word* should be ignored by the collocation finder.

    A word is ignored when it is shorter than three characters or is an
    English stop word. The comparison is done on the lower-cased word
    because the stop-word list is lower-case while the tokens are not.
    """
    return len(word) < 3 or word.lower() in ignored_words


# Ten highest-scoring bigrams of the emergency phrases by likelihood ratio
finder = BigramCollocationFinder.from_words(doc_emergency_words)
finder.apply_word_filter(filterstops)
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))


{'by', 'mightn', 'until', 'aren', 'isn', 'so', 'his', 'theirs', 'weren', 'we', 'to', 'of', 'having', 'against', 'why', 'both', 'did', "hasn't", 'before', 'what', 'does', 'who', 'yourself', 'shan', 'each', 'and', 'wasn', 'doesn', 'further', 'its', 'because', 'up', 'these', "hadn't", 'had', 'in', 'any', 'over', 'or', 'if', 'll', 'on', 'him', 'hasn', 'above', 'her', 'being', 'won', 'as', "mustn't", "should've", 'but', "it's", 'she', "you're", 'a', "couldn't", 'be', 'ourselves', 'are', "don't", 'm', 'through', 't', 'only', 'very', 'down', 'out', 'after', 'which', 'our', 'whom', "won't", 'have', 'were', "haven't", 'from', 'yours', 'just', 'no', 'haven', 'o', 'it', 'the', 'again', "weren't", "shouldn't", 'them', 'needn', 're', 'this', "wouldn't", 'my', 'themselves', 'same', 'you', 'than', 'for', "she's", 'can', 'below', 'those', 'with', 'nor', 'few', 'been', 'when', 'they', 'myself', 'didn', 'shouldn', 'more', 'itself', 'hadn', 'was', 'too', "aren't", 'ain', 'has', 'herself', "didn't", 'do',

In [None]:

# Disabled experiment: the code below is wrapped in a string literal, so none
# of it runs. It looks up the WordNet synsets of "blood", prints the first
# synset's definition, hypernym paths and hyponyms, and collects the names of
# its lemmas into synArr (first as a list, then as a set).
'''word_1 = "blood"
targets = wordnet.synsets(word_1)
print(targets)

syn_1 = targets[0]
print(syn_1.definition())
print(syn_1.hypernym_paths())
print(syn_1.hyponyms())

syn_1.lemmas()[0]
print("Lemmas: " , syn_1)

synArr = []
antArr = []

for lem in syn_1.lemmas():
  synArr.append(lem.name())

print(synArr)

synArr = set(synArr)

print(synArr)'''

**VOCABULARY**

In [None]:
Appointment
I would like to take an appointment
I have an appointment
when can I take an appointment
I am an appointee
when is the doctor available
Do I have an appointment
when can I get an appointment
I need a session
I want a session

I would like to call 911
I injured myself
I harmed myself
I was harmed
I need some assistance
911
I cut myself
Is it 911
It is an emergency
it is urgent
it is pressing
please help
I need aid
aid someone
I need a doctor
I need a medic
come help me
somebody is bleeding
someone is injured
someone is hemorrhaging
call an ambulance
Someone call an ambulance
there is blood everywhere

Find nearest hospital
Find the emergencies
Where is the hospital
The closest hospital
detect where the closest hospital is
find the way to the closest hospital
find the next hospital
notice where the closest hospital is
my wife is giving birth