<a href="https://colab.research.google.com/github/santhoopa/academic_adivisor_chatbot/blob/master/academic_adviser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import numpy as np
import random
import io
import re, string, unicodedata
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this notebook relies on. The cell output shows
# that packages already present are reported as up to date.
# 'punkt': tokenizer data — presumably what nltk.sent_tokenize / nltk.word_tokenize need.
nltk.download('punkt') 
# 'wordnet': WordNet data — presumably what WordNetLemmatizer looks lemmas up in.
nltk.download('wordnet') 
# 'averaged_perceptron_tagger': tagger model — presumably what nltk.pos_tag needs.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Uploading the text corpus
# NOTE: google.colab is presumably only importable inside a Colab runtime.
# files.upload() is interactive; the cell output shows the chosen file being
# saved as data.txt, which is the path the corpus-reading cell opens later.
from google.colab import files
uploaded_file = files.upload()


Saving data.txt to data.txt


In [None]:
import warnings
# Suppress warnings from here on: the 'ignore' action is installed without any
# category or module restriction, so it applies to every warning.
warnings.filterwarnings('ignore') 

In [None]:
# Reading the text corpus file. The context manager closes the handle as soon
# as the content has been read; errors='ignore' skips undecodable bytes.
with io.open('data.txt', 'r', errors='ignore') as text_corpus_file:
    text_corpus = text_corpus_file.read()

# Converting text into lowercase
lowercase_text_corpus = text_corpus.lower()

# Tokenizing the text corpus into sentence tokens
# (this list is the corpus that generate_response searches and temporarily extends)
sentences_tokens = nltk.sent_tokenize(lowercase_text_corpus)

# Data preprocessing
def text_normalization(text):
    '''
    Normalize raw text into a list of lemmatized word tokens.

    Steps, in order: lowercase the text and strip ASCII punctuation, split it
    into word tokens, drop non-ASCII characters from each token, apply the
    tag-cleaning regex, discard tokens that ended up empty, then POS-tag the
    tokens and lemmatize each one with WordNet.

    Stop words are NOT removed here; generate_response passes
    stop_words='english' to TfidfVectorizer for that.

    Parameters:
        text: the string to normalize.

    Returns:
        A list of lemma strings, in token order.
    '''
    # Tokenizing the sentences into words and removing punctuation marks
    remove_punctuation_table = str.maketrans('', '', string.punctuation)
    word_tokens = nltk.word_tokenize(text.lower().translate(remove_punctuation_table))

    # Removing non-ASCII characters: NFKD decomposes characters first, then the
    # ASCII encode with 'ignore' drops whatever still has no ASCII equivalent.
    ascii_words = [
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for word in word_tokens
    ]

    # Removing tags
    # NOTE(review): the pattern is spelled with HTML entities (&lt; / &gt;)
    # rather than literal angle brackets - possibly an export artifact; confirm
    # the intended pattern. It is kept exactly as found.
    cleaned_words = [re.sub("&lt;/?.*?&gt;", "&lt;&gt;", word) for word in ascii_words]

    # Tokens can become empty after the steps above; drop them before tagging.
    cleaned_words = [word for word in cleaned_words if word]

    # Part of Speech(POS) tagging and Lemmatization
    # The first letter of the tagger's tag selects the WordNet POS; anything
    # other than J / V / R falls back to noun.
    tags = defaultdict(lambda: wn.NOUN)
    tags['J'] = wn.ADJ
    tags['V'] = wn.VERB
    tags['R'] = wn.ADV
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, tags[tag[0]])
        for token, tag in nltk.pos_tag(cleaned_words)
    ]


# Generating the response
def generate_response(user_query):
    '''
    Return the corpus sentence most similar to the user query.

    Techniques used: TF-IDF, Cosine Similarity

    Side effect: the query is appended to the module-level sentences_tokens
    list and is still there when this function returns normally; callers are
    expected to remove it afterwards. If vectorization raises, the append is
    undone before the exception propagates.

    Parameters:
        user_query: the query string to look up.

    Returns:
        The best-matching sentence from the corpus, or a fixed
        "No results found" message when the corpus is empty or no sentence
        has a non-zero similarity with the query.
    '''
    no_match_reply = "No results found. I cannot help you right now"

    # Appending the user query to sentences token list so that it is vectorized
    # with the same vocabulary and IDF weights as the corpus sentences
    sentences_tokens.append(user_query)

    # Nothing to search when the query is the only entry
    if len(sentences_tokens) == 1:
        return no_match_reply

    try:
        # Vectorizing the sentences using TF-IDF technique
        vectorizer = TfidfVectorizer(tokenizer=text_normalization, stop_words='english')
        tfidf = vectorizer.fit_transform(sentences_tokens)
    except Exception:
        # Callers only remove the query after a normal return, so undo the
        # append here to keep the corpus clean on failure.
        sentences_tokens.pop()
        raise

    # Calculating the cosine similarity between the user query and the corpus.
    # The query's own row (the last one) is left out of the comparison, so the
    # answer can never be the query itself.
    similarity_scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best_idx = int(similarity_scores.argmax())  # on ties, the earliest sentence wins

    if similarity_scores[best_idx] == 0:
        return no_match_reply
    return sentences_tokens[best_idx]

# Interactive chat loop: answers queries until the user types 'bye', interrupts
# the cell, or the input stream ends.
chatbot_run_flag = True
while chatbot_run_flag:
    print("Academic Support Chatbot: Enter your query")
    try:
        # Strip surrounding whitespace so that e.g. "bye " still ends the chat
        user_query = input().strip().lower()
    except (EOFError, KeyboardInterrupt):
        # Treat Ctrl-C / end of input as a request to leave instead of crashing
        user_query = 'bye'

    if user_query == 'bye':
        chatbot_run_flag = False
        print("Academic Support Chatbot: Bye!")
        continue

    print("Academic Support Chatbot: ", end="")
    print(generate_response(user_query))
    # generate_response leaves the query appended to sentences_tokens;
    # take it out again so it does not become part of the corpus.
    sentences_tokens.remove(user_query)

Academic Support Chatbot: Enter your query
What are the specialization areas?
Academic Support Chatbot: these specializations areas are business systems engineering(bse), operations and supply chain management (oscm), information technology (it), information systems (is).
Academic Support Chatbot: Enter your query
Credit requirements?
Academic Support Chatbot: each year undergraduates should complete 30 credits.
Academic Support Chatbot: Enter your query
Could you explain about the internship?
Academic Support Chatbot: it is compulsory to all the mit undergraduates to complete a 6 month internship in second semester of level 3, it accounts for 6 credits.
Academic Support Chatbot: Enter your query


KeyboardInterrupt: ignored

In [None]:
# One-off lookup outside the interactive loop.
query = 'specialization areas'
print(generate_response(query))
# generate_response appends the query to sentences_tokens; remove it again so
# the corpus is back to its previous content for the next call.
sentences_tokens.remove(query)

25
  (0, 163)	0.8044943805435606
  (0, 18)	0.5939602610224295
1
[0.07050242 0.32435335 0.         0.14446444 0.33501305 0.
 0.         0.         0.         0.         0.08045387 0.
 0.09380204 0.         0.         0.         0.         0.
 0.         0.08784698 0.27447821 0.         0.         0.
 1.        ]
these specializations areas are business systems engineering(bse), operations and supply chain management (oscm), information technology (it), information systems (is).


  'stop_words.' % sorted(inconsistent))


In [None]:
# Demonstrates the preprocessing pipeline on a sample paragraph.
sample_text = " Industrial Training program is one of the highlights of our program and the culmination of the first two years of the learning experience. It enables our undergraduates to integrate and apply theories, knowledge, skills and values acquired through their first and second year in areas related to their interests and learning needs. It also provides them with the opportunity for experiential learning, linking and reflecting upon the relationship between theoretical perspectives and field experience, and an opportunity to develop competence in a range of work skills. Industrial Training allows undergraduates to develop networks and career opportunities with leading organizations providing an advantageous position to gain future full-time employment. "
# text_normalization is the preprocessing function defined in this notebook;
# the previously used name `Normalize` is not defined in the visible cells.
print(text_normalization(sample_text))

['industrial', 'training', 'program', 'be', 'one', 'of', 'the', 'highlight', 'of', 'our', 'program', 'and', 'the', 'culmination', 'of', 'the', 'first', 'two', 'year', 'of', 'the', 'learning', 'experience', 'it', 'enable', 'our', 'undergraduate', 'to', 'integrate', 'and', 'apply', 'theory', 'knowledge', 'skill', 'and', 'value', 'acquire', 'through', 'their', 'first', 'and', 'second', 'year', 'in', 'area', 'relate', 'to', 'their', 'interest', 'and', 'learn', 'need', 'it', 'also', 'provide', 'them', 'with', 'the', 'opportunity', 'for', 'experiential', 'learning', 'link', 'and', 'reflect', 'upon', 'the', 'relationship', 'between', 'theoretical', 'perspective', 'and', 'field', 'experience', 'and', 'an', 'opportunity', 'to', 'develop', 'competence', 'in', 'a', 'range', 'of', 'work', 'skill', 'industrial', 'training', 'allow', 'undergraduate', 'to', 'develop', 'network', 'and', 'career', 'opportunity', 'with', 'leading', 'organization', 'provide', 'an', 'advantageous', 'position', 'to', 'gain

['industrial', 'training', 'program', 'is', 'one', 'of', 'the', 'highlight', 'of', 'our', 'program', 'and', 'the', 'culmination', 'of', 'the', 'first', 'two', 'year', 'of', 'the', 'learning', 'experience', 'it', 'enables', 'our', 'undergraduate', 'to', 'integrate', 'and', 'apply', 'theory', 'knowledge', 'skill', 'and', 'value', 'acquired', 'through', 'their', 'first', 'and', 'second', 'year', 'in', 'area', 'related', 'to', 'their', 'interest', 'and', 'learning', 'need', 'it', 'also', 'provides', 'them', 'with', 'the', 'opportunity', 'for', 'experiential', 'learning', 'linking', 'and', 'reflecting', 'upon', 'the', 'relationship', 'between', 'theoretical', 'perspective', 'and', 'field', 'experience', 'and', 'an', 'opportunity', 'to', 'develop', 'competence', 'in', 'a', 'range', 'of', 'work', 'skill', 'industrial', 'training', 'allows', 'undergraduate', 'to', 'develop', 'network', 'and', 'career', 'opportunity', 'with', 'leading', 'organization', 'providing', 'an', 'advantageous', 'positi