In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("df_shuff.csv")
df

Unnamed: 0,question,answer
0,hi,Hello! How can I assist you today?
1,how are you,Im doing well thank you. Is there anything you...
2,good morning,Good morning to you too! How can I assist you ...
3,good afternoon,Good afternoon to you too! How can I assist yo...
4,good evening,Good evening to you too! How can I assist you ...
...,...,...
1350,gallery,https://nmims.edu/about/photogallery-shirpur-c...
1351,gallery photograph,https://nmims.edu/about/photogallery-shirpur-c...
1352,tell me about nmims,NMIMS (Narsee Monjee Institute of Management S...
1353,about nmims shirpur,NMIMS (Narsee Monjee Institute of Management S...


In [3]:
X = df["question"]
y = df["answer"]

In [4]:
le = LabelEncoder()

In [5]:
y = le.fit_transform(y)

In [6]:
y

array([ 32,  47,  26, ...,  66,  66, 144])

In [7]:
le.classes_

array(['80%.',
       'Admission to NMIMS is based on entrance exams like NMIMS-NPAT followed by personal interview.',
       'All medical facilities available on the campus.',
       'All the best to you too!',
       'As an AI language model I dont have a father or any human parents.',
       'As an AI language model I dont have parents.',
       'As an AI language model I dont have personal information but I can provide helpful answers and assistance.',
       'At post Muktainagar; Taluka Shirpur; Dist. Dhule; Maharashtra - 425405; India https://engineering-shirpur.nmims.edu/contact-us/',
       'Bye! Do you need help with anything else?',
       'Bye! Is there anything else youd like to discuss?',
       'Candidates have to appear for the entrance exam followed by personal interview for admission to NMIMS.',
       'Congratulations on your retirement! Is there something specific you need help with before you go?',
       'Course not Available', 'Curse not available.',
       'Dr. N

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X ,y , random_state = 100, test_size = 0.2)

In [9]:
tf = TfidfVectorizer(lowercase=False,ngram_range=(1, 3),min_df=0,stop_words='english')
X_train_tf = tf.fit_transform(X_train)
X_train_tf

<1084x1489 sparse matrix of type '<class 'numpy.float64'>'
	with 3981 stored elements in Compressed Sparse Row format>

In [10]:
X_test_tf = tf.transform(X_test)

In [11]:
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_train_tf,y_train)

SGDClassifier(alpha=0.0005, loss='modified_huber', n_jobs=-1, random_state=100)

In [12]:
pipe = Pipeline([('vectorizer',tf),("SGDClassifier",model)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(lowercase=False, min_df=0, ngram_range=(1, 3),
                                 stop_words='english')),
                ('SGDClassifier',
                 SGDClassifier(alpha=0.0005, loss='modified_huber', n_jobs=-1,
                               random_state=100))])

In [13]:
y_pred = pipe.predict(X_test)
y_pred

array([236, 139,  63, 243, 143, 149, 167, 104,   3, 240,  53, 212, 180,
       180, 259, 262, 135, 220,   1, 134, 211,  63, 155,  71, 191,  28,
       174, 201, 104,  18, 149,  62,  35,  95,  14,  72, 242, 175,  12,
        19,  50,  29, 248, 104,  40, 262, 119, 161,  95,  66,  53,  23,
        64, 104, 212, 202, 173, 219, 102, 247, 197, 120,  83, 220, 178,
        63, 100,  25, 124,  58,  11, 189, 170,  64, 190,  22, 195,  57,
       213, 183, 135,   7, 182,  86, 252,  25,  58, 202, 198, 117, 107,
       145, 253, 107,   3,  25, 107,  17,  12,  90,  63, 156, 167,  75,
         1,  73,  64,   5,  68, 120,  81,  12, 106, 232,  61, 181,  14,
       197, 226, 213, 241, 119, 107, 231, 246, 111, 258, 202, 112, 202,
       221, 200,  12, 212, 262, 252, 171, 191, 184, 127, 250, 250, 162,
       159, 196,  78, 172,  45, 136,  95,  64,  46, 252, 114,  28,  64,
       209,  63, 256, 160, 154, 241,  41, 233, 165, 233, 249, 107, 104,
       228, 250,   3, 191,  64, 231, 196,  42, 198, 108,   5,  9

In [14]:
with open('abc-0.1.0.pkl','wb') as f:
    pickle.dump(pipe,f)

<IPython.core.display.Javascript object>

In [15]:
y_pred2 = model.predict(X_test_tf)
y_pred2

array([236, 139,  63, 243, 143, 149, 167, 104,   3, 240,  53, 212, 180,
       180, 259, 262, 135, 220,   1, 134, 211,  63, 155,  71, 191,  28,
       174, 201, 104,  18, 149,  62,  35,  95,  14,  72, 242, 175,  12,
        19,  50,  29, 248, 104,  40, 262, 119, 161,  95,  66,  53,  23,
        64, 104, 212, 202, 173, 219, 102, 247, 197, 120,  83, 220, 178,
        63, 100,  25, 124,  58,  11, 189, 170,  64, 190,  22, 195,  57,
       213, 183, 135,   7, 182,  86, 252,  25,  58, 202, 198, 117, 107,
       145, 253, 107,   3,  25, 107,  17,  12,  90,  63, 156, 167,  75,
         1,  73,  64,   5,  68, 120,  81,  12, 106, 232,  61, 181,  14,
       197, 226, 213, 241, 119, 107, 231, 246, 111, 258, 202, 112, 202,
       221, 200,  12, 212, 262, 252, 171, 191, 184, 127, 250, 250, 162,
       159, 196,  78, 172,  45, 136,  95,  64,  46, 252, 114,  28,  64,
       209,  63, 256, 160, 154, 241,  41, 233, 165, 233, 249, 107, 104,
       228, 250,   3, 191,  64, 231, 196,  42, 198, 108,   5,  9

In [16]:
labels = np.unique(y_test)
ytest_prob = label_binarize(y_test, classes=labels)
ypred_prob = label_binarize(y_pred, classes=labels)

In [17]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print("Precision Score:",precision_score(y_test,y_pred,average='micro'))
print("Recall Score:",recall_score(y_test,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

Accuracy Score: 0.8265682656826568
Precision Score: 0.8265682656826568
Recall Score: 0.8265682656826568
ROC-AUC Score: 0.9130236596483612


In [119]:
idx = 5
print(f"Question: {X_test.iloc[idx]}")
print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(X_test_tf[idx]))[0]}")
print(f"\nActual Answer:\n{le.inverse_transform([y_test[idx]])[0]}")

Question: manner college to

Predicted Answer:
The college can be reached via bus train or car. The nearest railway station is Dhule and the nearest airport is Aurangabad or indore. https://engineering-shirpur.nmims.edu/contact-us/

Actual Answer:
The college can be reached via bus train or car. The nearest railway station is Dhule and the nearest airport is Aurangabad or indore. https://engineering-shirpur.nmims.edu/contact-us/


In [120]:
lemmatizer = WordNetLemmatizer()
def data(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(text)
    return text

In [121]:
questn = "who are your creators"
clean_ques = data(questn)
clean_ques = tf.transform([clean_ques])
print(f"Question: {questn}")
if np.amax(model.predict_proba(clean_ques))>0.1:
    print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(clean_ques))[0]}")
else:
    print(f"\nAnswer:\nThis might help you\n{le.inverse_transform(model.predict(clean_ques))[0]}")

<IPython.core.display.Javascript object>

Question: who are your creators

Predicted Answer:
My creators are Priyam Sekra and Vipul Bhatia.


In [122]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have the true labels and predicted labels
true_labels = y_test
predicted_labels = y_pred

# Calculate precision, recall, and F1 score
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.7349162011173185
Recall: 0.7694684272896564
F1 Score: 0.7420058526203777


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
