### Importing necessary libraries

In [141]:
import gzip
import json
import os
import pickle
import re
import string
from collections import Counter

import nlpaug.augmenter.word as naw
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from tqdm import tqdm

### Importing the Data Science FAQ dataset

In [142]:
# Relative path to the FAQ CSV; it is resolved against the notebook's working directory.
path = "DataScience_FAQ.csv"
df = pd.read_csv(path)

In [143]:
# Preview the loaded FAQ table.
df



Unnamed: 0,S.No,Question,Answer
0,,data,"the quantities, characters, or symbols on whic..."
1,,Data Science,Data science is an interdisciplinary field tha...
2,,Where should I learn Data Science from?,Kaggle is the most popular learning platform a...
3,,Do I need an extremely powerful computer to do...,While this may be the case for industrial leve...
4,,"Can you recommend some blogs, podcasts, course...","KDNuggets Blog, R Bloggers Blog Aggregator and..."
...,...,...,...
114,,confounding variables,These are extraneous variables in a statistica...
115,,eigenvalue,Eigenvalues are the directions along which a p...
116,,eigenvector,Eigenvectors are for understanding linear tran...
117,,selection bias,"Selection bias, in general, is a problematic s..."


In [144]:
# Drop the serial-number column. Rebinding `df` (instead of inplace=True) and
# passing errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['S.No'], errors='ignore')

In [145]:
# Rename the columns by position; this requires the frame to have exactly two
# columns at this point, in question/answer order.
df.columns = ['question','answer']

In [146]:
# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): `stop_words` is not referenced by any cell visible here.
stop_words=stopwords.words('english')
# Module-level lemmatizer used by clean_data().
lemmatizer = WordNetLemmatizer()

### Clean our training data

In [147]:

def clean_data(text):
    """Normalise a piece of text before vectorisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, each whitespace-separated token is passed
    through the module-level ``lemmatizer``, and the tokens are joined back
    together with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return " ".join(
        lemmatizer.lemmatize(token) for token in without_punctuation.split()
    )
# Normalise the original questions before augmenting them.
df["question"] = df["question"].apply(clean_data)

# Location of the GloVe vectors: read from the GLOVE_PATH environment variable
# when it is set, otherwise fall back to the original local path.
GLOVE_PATH = os.environ.get(
    "GLOVE_PATH",
    "C:/Users/Sourav Kumar/Documents/College Assignment/glove.6B.100d.txt")
N_AUGMENTATIONS = 4  # augmentation attempts per original question

# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='glove', model_path=GLOVE_PATH,
    action="substitute")

# Keyed by augmented question, so identical augmented strings collapse into a
# single entry and the last answer written for that string wins.
aug_data = {}
for ques, ans in tqdm(zip(df['question'], df['answer']), total=len(df)):
    for _ in range(N_AUGMENTATIONS):
        augmented = aug.augment(ques)
        # Newer nlpaug releases return a list of strings rather than a single
        # string; a list is unhashable and cannot be used as a dict key.
        if isinstance(augmented, str):
            augmented = [augmented]
        for new_ques in augmented:
            aug_data[new_ques] = ans

119it [00:06, 19.29it/s]


In [148]:
# One row per augmented question, paired with the answer stored for it.
aug_rows = list(aug_data.items())
aug_df = pd.DataFrame(aug_rows, columns=['question', 'answer'])
aug_df

Unnamed: 0,question,answer
0,uses,"the quantities, characters, or symbols on whic..."
1,according,"the quantities, characters, or symbols on whic..."
2,these,"the quantities, characters, or symbols on whic..."
3,files,Raw Facts and figure
4,equipment science,Data science is an interdisciplinary field tha...
...,...,...
457,selection motivated,"Selection bias, in general, is a problematic s..."
458,matrimonial bias,Survivorship bias is the logical error of focu...
459,backplate bias,Survivorship bias is the logical error of focu...
460,greenspace bias,Survivorship bias is the logical error of focu...


In [149]:
# Stack the original and augmented rows. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating the index labels of both inputs.
final_df = pd.concat([df, aug_df], ignore_index=True)

In [150]:
# Write the combined question/answer table to CSV without the index column.
final_df.to_csv("augmented1.csv",index=False)

### Modeling

In [151]:
# Load the combined dataset from the CSV file, replacing the in-memory frame.
final_df = pd.read_csv("augmented1.csv")

In [152]:
# Preview the frame that the modelling cells below work on.
final_df

Unnamed: 0,question,answer
0,data,"the quantities, characters, or symbols on whic..."
1,data science,Data science is an interdisciplinary field tha...
2,where should i learn data science from,Kaggle is the most popular learning platform a...
3,do i need an extremely powerful computer to do...,While this may be the case for industrial leve...
4,can you recommend some blog podcasts course et...,"KDNuggets Blog, R Bloggers Blog Aggregator and..."
...,...,...
576,selection motivated,"Selection bias, in general, is a problematic s..."
577,matrimonial bias,Survivorship bias is the logical error of focu...
578,backplate bias,Survivorship bias is the logical error of focu...
579,greenspace bias,Survivorship bias is the logical error of focu...


In [153]:
# Model input is the question column; the target is the answer column.
X = final_df['question']
y = final_df['answer']

In [154]:
# Encode each distinct answer as an integer class id.
# Fitting on the source column (not on `y` itself) keeps the cell idempotent:
# re-running it never fits the encoder on already-encoded integers, so
# le.inverse_transform keeps returning answer strings.
le = LabelEncoder()
y = le.fit_transform(final_df['answer'])

In [155]:
# TF-IDF over uni-, bi- and tri-grams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# vocabulary the original integer min_df=0 produced; scikit-learn releases with
# parameter validation reject an integer min_df below 1.
tf = TfidfVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
X_w = tf.fit_transform(X)

# Linear classifier trained with SGD; one class per distinct answer.
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_w,y)

SGDClassifier(alpha=0.0005, loss='modified_huber', n_jobs=-1, random_state=100)

In [156]:
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [157]:

#X_train_tf = tf.fit_transform(X_train)

In [158]:
# X_test_tf = tf.transform(X_test)

In [159]:
# model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
# model.fit(X_train_tf,y_train)

In [160]:
# Predict on X_w, the same matrix the model was fitted on (no held-out split).
y_pred = model.predict(X_w)

In [161]:

# One-hot indicator matrices for the true and predicted class ids, built over
# the same label set so their columns line up.
labels = np.unique(y)
ytest_prob, ypred_prob = (
    label_binarize(class_ids, classes=labels) for class_ids in (y, y_pred)
)

In [162]:
# Show the repr of the TF-IDF matrix (shape and number of stored elements).
X_w

<581x1277 sparse matrix of type '<class 'numpy.float64'>'
	with 2228 stored elements in Compressed Sparse Row format>

In [163]:
# NOTE(review): y and y_pred both come from the data the model was fitted on,
# so these are training-set scores rather than held-out estimates.
# Micro-averaged precision and recall equal accuracy when every sample carries
# exactly one label, which is why the first three printed values match.
# The ROC-AUC inputs are one-hot matrices built from hard labels, not scores.
print("Accuracy Score:",accuracy_score(y,y_pred))
print("Precision Score:",precision_score(y,y_pred,average='micro'))
print("Recall Score:",recall_score(y,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

Accuracy Score: 0.9707401032702238
Precision Score: 0.9707401032702238
Recall Score: 0.9707401032702238
ROC-AUC Score: 0.9852460690218501


In [164]:
# Spot-check one row: show the question, the model's answer and the true answer.
idx = 7
print(f"Question: {X.iloc[idx]}")
predicted_id = model.predict(X_w[idx])
print(f"\nPredicted Answer:\n{le.inverse_transform(predicted_id)[0]}")
actual_id = [y[idx]]
print(f"\nActual Answer:\n{le.inverse_transform(actual_id)[0]}")

Question: what doe a data science job usually involve

Predicted Answer:
A very common misconception is that data scientists spend the majority of their time building predictive models and feature engineering.

Actual Answer:
A very common misconception is that data scientists spend the majority of their time building predictive models and feature engineering.


In [167]:
# Persist the fitted classifier. The context manager flushes and closes the
# file, so the pickle is complete on disk before anything reads it back.
# Only `model` is stored here; the fitted vectorizer and label encoder are not.
with open('DS_chatbot.pkl', 'wb') as file:
    pickle.dump(model, file)



In [168]:
# Reload the pickled classifier into `forest`, closing the file afterwards.
# pickle.load can execute arbitrary code, so only unpickle trusted files.
with open('DS_chatbot.pkl', 'rb') as model2:
    forest = pickle.load(model2)

In [None]:
# Answer a raw query with the reloaded classifier. Uses the vectorizer `tf` and
# encoder `le` fitted in the modelling cells above, so the cell runs in
# top-to-bottom order instead of depending on objects created further down.
le.inverse_transform(forest.predict(tf.transform([clean_data('data')])))

array(['Raw Facts and figure'], dtype=object)

In [None]:
# Second vectorizer/encoder pair with the same settings as `tf` and `le`.
# min_df=1 keeps every term seen in at least one document (the vocabulary the
# original integer min_df=0 produced); scikit-learn releases with parameter
# validation reject an integer min_df below 1.
tf1 = TfidfVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
le1 = LabelEncoder()

In [None]:
# NOTE(review): this rebinds `y` to the raw answer column, replacing the
# integer-encoded `y` created earlier; re-running the metric cells after this
# cell would compare answer strings with integer predictions.
y = final_df['answer']
# Fit the second encoder on the answers and keep the encoded ids in `y_t`.
y_t = le1.fit_transform(y)

In [None]:
# Fit the second vectorizer on the question column `X` and transform it.
X_t = tf1.fit_transform(X)

In [None]:
# NOTE(review): in the cells visible here `X_test` is only assigned inside the
# commented-out train/test split, so this line needs that split restored first.
X_tes = tf1.transform(X_test)