# Objective :
## To identify similar questions that have been already answered on a forum.
##  To provide the customers with a solution in the quickest way possible.

### Technique Used:
###   - Word Embedding Using infersent
###   - Cosine Similarity


# Imports

In [0]:
# Import Libraries that will be used for this project
import numpy as np
import pandas as pd
import time
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
from matplotlib import pyplot as plt
%matplotlib inline

In [258]:
# Mount google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Give the path 
import os
os.chdir('/content/drive/My Drive/capstone nlp')

# Data

In [260]:
#Importing dataset 
df = pd.read_csv("capstonelinks.csv")
df.shape

(38496, 5)

In [261]:
# dataset structure
df.head(10)

Unnamed: 0.1,Unnamed: 0,Questions,Summary,Tag,Link
0,0,Named Entity Recognition on reviews,i have set of reviews as mentioned belowinputT...,"python-3.x,machine-learning,spacy,text-analyti...",https://stackoverflow.com/questions/61024502/n...
1,1,When to do validation when we are training bas...,Sorry if this question seems a bit odd and per...,"validation,machine-learning,neural-network,tra...",https://stackoverflow.com/questions/61024500/w...
2,2,How to go about learning sklearn and scipy lib...,I have started an online course from Appliedco...,"python,machine-learning,scikit-learn,scipy,art...",https://stackoverflow.com/questions/61024110/h...
3,3,"I am trying to make binary classifications, bu...",Trying to make binary classifications and divi...,"pandas,dataframe,machine-learning,data-science",https://stackoverflow.com/questions/61024038/i...
4,4,Best practices in the selection of distance me...,I have been reading about this on various chan...,"r,machine-learning,cluster-analysis,distance,c...",https://stackoverflow.com/questions/61023899/b...
5,5,Smart compose in android,I am looking for a way to build smart compose ...,"python,machine-learning,gmail,artificial-intel...",https://stackoverflow.com/questions/61023709/s...
6,6,Very Big CSV File How to Read Only Certain Ro...,I have a very large csv file that I cannot use...,"python,pandas,csv,machine-learning,dask",https://stackoverflow.com/questions/61023632/v...
7,7,How can I teach a model using single class in ...,I am new to machine learning and currently got...,"python,machine-learning,keras",https://stackoverflow.com/questions/61023400/h...
8,8,tensorflow multilabel classification Incompat...,I am trying to create a multilabel classifier ...,"python,tensorflow,machine-learning,conv-neural...",https://stackoverflow.com/questions/61022922/t...
9,9,Is it possible to use a keras ModelCheckpoint ...,Can ModelCheckpoints or any combination of cal...,"machine-learning,keras,callback",https://stackoverflow.com/questions/61022542/i...


In [262]:
# Questions in the dataset
m=df["Questions"].head(50)
#print(m)
for i in m:
  print(i)


Named Entity Recognition on reviews
When to do validation when we are training based off of training steps
How to go about learning sklearn and scipy libraries 
I am trying to make binary classifications, but I ended up getting this error 
Best practices in the selection of distance metric and clustering methods for gene expression data in R
Smart compose in android 
Very Big CSV File  How to Read Only Certain Rows into Data Frame
How can I teach a model using single class in machine learning using Keras
tensorflow multilabel classification  Incompatible shapes 7,5 vs. 
Is it possible to use a keras ModelCheckpoint that minimizes both valloss and testloss
CNNLSTM with extra data but I got an error 
In Classification, can feature engineering by frequent onehot encoding prove ineffective
How can I write a function of Naive Bayes classifier which can handle both numerical and nominal attributes in python 
Cost function is rising with my gradient descent and not sure with the theory i read

# Infersent Model

In [0]:
# Infersent class for training
class InferSent(nn.Module):

    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        # either all weights are on cpu or they are on gpu
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        # sent_len: [max_len, ..., min_len] (bsize)
        # sent: (seqlen x bsize x worddim)
        sent, sent_len = sent_tuple
        print(sent.shape,sent_len.shape)
        # Sort by length (keep idx)
        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
        sent_len_sorted = sent_len_sorted.copy()
        idx_unsort = np.argsort(idx_sort)

        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_sort)
        sent = sent.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        print("sentpacked",sent_packed)
        print("sent_sorted",sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_unsort)
        sent_output = sent_output.index_select(1, idx_unsort)
        print("sent_output",sent_output.shape)

        # Pooling
        if self.pool_type == "mean":
            sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda()
            emb = torch.sum(sent_output, 0).squeeze(0)
            emb = emb / sent_len.expand_as(emb)
        elif self.pool_type == "max":
            if not self.max_pad:
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
            if emb.ndimension() == 3:
                emb = emb.squeeze(0)
                assert emb.ndimension() == 2

        return emb

    def set_w2v_path(self, w2v_path):
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
        print("sentences after tokenization",sentences)
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        print("word_vec",word_vec.keys())
        return word_vec

    def get_w2v_k(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        k = 0
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if k <= K:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                    k += 1
                if k > K:
                    if word in [self.bos, self.eos]:
                        word_vec[word] = np.fromstring(vec, sep=' ')

                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))
        #print(self.word_vec)

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (K))

    def update_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # udpate vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = []
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))
        print("embed",embed.shape)
        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        # tokenize the sentences
        from nltk.tokenize import word_tokenize
        if self.moses_tok:
            s = ' '.join(word_tokenize(s))
            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
            return s.split()
        else:
            return word_tokenize(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
                               Replacing by "</s>"..' % (sentences[i], i))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
        sentences = np.array(sentences)[idx_sort]

        return sentences, lengths, idx_sort

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        tic = time.time()
        sentences, lengths, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)   #[[sentense1_tokens],["sentense_2_tokens"]]
                                                               # [sent1_len,sent2_len]
                                                               # [flagofsentence]

        print(f"Sentences:{sentences}, length{lengths},idx_sort{idx_sort}")
        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            print("stidx",stidx)
            print("before_get_batch",sentences[stidx:stidx + bsize])
            batch = self.get_batch(sentences[stidx:stidx + bsize])
            print("batch",batch,batch.shape)
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
                print(batch.shape)
            embeddings.append(batch)
        embeddings = np.vstack(embeddings)

        # unsort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):

        sent = sent.split() if not tokenize else self.tokenize(sent)
        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]

        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing \
                           by "%s %s"..' % (sent, self.bos, self.eos))
        batch = self.get_batch(sent)

        if self.is_cuda():
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs, x, y,sent


# Sentence Embedding

In [264]:
# Assign the batch size ,model version ,word_emb_dim that is to passed to class infersent
model_version = 2
MODEL_PATH = 'infersent%s.pkl' % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [0]:
# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

In [0]:
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [0]:
# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
# crawl-300d-2M is used here
W2V_PATH = 'crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [268]:
for line in df['Questions']:
  print(line)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finding the most treelike hierarchy that explains the data
Event Extractor from text
Speedefficient classification in Matlab
how much linear algebra should i know to study machine learning 
compute AUC metric for Matrix Factorization output
Protein interaction dataset and MLN script explanation
How to speed up the model creation process of OpenNLP
dyld Library not loaded lib/libopencvcore.3.0.dylib Reason image not found
How to get most informative features for scikitlearn classifier for different class
R, Confusion Matrix in percent
Naive Bayes classifier  accuracy
How to use a custom SVM kernel
Confused with repect to working of GridSearchCV
How to detect text region from a document image
Detecting danger in tweets 
Sentiment Analysis java Library 
extracting the meaning of a sentence
Python scikit regression PCA on faces
Neural network with batch training algorithm, when to apply momentum and weight decay
Why is there 

In [269]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [270]:
# Building model voacb
model.build_vocab(df.Questions.values)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Found 14084(/22663) words with w2v vectors
Vocab size : 14084


In [271]:
# Encoding the model
embedding = model.encode(df.Questions.values,tokenize=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 list(['<p>', 'Module', 'has', 'no', 'attribute', '</p>'])
 list(['<p>', 'not', 'showing', 'column', 'image', '</p>'])
 list(['<p>', 'Azure', 'machine', 'learning', 'classification', '</p>'])
 list(['<p>', 'Modeling', 'using', 'Neural', 'Network', '</p>'])
 list(['<p>', 'Depth', 'Estimation', 'using', 'Keras', '</p>'])
 list(['<p>', 'Joining', 'two', 'in', 'Keras', '</p>'])
 list(['<p>', 'Theano', 'learning', 'AND', 'gate', '</p>'])
 list(['<p>', 'Imbalance', 'Data', 'For', 'Classification', '</p>'])
 list(['<p>', 'Error', 'during', 'feature', 'selection', '</p>'])
 list(['<p>', 'Neural', 'Networks', 'activation', 'function', '</p>'])
 list(['<p>', 'Bayesian', 'curve', 'fitting', 'model', '</p>'])
 list(['<p>', 'time', 'series', 'anomaly', 'detection', '</p>'])]
embed (6, 64, 300)
batch tensor([[[-0.3398,  0.3010,  0.1689,  ...,  0.0639,  0.0120, -0.1761],
         [-0.3398,  0.3010,  0.1689,  ...,  0.0639,  0.0120, -0.17

In [272]:
# shape of the embedding numpy array
embedding.shape

(38496, 4096)

In [0]:
# Saving the model
import pickle
pickle.dump(model, open('modelAnkitnew.pkl','wb'))

In [0]:
# Saving the embedding numpy array
import pickle
embedlinks=embedding[:10000][:]
pickle.dump(embedlinks, open('embedlinkspeak10000.pkl','wb'))

In [0]:
# From sklearn use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
x=cosine_similarity(embedding)

# 1st Level Filtering based on question

## Question Asked:"How is cross validation implemented"

In [276]:
# Displaying the most similar question to the original question
import numpy as np     

np.fill_diagonal(x, np.nan)   # Putting nan value in the diagonal so that similar question and original question is not same                                                                                                                                                                                                                          
input_idx = list(np.where(df["Questions"] == "How is cross validation implemented")[0])     # input index of question asked                                                                                                                                                                                                                
result_idx = np.nanargmax(x[input_idx])   # nanargmax is used to display similar question                                                                                                                                                                                                             
t=df.iloc[result_idx] 
t['Questions']
                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                     


'How to implement kfold cross validation in hmmlearn'

In [277]:
# argsort is used to display top similar questions to the question asked
L=np.argsort(x[input_idx],axis=1)
t1=L[:,-2]
t2=L[:,-3]
t3=L[:,-4]
t4=L[:,-5]
t5=L[:,-6]
t6=L[:,-7]
t7=L[:,-8]
t8=L[:,-9]
t9=L[:,-10]
t10=L[:,-11]
t1=np.asscalar(t1)   # converting to number
t2=np.asscalar(t2)
t3=np.asscalar(t3)
t4=np.asscalar(t4)
t5=np.asscalar(t5)
t6=np.asscalar(t6)
t7=np.asscalar(t7)
t8=np.asscalar(t8)
t9=np.asscalar(t9)
t10=np.asscalar(t10)
df.iloc[[t1,t2,t3,t4,t5,t6,t7,t8,t9,t10]]         

  if sys.path[0] == '':
  del sys.path[0]
  
  from ipykernel import kernelapp as app
  app.launch_new_instance()


Unnamed: 0.1,Unnamed: 0,Questions,Summary,Tag,Link
22870,22870,How to implement kfold cross validation in hmm...,The hmmlearn tutorial demonstrates how a Hidde...,"python,machine-learning,cross-validation,hmmle...",https://stackoverflow.com/questions/44667898/h...
7683,7683,How to create a kfold cross validation test,I have a data from a pollution sensor that I w...,"python,machine-learning,cross-validation,k-fold",https://stackoverflow.com/questions/57021928/h...
35298,35298,How does cross validation work for testing,So Im having some trouble on understanding how...,machine-learning,https://stackoverflow.com/questions/20075338/h...
23650,23650,How to get cross validation accuracy,I am learning dataset from this page.https//ww...,"matlab,machine-learning,naivebayes",https://stackoverflow.com/questions/43903276/h...
12035,12035,How to apply cross validation on data,I want to evaluate a ML model using the averag...,"machine-learning,scikit-learn,random-forest,cr...",https://stackoverflow.com/questions/53873571/h...
1795,1795,What kind of cross validation to use,I have a text dataset that consists of about 5...,"machine-learning,deep-learning,nlp,cross-valid...",https://stackoverflow.com/questions/60315352/w...
7351,7351,How do you do cross validation correctly,I am confused on how to properly do kfold cros...,"machine-learning,cross-validation",https://stackoverflow.com/questions/57233712/h...
34891,34891,Why use cross validation,I am entering several Kaggle Machine Learning ...,"machine-learning,artificial-intelligence,cross...",https://stackoverflow.com/questions/21445750/w...
26824,26824,Which model should kfold cross validation return,I have read a bit about kfold cross validation...,machine-learning,https://stackoverflow.com/questions/40347615/w...
366,366,kfold cross validation using DataLoaders in Py...,I have splitted my training dataset into 80 tr...,"machine-learning,deep-learning,computer-vision...",https://stackoverflow.com/questions/60883696/k...


# 2nd Level Filtering based on question and summary

In [278]:
# making a dataframe from question asked
level2Question=df.loc[df['Questions']=='How is cross validation implemented']
level2Question

Unnamed: 0.1,Unnamed: 0,Questions,Summary,Tag,Link
26077,26077,How is cross validation implemented,I am currently trying to train a neural networ...,"validation,machine-learning,cross-validation",https://stackoverflow.com/questions/41216976/h...


In [0]:
#most similar questions dataframe
level2=df.iloc[[t1,t2,t3,t4,t5,t6,t7,t8,t9,t10]]


In [280]:
# Append the question dataframe to most similar questions dataframe
level2Question.append(level2)

Unnamed: 0.1,Unnamed: 0,Questions,Summary,Tag,Link
26077,26077,How is cross validation implemented,I am currently trying to train a neural networ...,"validation,machine-learning,cross-validation",https://stackoverflow.com/questions/41216976/h...
22870,22870,How to implement kfold cross validation in hmm...,The hmmlearn tutorial demonstrates how a Hidde...,"python,machine-learning,cross-validation,hmmle...",https://stackoverflow.com/questions/44667898/h...
7683,7683,How to create a kfold cross validation test,I have a data from a pollution sensor that I w...,"python,machine-learning,cross-validation,k-fold",https://stackoverflow.com/questions/57021928/h...
35298,35298,How does cross validation work for testing,So Im having some trouble on understanding how...,machine-learning,https://stackoverflow.com/questions/20075338/h...
23650,23650,How to get cross validation accuracy,I am learning dataset from this page.https//ww...,"matlab,machine-learning,naivebayes",https://stackoverflow.com/questions/43903276/h...
12035,12035,How to apply cross validation on data,I want to evaluate a ML model using the averag...,"machine-learning,scikit-learn,random-forest,cr...",https://stackoverflow.com/questions/53873571/h...
1795,1795,What kind of cross validation to use,I have a text dataset that consists of about 5...,"machine-learning,deep-learning,nlp,cross-valid...",https://stackoverflow.com/questions/60315352/w...
7351,7351,How do you do cross validation correctly,I am confused on how to properly do kfold cros...,"machine-learning,cross-validation",https://stackoverflow.com/questions/57233712/h...
34891,34891,Why use cross validation,I am entering several Kaggle Machine Learning ...,"machine-learning,artificial-intelligence,cross...",https://stackoverflow.com/questions/21445750/w...
26824,26824,Which model should kfold cross validation return,I have read a bit about kfold cross validation...,machine-learning,https://stackoverflow.com/questions/40347615/w...


In [282]:
level2Summary=level2Question.append(level2)
for line in level2Summary['Summary']:
  print(line)

I am currently trying to train a neural network using cross validation, but I am not sure if I am getting how cross validation works. I understand the concept, but I cant totally see yet how the concept ...
The hmmlearn tutorial demonstrates how a Hidden Markov Model can be fitted to a datasetmodel  hmm.GaussianHMMncomponents3, covariancetypefull, niter100model.fitXIs there a built...
I have a data from a pollution sensor that I wish to validate. I am comparing it to data from londonair.org.uk to compare it. I have created a simple linear regression model with my sensor data on the ...
So Im having some trouble on understanding how cross validation works in machine learning for model building.Suppose I have a dataset with 100 samples, and I perform 10 fold cross validation.From ...
I am learning dataset from this page.https//www.mathworks.com/help/stats/examples/classification.htmlThe code I have now isload irisdataset.datgscattermeas,1, meas,2, species,rgb,...
I want to evaluate a ML 

In [283]:
# vocab of model
model.build_vocab(level2Summary.Summary.values)

sentences after tokenization [['I', 'am', 'currently', 'trying', 'to', 'train', 'a', 'neural', 'network', 'using', 'cross', 'validation', ',', 'but', 'I', 'am', 'not', 'sure', 'if', 'I', 'am', 'getting', 'how', 'cross', 'validation', 'works', '.', 'I', 'understand', 'the', 'concept', ',', 'but', 'I', 'cant', 'totally', 'see', 'yet', 'how', 'the', 'concept', '...'], ['The', 'hmmlearn', 'tutorial', 'demonstrates', 'how', 'a', 'Hidden', 'Markov', 'Model', 'can', 'be', 'fitted', 'to', 'a', 'datasetmodel', 'hmm.GaussianHMMncomponents3', ',', 'covariancetypefull', ',', 'niter100model.fitXIs', 'there', 'a', 'built', '...'], ['I', 'have', 'a', 'data', 'from', 'a', 'pollution', 'sensor', 'that', 'I', 'wish', 'to', 'validate', '.', 'I', 'am', 'comparing', 'it', 'to', 'data', 'from', 'londonair.org.uk', 'to', 'compare', 'it', '.', 'I', 'have', 'created', 'a', 'simple', 'linear', 'regression', 'model', 'with', 'my', 'sensor', 'data', 'on', 'the', '...'], ['So', 'Im', 'having', 'some', 'trouble', '

In [284]:
# Encoding the summary of the model
embedding1 = model.encode(level2Summary.Summary.values,tokenize=True)

Sentences:[list(['<p>', 'I', 'am', 'currently', 'trying', 'to', 'train', 'a', 'neural', 'network', 'using', 'cross', 'validation', ',', 'but', 'I', 'am', 'not', 'sure', 'if', 'I', 'am', 'getting', 'how', 'cross', 'validation', 'works', '.', 'I', 'understand', 'the', 'concept', ',', 'but', 'I', 'cant', 'totally', 'see', 'yet', 'how', 'the', 'concept', '...', '</p>'])
 list(['<p>', 'I', 'have', 'read', 'a', 'bit', 'about', 'cross', 'validation', '.', 'I', 'know', 'that', 'it', 'measures', 'each', 'models', 'performance', 'as', 'a', 'number', 'and', 'returns', 'the', 'average', '.', 'But', 'I', 'still', 'dont', 'know', ',', 'which', 'of', 'the', 'k', 'models', 'is', 'returned', 'as', 'a', '...', '</p>'])
 list(['<p>', 'I', 'have', 'a', 'data', 'from', 'a', 'pollution', 'sensor', 'that', 'I', 'wish', 'to', 'validate', '.', 'I', 'am', 'comparing', 'it', 'to', 'data', 'from', 'to', 'compare', 'it', '.', 'I', 'have', 'created', 'a', 'simple', 'linear', 'regression', 'model', 'with', 'my', 'se

In [285]:
embedding1.shape

(11, 4096)

In [286]:
# cosine similarity on the encoded summary
y=cosine_similarity(embedding1)
y

array([[1.0000002 , 0.44677645, 0.65196824, 0.62999755, 0.51978886,
        0.75955594, 0.6531895 , 0.68950254, 0.633119  , 0.7005533 ,
        0.7224381 ],
       [0.44677645, 1.        , 0.53578514, 0.46893048, 0.41626927,
        0.4987889 , 0.48775285, 0.46634987, 0.47827536, 0.48248082,
        0.43745732],
       [0.65196824, 0.53578514, 1.0000001 , 0.6304186 , 0.56530774,
        0.6833227 , 0.68157494, 0.659302  , 0.61168474, 0.67167807,
        0.65682817],
       [0.62999755, 0.46893048, 0.6304186 , 0.9999998 , 0.52255493,
        0.6745768 , 0.7434357 , 0.6824467 , 0.6506576 , 0.6524363 ,
        0.6917036 ],
       [0.51978886, 0.41626927, 0.56530774, 0.52255493, 0.99999994,
        0.5044168 , 0.5854488 , 0.5533823 , 0.526441  , 0.49298272,
        0.5150052 ],
       [0.75955594, 0.4987889 , 0.6833227 , 0.6745768 , 0.5044168 ,
        1.0000001 , 0.716784  , 0.69733196, 0.66135925, 0.7239845 ,
        0.75512826],
       [0.6531895 , 0.48775285, 0.68157494, 0.7434357 , 0.

In [287]:
np.fill_diagonal(y, np.nan)
y

array([[       nan, 0.44677645, 0.65196824, 0.62999755, 0.51978886,
        0.75955594, 0.6531895 , 0.68950254, 0.633119  , 0.7005533 ,
        0.7224381 ],
       [0.44677645,        nan, 0.53578514, 0.46893048, 0.41626927,
        0.4987889 , 0.48775285, 0.46634987, 0.47827536, 0.48248082,
        0.43745732],
       [0.65196824, 0.53578514,        nan, 0.6304186 , 0.56530774,
        0.6833227 , 0.68157494, 0.659302  , 0.61168474, 0.67167807,
        0.65682817],
       [0.62999755, 0.46893048, 0.6304186 ,        nan, 0.52255493,
        0.6745768 , 0.7434357 , 0.6824467 , 0.6506576 , 0.6524363 ,
        0.6917036 ],
       [0.51978886, 0.41626927, 0.56530774, 0.52255493,        nan,
        0.5044168 , 0.5854488 , 0.5533823 , 0.526441  , 0.49298272,
        0.5150052 ],
       [0.75955594, 0.4987889 , 0.6833227 , 0.6745768 , 0.5044168 ,
               nan, 0.716784  , 0.69733196, 0.66135925, 0.7239845 ,
        0.75512826],
       [0.6531895 , 0.48775285, 0.68157494, 0.7434357 , 0.

In [288]:
input_idx1 = list(np.where(level2Summary["Questions"] == "How is cross validation implemented")[0])
input_idx1

[0]

In [289]:
result_idx1 = np.nanargmax(y[input_idx1])                                                                                                                                                                                                                
level2Summary.iloc[result_idx1]                                                                                                                                                                                                                             
                                                                                                                                                                                                                   


Unnamed: 0                                                12035
Questions                 How to apply cross validation on data
Summary       I want to evaluate a ML model using the averag...
Tag           machine-learning,scikit-learn,random-forest,cr...
Link          https://stackoverflow.com/questions/53873571/h...
Name: 12035, dtype: object

# Top 5 similar questions after filtering based on question and summary

In [290]:
# After filtering both question and summary we get the final dataframe
L=np.argsort(y[input_idx1],axis=1)
t1=L[:,-2]
t2=L[:,-3]
t3=L[:,-4]
t4=L[:,-5]
t5=L[:,-6]
t1=np.asscalar(t1)
t2=np.asscalar(t2)
t3=np.asscalar(t3)
t4=np.asscalar(t4)
t5=np.asscalar(t5)
level2Summary.iloc[[t1,t2,t3,t4,t5]]   # final dataframe

  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0.1,Unnamed: 0,Questions,Summary,Tag,Link
12035,12035,How to apply cross validation on data,I want to evaluate a ML model using the averag...,"machine-learning,scikit-learn,random-forest,cr...",https://stackoverflow.com/questions/53873571/h...
366,366,kfold cross validation using DataLoaders in Py...,I have splitted my training dataset into 80 tr...,"machine-learning,deep-learning,computer-vision...",https://stackoverflow.com/questions/60883696/k...
26824,26824,Which model should kfold cross validation return,I have read a bit about kfold cross validation...,machine-learning,https://stackoverflow.com/questions/40347615/w...
7351,7351,How do you do cross validation correctly,I am confused on how to properly do kfold cros...,"machine-learning,cross-validation",https://stackoverflow.com/questions/57233712/h...
1795,1795,What kind of cross validation to use,I have a text dataset that consists of about 5...,"machine-learning,deep-learning,nlp,cross-valid...",https://stackoverflow.com/questions/60315352/w...


In [291]:
for i in level2Summary["Questions"]:
    print(i)

How is cross validation implemented
How to implement kfold cross validation in hmmlearn
How to create a kfold cross validation test
How does cross validation work for testing
How to get cross validation accuracy
How to apply cross validation on data
What kind of cross validation to use
How do you do cross validation correctly 
Why use cross validation 
Which model should kfold cross validation return
kfold cross validation using DataLoaders in PyTorch


# New Questions asked in stackoverflow forum

## Question Asked:"machine learning model"
## Summary:"machine learning"

In [303]:
d={'Questions':"machine learning model",'Summary':"machine learning"}
new=pd.DataFrame([d])
new         # Dataframe for new question and answer 


Unnamed: 0,Questions,Summary
0,machine learning model,machine learning


In [304]:
# encoding question of new dataframe
newd = model.encode(new.Questions.values,tokenize=True)

Sentences:[['<p>' 'machine' 'learning' 'model' '</p>']], length[5],idx_sort[0]
stidx 0
before_get_batch [['<p>' 'machine' 'learning' 'model' '</p>']]
embed (5, 1, 300)
batch tensor([[[-0.3398,  0.3010,  0.1689,  ...,  0.0639,  0.0120, -0.1761]],

        [[ 0.3476, -0.0858,  0.0831,  ...,  0.0441, -0.0252, -0.0936]],

        [[-0.1421, -0.2200, -0.2103,  ..., -0.0515,  0.3455, -0.2057]],

        [[-0.2713,  0.0176,  0.4712,  ...,  0.1506, -0.0695, -0.1137]],

        [[-0.2010,  0.3212, -0.0270,  ...,  0.1667, -0.0982, -0.0186]]]) torch.Size([5, 1, 300])
torch.Size([5, 1, 300]) (1,)
sentpacked PackedSequence(data=tensor([[-0.3398,  0.3010,  0.1689,  ...,  0.0639,  0.0120, -0.1761],
        [ 0.3476, -0.0858,  0.0831,  ...,  0.0441, -0.0252, -0.0936],
        [-0.1421, -0.2200, -0.2103,  ..., -0.0515,  0.3455, -0.2057],
        [-0.2713,  0.0176,  0.4712,  ...,  0.1506, -0.0695, -0.1137],
        [-0.2010,  0.3212, -0.0270,  ...,  0.1667, -0.0982, -0.0186]]), batch_sizes=tensor([1, 1,

In [305]:
# cosine similarity on new question asked dataframe and the original dataframe
L=[]
for i in range(len(embedding)):
  cosine=cosine_similarity(embedding[i].reshape(1,-1),newd.reshape(1,-1))  # value after cosine similarity
  cosine
  z=str(cosine).replace('[','').replace(']','')  # remove [ tags from array
  print(z)
  L.append(z)
  L
  npa = np.asarray(L, dtype=np.float32)
npa        # numpy array 

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.2917515
0.41705617
0.51103824
0.42471647
0.37232363
0.42196214
0.3454489
0.10642578
0.32618177
0.25243992
0.4324445
0.39665544
0.34042537
0.35897654
0.34119618
0.40361243
0.40343964
0.33257765
0.26501638
0.19123723
0.34159145
0.46960795
0.18824816
0.46476614
0.2508157
0.30308622
0.35761625
0.34155998
0.46064836
0.0480756
0.38137394
0.4875955
0.26024103
0.36482656
0.26290518
0.21857671
0.41010517
0.47715175
0.36148357
0.3980922
0.48689097
0.4722358
0.16364527
0.41551068
0.43979305
0.24025613
0.4022683
0.38050616
0.278855
0.4351678
0.33703208
0.5071886
0.344545
0.29830506
0.4043988
0.44969213
0.18444312
0.39033556
0.3445208
0.4451235
0.24552962
0.38280892
0.20692948
0.40425062
0.5856088
0.49590334
0.25355542
0.41431996
0.40508303
0.392249
0.42027977
0.11998904
0.50333416
0.1418265
0.38780057
0.35773042
0.3581543
0.25819173
0.25832075
0.17825541
0.37019834
0.44415194
0.19429143
0.64883757
0.4270531
0.3289405
0.61702305
0.0

array([0.39823952, 0.3022942 , 0.27777642, ..., 0.48312193, 0.27615023,
       0.21258092], dtype=float32)

# 1st Level filtering based on question

In [0]:
# Display top 25 similar questions
newquestions=npa.argsort()[:-26:-1]

In [307]:
npa[newquestions]

array([0.89553607, 0.88530374, 0.88360226, 0.8708282 , 0.8617483 ,
       0.84713745, 0.8404884 , 0.83513784, 0.8249777 , 0.8224673 ,
       0.8182874 , 0.816903  , 0.8123721 , 0.8121997 , 0.8105582 ,
       0.81007427, 0.80899954, 0.8043939 , 0.79588497, 0.7931564 ,
       0.7922145 , 0.79109   , 0.7902652 , 0.7886176 , 0.78445256],
      dtype=float32)

In [0]:
m=df.iloc[newquestions]

In [309]:
# Displaying question asked and 25 top similar questions  after 1st level filtering
level2Summarynew=new.append(m)
level2Summarynew.drop('Unnamed: 0', axis=1, inplace=True)
level2Summarynew

Unnamed: 0,Questions,Summary,Tag,Link
0,machine learning model,machine learning,,
4095,machine learning and model training,I am working on a machine learning project whe...,"machine-learning,deep-learning",https://stackoverflow.com/questions/59263928/m...
18153,machine learning Instancebased learning,I am quite new to machine learning and Ive bee...,"machine-learning,artificial-intelligence",https://stackoverflow.com/questions/48918391/m...
28467,Export machine learning model,I am creating a machine learning algorithm and...,"python,python-2.7,machine-learning,scikit-learn",https://stackoverflow.com/questions/38021937/e...
36910,machine learning and mestimate,I am working on a machine learning problem and...,"machine-learning,artificial-intelligence",https://stackoverflow.com/questions/12272269/m...
5237,inavalid element type in machine learning model,I am using a simple model of tensorflow/tfjswh...,"javascript,react-native,machine-learning,tenso...",https://stackoverflow.com/questions/58724591/i...
24920,bayesianoptimization in machine learning,Thanks for reading this. I am currently studyi...,machine-learning,https://stackoverflow.com/questions/42562883/b...
37588,C machine learning framework,I cant seem to find a C based ML/AI framework ...,"c++,frameworks,artificial-intelligence,machine...",https://stackoverflow.com/questions/8682766/c-...
35034,machine learning in c,I am working on vision project using c and op...,"c++,opencv,machine-learning",https://stackoverflow.com/questions/20874480/m...
38068,Basic machine learning,I am developing a tool where I need to predict...,"java,machine-learning",https://stackoverflow.com/questions/4816546/ba...


# 2nd Level filtering based on question and summary

In [310]:
# Encoding the summary 
embeddingsummary = model.encode(level2Summarynew.Summary.values,tokenize=True)

Sentences:[list(['<p>', 'I', 'am', 'a', 'in', 'Machine', 'learning', '.', 'I', 'have', 'seen', 'which', 'machine', 'learning', '.', 'But', 'my', 'is', 'can', 'we', 'model', 'our', 'we', 'get', 'data', '.', 'can', 'I', 'that', '...', '</p>'])
 list(['<p>', 'be', 'a', 'question', ',', 'but', 'I', 'am', 'to', 'ML', 'and', 'cant', 'to', 'a', 'have', 'a', 'ML', 'on', 'a', 'now', 'I', 'am', 'the', 'data', 'that', 'the', '...', '</p>'])
 list(['<p>', 'I', 'am', 'on', 'a', 'machine', 'learning', 'and', 'have', 'some', 'in', 'my', 'data', 'and', 'to', '.', 'I', 'read', 'about', 'using', 'to', '.', 'I', 'have', '...', '</p>'])
 list(['<p>', 'I', 'have', 'a', 'question', 'on', 'machine', 'learning', 'training', 'there', 'a', 'way', 'to', 'the', 'data', 'that', 'the', 'to', 'data', 'points', '.', ',', 'if', 'I', '...', '</p>'])
 list(['<p>', 'I', 'am', 'on', 'a', 'machine', 'learning', 'where', 'I', 'am', 'my', 'model', 'on', 'have', 'the', 'and', 'model', 'is', 'with', ',', 'my', 'dataset', 'is',

In [311]:
p=cosine_similarity(embeddingsummary)
input_1 = list(np.where(level2Summarynew["Questions"] == "machine learning model")[0]) # Displaying the index of question asked
input_1

[0]

# Top 5 similar questions after filtering based on question and summary

In [312]:
L=np.argsort(p[input_1],axis=1)
for i in L:
  print(i)
t=level2Summarynew.iloc[i]
t.head(5)


[ 7 21  8  5 13 12  9  6 18 24 20  4  1 22 10 16 19 15 25 14 11 17 23  2
  3  0]


Unnamed: 0,Questions,Summary,Tag,Link
37588,C machine learning framework,I cant seem to find a C based ML/AI framework ...,"c++,frameworks,artificial-intelligence,machine...",https://stackoverflow.com/questions/8682766/c-...
17770,Preprocessing machine learning data,"This may be a stupid question, but I am new to...","python,python-3.x,algorithm,machine-learning",https://stackoverflow.com/questions/49195008/p...
35034,machine learning in c,I am working on vision project using c and op...,"c++,opencv,machine-learning",https://stackoverflow.com/questions/20874480/m...
5237,inavalid element type in machine learning model,I am using a simple model of tensorflow/tfjswh...,"javascript,react-native,machine-learning,tenso...",https://stackoverflow.com/questions/58724591/i...
15303,Machine learning data modeling,I am a beginner in Machine learning. I have se...,"machine-learning,data-mining",https://stackoverflow.com/questions/51075544/m...
