<a href="https://www.kaggle.com/code/shamimahossain/get-similar-stack-overflow-questions-from-query?scriptVersionId=111098046" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [14]:
import numpy as np 
import pandas as pd 
import os
from sklearn.model_selection import train_test_split
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings
from tqdm import tqdm
import matplotlib.pyplot as plt
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
DATA_DIR = '../input/60k-stack-overflow-questions-with-quality-rate/'

In [10]:
data_df = pd.read_csv(DATA_DIR+'train.csv')
data_df.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [11]:
data_df_text = data_df[['Id','Title']]
data_df_text.head()

Unnamed: 0,Id,Title
0,34552656,Java: Repeat Task Every Random Seconds
1,34553034,Why are Java Optionals immutable?
2,34553174,Text Overlay Image with Darkened Opacity React...
3,34553318,Why ternary operator in swift is so picky?
4,34553755,hide/show fab with scale animation


# Preprocessing Helper Functions

In [12]:
def remove_punctuation(text):
    return "".join(["" if ch in string.punctuation else ch.lower() for ch in text])
stopwords_english = set(stopwords.words('english'))
def clean_words(headline):
    return [
    word for word in headline
    if word not in stopwords_english
]  
stemmer = PorterStemmer()
def words_stems(headline):
    return [
    stemmer.stem(word) for word in headline
]
def tokenize_text(text):
    return word_tokenize(text)
def remove_numbers(text):
    return re.sub("[^a-zA-Z]", " ", text)

In [13]:
data_df_text['Title'] = data_df_text['Title'].apply(remove_punctuation).apply(remove_numbers).apply(tokenize_text).apply(clean_words)
data_df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Id,Title
0,34552656,"[java, repeat, task, every, random, seconds]"
1,34553034,"[java, optionals, immutable]"
2,34553174,"[text, overlay, image, darkened, opacity, reac..."
3,34553318,"[ternary, operator, swift, picky]"
4,34553755,"[hideshow, fab, scale, animation]"


# Building corpus for Training

In [15]:
tagged_data = [TaggedDocument(row['Title'], [i]) for i, row in data_df_text.iterrows()]

# Define Model

In [16]:
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)

In [17]:
model.save("st_doc2vec.model")


In [18]:
## Load saved doc2vec model
model= Doc2Vec.load("st_doc2vec.model")

# Generate embeddings from trained model

In [25]:
def get_embedding(sentence):
    func_embeddings, func_item_name = [], []
    for word in sentence:

        try:
            vec = model.wv[word]
            func_embeddings.append(vec)
            func_item_name.append(sentence)
        except:
            pass
    return func_embeddings

In [26]:
texts = data_df_text["Title"]
embed_list = []
for text in texts:
    embed_list.append(get_embedding(text))

data_df_text["embeddings"] = embed_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
data_df_text.head()

Unnamed: 0,Id,Title,embeddings
0,34552656,"[java, repeat, task, every, random, seconds]","[[0.5479711, -1.0318434, -0.3358808, 1.8914942..."
1,34553034,"[java, optionals, immutable]","[[0.5479711, -1.0318434, -0.3358808, 1.8914942..."
2,34553174,"[text, overlay, image, darkened, opacity, reac...","[[2.4261212, -2.4814582, -0.13593566, 3.406741..."
3,34553318,"[ternary, operator, swift, picky]","[[-1.2538328, 2.429045, 0.7572507, 1.8956759, ..."
4,34553755,"[hideshow, fab, scale, animation]","[[-0.1718769, -0.72302026, 0.82044405, -0.1304..."


# Evaluate model on user generated text

In [36]:
def find_similar_questions(ques):
    score_list = []
    for i in range(len(data_df_text)):
        func_embeddings = data_df_text.iloc[i, 2]
        func_embeddings2, func_item_name2 = [], []
        for word in ques.split():

            try:
                vec = model.wv[word]
                func_embeddings2.append(vec)
  
            except:
                pass
        final_vec2 = [0]*model.wv.vector_size
        for v in func_embeddings2:
            final_vec2 += v 

        try:
            score = cosine_similarity(func_embeddings, func_embeddings2)
            score = np.mean(score)
            score_list.append([score, data_df.iloc[i, 1]])
        except:
            pass
    #print(score_list)
    score_list.sort(reverse = True)
    res = []
    try:
        res = score_list[:5]
        return res
    except:
        return "nan"

In [37]:
find_similar_questions("overlay an image in CSS")

[[0.782632, 'Image in <title>'],
 [0.7653239, 'CSS Color overlay with background image'],
 [0.7558745, 'CSS Background Image overlay'],
 [0.7550109, 'Image overlay you can click through'],
 [0.72615725, 'How to center the image in bootstrap 4?']]

* References for doc2vec function usage: https://www.kaggle.com/code/yashtiwari1906/doc2vec-for-search