# In this notebook, I build a search engine over a collection of Stack Overflow Python questions (collected between 2008 and 2016) that retrieves the questions most similar to an input Python-related question.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%cd /kaggle/working

In [None]:
!pip install transformers

In [None]:
!pip install simpletransformers==0.32.3

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import (GPT2Config,GPT2LMHeadModel,GPT2Tokenizer)
import torch
from string import punctuation as pnc
from collections import Counter
from scipy import spatial
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import torch
import pylab as pl
# Show full cell contents when displaying DataFrames. None means "no limit";
# the older -1 sentinel was deprecated in pandas 1.0 and is rejected by
# current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
questions = pd.read_csv("/kaggle/input/pythonquestions/Questions.csv", encoding = "ISO-8859-1")
print(len(questions))
display(questions.head(5))

In [None]:
print("Number of unique Questions : ", questions['Id'].nunique())

In [None]:
# Space-delimited word counts for each question's title and body.
for source_col, count_col in (('Title', 'qLen'), ('Body', 'qBodyLen')):
  questions[count_col] = questions[source_col].apply(lambda text: len(text.split(" ")))

## Number of words in Title and Body

In [None]:
questions['qLen'].hist(bins=35)
plt.title("No. of words in Title")

In [None]:
questions[questions['qBodyLen']<500]['qBodyLen'].hist(bins=100)
plt.title("No. of words in Body")

## Most titles have around 7 words, while bodies have around 50. So, to find the most similar questions, I will use the Title instead of the Body.

## Word Cloud

In [None]:
def getWordCloud(df,col):
  """Draw a word cloud of the text stored in column ``col`` of ``df``.

  Every value is converted to ``str``, split on whitespace and lower-cased.
  The combined text is handed to ``WordCloud.generate`` with the wordcloud
  package's ``STOPWORDS`` excluded, and the image is shown with matplotlib.

  Args:
    df: DataFrame that holds the text column.
    col: Name of the column to visualise.
  """
  # Deliberately not called `stopwords`, which would shadow the nltk
  # `stopwords` import used elsewhere in the notebook.
  cloud_stopwords = set(STOPWORDS)

  # Build each row's contribution and join once, rather than growing one
  # large string with += on every iteration.
  comment_words = "".join(
      " ".join(token.lower() for token in str(val).split()) + " "
      for val in tqdm(df[col])
  )

  wordcloud = WordCloud(width = 800, height = 800,
                  background_color ='white',
                  stopwords = cloud_stopwords,
                  min_font_size = 10).generate(comment_words)

  plt.figure(figsize = (5, 5), facecolor = None)
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.tight_layout(pad = 0)

  plt.show()

In [None]:
getWordCloud(questions,'Title')

## Preprocessing the title -- removing stop words and non-alphanumeric characters, and lower-casing all characters.

In [None]:
stop = stopwords.words('english')
def preprocess(df, col, stop_words=None):
  """Add a cleaned copy of ``df[col]`` as the column ``'preprocessed' + col``.

  Cleaning steps, in order:
    1. split on single spaces and drop stop words (the comparison is
       case-sensitive and happens before lower-casing),
    2. delete every character that is not an ASCII letter, digit or space,
    3. lower-case the result.

  Args:
    df: DataFrame to extend; it is modified in place.
    col: Name of the text column to clean.
    stop_words: Optional iterable of words to drop. When omitted, the
      module-level ``stop`` list is used.

  Returns:
    The same DataFrame, now with the extra ``'preprocessed' + col`` column.
  """
  # A set gives constant-time membership tests across the whole column.
  words_to_drop = set(stop if stop_words is None else stop_words)
  target = 'preprocessed' + col
  without_stop_words = df[col].apply(
      lambda x: " ".join(word for word in x.split(" ") if word not in words_to_drop)
  )
  # regex=True is required: recent pandas versions treat the pattern as a
  # literal string by default, which would leave all punctuation in place.
  df[target] = (
      without_stop_words
      .str.replace('[^a-zA-Z0-9 ]', '', regex=True)
      .str.lower()
  )
  return df

In [None]:
questions = preprocess(questions, 'Title')

## Tags

In [None]:
tags = pd.read_csv("/kaggle/input/pythonquestions/Tags.csv", encoding = "ISO-8859-1")
print(len(tags))
display(tags.head(5))

In [None]:
print("Number of unique Tags : ", tags['Tag'].nunique())

## The 20 most frequent tags, excluding `python`, which is obviously the most frequent.

In [None]:
fig, ax = plt.subplots()
tags[tags['Tag']!='python']['Tag'].value_counts().sort_values(ascending = False)[:20].plot(ax=ax, kind='bar')

## Encoding the processed question titles with the GPT-2 tokenizer and embedding them with the GPT-2 token-embedding matrix.

In [None]:
config_class, model_class, tokenizer_class = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
model = model_class.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
preprocessedTitle = questions['preprocessedTitle'].values
QID = questions['Id'].values
print(len(preprocessedTitle), len(QID))

## Using tokenizer.batch_encode_plus to encode all the titles in one go. Alternatively, tokenizer.encode can be used to encode one title at a time.

In [None]:
encodedpreprocessedTitle = tokenizer.batch_encode_plus(preprocessedTitle)['input_ids']
print(len(encodedpreprocessedTitle))

## Loading the token-embedding matrix from the GPT-2 model. Each token is represented by a vector of length 768, and the matrix covers 50257 unique tokens.

In [None]:
embeddigs = model.transformer.wte
print("Shape of embedding matrix : ",embeddigs.weight.shape)
print("Type of embedding matrix : ", type(embeddigs))

## In the code below, I take the mean of the embeddings of all the tokens in a title, so that every title is represented by a single vector of length 768. The for loop does this for all 607282 titles.
## Note -- A processed title can end up with 0 tokens, hence the condition len(encodedTitle) > 0.

In [None]:
TitleEmbeddingList = []
QIDList = []
# The embedding lookups are inference only. Running them under no_grad keeps
# each stored title vector from retaining autograd bookkeeping.
with torch.no_grad():
  for qid, encodedTitle in tqdm(zip(QID, encodedpreprocessedTitle), total=len(QID)):
    # Titles with no tokens are skipped, so QIDList maps row positions in
    # TitleEmbeddingList back to question ids.
    if len(encodedTitle) > 0 :
      # Mean over the token axis: one vector per title.
      embeddedTitle = embeddigs(torch.tensor(encodedTitle).to(torch.int64)).mean(dim=0)
      TitleEmbeddingList.append(embeddedTitle)
      QIDList.append(qid)

In [None]:
numQ = len(TitleEmbeddingList)
embedDim = len(TitleEmbeddingList[0])
print("Number of Titles : ",numQ," and Length of vector of each Title : ",embedDim)

In [None]:
print("Type of TitleEmbeddingList : ",type(TitleEmbeddingList))

## Converting  TitleEmbeddingList from List of tensors to tensor.

In [None]:
# Stack the per-title vectors row-wise into one 2-D tensor, one row per title.
TitleEmbeddingTensor = torch.stack(TitleEmbeddingList, dim=0)
print("Shape of TitleEmbeddingTensor : ",TitleEmbeddingTensor.shape)
print("Type of TitleEmbeddingTensor : ", type(TitleEmbeddingTensor))

# Now that we have an embedding for each title, we feed in an input question and search the Stack Overflow question titles for the ones most similar to it, using the cosine similarity between the embedding of the input question and the embeddings of the titles.

## PreProcess the Input text

In [None]:
def preprocesstext(text):
  """Clean a single query string the same way the titles are cleaned.

  Words found in the module-level ``stop`` list are dropped (case-sensitive,
  splitting on single spaces), every character other than ASCII letters,
  digits and spaces is removed, and the result is lower-cased.
  """
  kept_words = [token for token in text.split(" ") if token not in stop]
  cleaned = re.sub(r'[^a-zA-Z0-9 ]', '', " ".join(kept_words))
  return cleaned.lower()

## Compute the cosine similarity between the input question and all the Stack Overflow titles, and get the indices of the K most similar titles

In [None]:
def getMostSimilarQuestionsIdx(K, a, b):
  """Return the row indices of ``b`` that are most cosine-similar to ``a``.

  Args:
    K: Number of indices wanted. Values larger than the number of rows in
      ``b`` are clamped to that number; ``K <= 0`` gives an empty list.
    a: Query tensor, expected to be a single row of shape (1, D).
    b: Candidate tensor of shape (N, D).

  Returns:
    A list of at most ``K`` ints ordered from least to most similar, so the
    best match is the last element.
  """
  if K <= 0:
    # Slicing a sorted list with [-0:] would return every index, not none.
    return []
  with torch.no_grad():
    a_norm = a / a.norm(dim=1)[:, None]
    b_norm = b / b.norm(dim=1)[:, None]
    res = torch.mm(a_norm, b_norm.transpose(0,1)).squeeze(0)
    # topk selects the K best scores without sorting the whole vector in Python.
    best_first = torch.topk(res, k=min(K, res.shape[0])).indices
  # topk returns the best match first; callers expect it last.
  return best_first.flip(0).tolist()

## Print out the Most Similar Question Titles With the Question ID

In [None]:
def getMostSimilarQuestions(K, input, QuestionDF, QIDList):
  """Print the ``K`` stored question titles most similar to ``input``.

  The query is cleaned with ``preprocesstext``, tokenised with the
  module-level ``tokenizer``, and embedded as the mean of its token vectors
  from ``embeddigs``. It is then ranked against the module-level
  ``TitleEmbeddingTensor`` with ``getMostSimilarQuestionsIdx``.

  Args:
    K: Number of questions to print.
    input: Raw query text. The name shadows the builtin but is kept so
      existing keyword callers keep working.
    QuestionDF: DataFrame with at least the columns ``Id`` and ``Title``.
    QIDList: Question ids, indexed by row position in ``TitleEmbeddingTensor``.

  Returns:
    None. Results are written to stdout.
  """
  preprocessedinput = preprocesstext(input)
  inputEncoded = tokenizer.batch_encode_plus([preprocessedinput])['input_ids'][0]
  if len(inputEncoded) == 0:
    # Averaging zero token vectors gives NaNs and hence a meaningless ranking.
    print("The query has no searchable terms left after preprocessing.")
    return
  with torch.no_grad():
    # Mean over the token axis, kept 2-D (one row) for the similarity helper.
    inputEmbedded = embeddigs(torch.tensor(inputEncoded).to(torch.int64)).mean(dim=0, keepdim=True)
  mostSimilarIdx = getMostSimilarQuestionsIdx(K, inputEmbedded, TitleEmbeddingTensor)
  # The helper returns the best match last; print it first.
  mostSimilarIdx.reverse()
  print("Most similar ",K, " questions : ")
  for rank, simidx in enumerate(mostSimilarIdx, start=1):
    IDQ = QuestionDF[QuestionDF['Id']==QIDList[simidx]][['Id','Title']].values
    parentId, simQuestion = IDQ[0][0], IDQ[0][1]
    print(rank, "Question Id : ", parentId, "Question : ",simQuestion)

## Let's test!

In [None]:
getMostSimilarQuestions(5, "How to MUltiply 2 columns pandas ?", questions ,QIDList)

In [None]:
getMostSimilarQuestions(5, "regex pandas", questions ,QIDList)

In [None]:
getMostSimilarQuestions(5, "logistic regression sklearn", questions ,QIDList)

In [None]:
getMostSimilarQuestions(5, "covert csv to json file pandas", questions ,QIDList)

In [None]:
getMostSimilarQuestions(5, "Build website using python", questions ,QIDList)

In [None]:
getMostSimilarQuestions(5, "How to install Pandas", questions ,QIDList)