In [None]:
# set working directory
import os
#os.chdir(path = {your path})

In [None]:
# import basics
import pandas as pd
import json
import numpy as np
import pickle
import regex as re

In [None]:
# import nlp relevants
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on:
#   stopwords - stopword lists used during title normalisation
#   punkt     - tokenizer models read by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables read by word_tokenize on NLTK >= 3.8.2
#   wordnet   - dictionary behind WordNetLemmatizer
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

In [None]:
# Read the posts dump into a DataFrame and preview its first rows

data = pd.read_json(path_or_buf='posts.json')
data.head(n=5)

In [None]:
class ETL:
    """Normalise post titles and build a TF-IDF bag-of-words model over them.

    Typical flow: ``norm_data`` -> ``bow_fit`` -> ``bow_transform`` /
    ``save_vectorizers``.

    Attributes created by ``bow_fit``:
        tfidf_vectorizer: the fitted ``TfidfVectorizer``.
        inv_tfidf_vectorizer_vocab: dict mapping feature index -> term.
    """

    # text normalization - stemming, lemmatization, stopwords
    ps = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    # Stopwords of every language in the NLTK corpus (no language filter is
    # passed). Stored as a frozenset so the per-token membership test is O(1)
    # instead of a linear scan over the whole list.
    s_words = frozenset(stopwords.words())

    # normalization of question sentences
    def _norm_sent(self, sent, rm_stopwords = False, stemming = True, lemmatization = False):
        """Normalise one sentence and return it as a space-joined string.

        Steps, in order: tokenize, keep purely alphabetic tokens and
        lower-case them, optionally drop stopwords, optionally lemmatize
        (noun, then verb, then adjective), optionally Porter-stem.

        Args:
            sent: the raw sentence. Anything that is not a ``str`` (e.g. a
                missing value such as None/NaN) is treated as empty.
            rm_stopwords: drop tokens found in ``s_words``.
            stemming: apply the Porter stemmer.
            lemmatization: apply the WordNet lemmatizer.

        Returns:
            The normalised tokens joined by single spaces ("" when nothing
            survives or the input is not a string).
        """
        # missing titles would otherwise crash the tokenizer
        if not isinstance(sent, str):
            return ""

        # tokenize - sentence to word; keep alphabetic tokens, lower-cased
        tokens = [w.lower() for w in word_tokenize(sent) if w.isalpha()]

        # remove stopwords
        if rm_stopwords:
            tokens = [w for w in tokens if w not in self.s_words]

        # apply lemmatization: noun -> verb -> adjective, chained per token
        if lemmatization:
            lemmatize = self.wordnet_lemmatizer.lemmatize
            tokens = [
                lemmatize(lemmatize(lemmatize(w, pos = "n"), pos = "v"), pos = "a")
                for w in tokens
            ]

        # apply stemming
        if stemming:
            tokens = [self.ps.stem(w) for w in tokens]

        return " ".join(tokens)

    @staticmethod
    def _resolve_text_column(frame, column = None):
        """Pick the text column to vectorise: explicit choice, else normalised titles, else raw titles."""
        if column is not None:
            return column
        return "title_processed" if "title_processed" in frame else "title"

    def norm_data(self, data):
        """Add a ``title_processed`` column holding the normalised ``title``.

        Normalisation uses stopword removal, lemmatization and stemming.
        The frame is modified in place and the same object is returned.
        """
        data.loc[:, "title_processed"] = data["title"].apply(
            lambda x: self._norm_sent(x, rm_stopwords = True, lemmatization = True, stemming = True)
        )
        return data

    def bow_fit(self, corpus, type = "tfidf", max_features = 10000, ngram_range = (1,2), column = None):
        """Fit the bag-of-words vectorizer on a text column of ``corpus``.

        Args:
            corpus: frame holding the text to learn the vocabulary from.
            type: vectorizer kind; only "tfidf" is implemented.
            max_features: vocabulary size cap passed to ``TfidfVectorizer``.
            ngram_range: n-gram range passed to ``TfidfVectorizer``.
            column: column to fit on. Defaults to "title_processed" when the
                frame has it (so the vocabulary matches text normalised by
                ``_norm_sent``), otherwise "title".

        Raises:
            NotImplementedError: for any ``type`` other than "tfidf".
        """
        if type != "tfidf":
            raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")

        text_column = self._resolve_text_column(corpus, column)
        self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(
            max_features = max_features, ngram_range = ngram_range
        )
        self.tfidf_vectorizer.fit(corpus[text_column])

        # create a reverse mapping for the vocab (feature index -> term)
        self.inv_tfidf_vectorizer_vocab = {
            ind: label for label, ind in self.tfidf_vectorizer.vocabulary_.items()
        }

    def bow_transform(self, data, type = "tfidf", column = None):
        """Vectorise a text column of ``data`` with the fitted vectorizer.

        Args:
            data: frame holding the text to transform.
            type: vectorizer kind; only "tfidf" is implemented.
            column: column to transform. Defaults to "title_processed" when
                the frame has it, otherwise "title". It should hold text
                prepared the same way as the column used in ``bow_fit``.

        Returns:
            Whatever ``TfidfVectorizer.transform`` returns for that column.

        Raises:
            NotImplementedError: for any ``type`` other than "tfidf".
        """
        if type != "tfidf":
            raise NotImplementedError(f"unsupported bag-of-words type: {type!r}")

        text_column = self._resolve_text_column(data, column)
        return self.tfidf_vectorizer.transform(data[text_column])

    # save output
    def save_vectorizers(self, path):
        """Pickle the fitted vectorizer to ``<path>/tfidf_vectorizer.pkl``.

        The directory is created if needed. Nothing is written when no
        vectorizer has been fitted yet.
        """
        # make sure directory exists
        os.makedirs(path, exist_ok = True)

        # getattr: the attribute only exists once bow_fit has run
        vectorizer = getattr(self, "tfidf_vectorizer", None)
        if vectorizer is not None:
            with open(os.path.join(path, "tfidf_vectorizer.pkl"), "wb") as tfidf_file:
                pickle.dump(vectorizer, tfidf_file)


In [None]:
etl = ETL()

In [None]:
df = etl.norm_data(data)

In [None]:
df.head()

In [None]:
# vectorization - bag of words model
etl.bow_fit(corpus = df, type = "tfidf")

In [None]:
etl.save_vectorizers(path="sklearn_objects")

def retrieve(query: str, top_k: int = 10):
    """Return the rows of ``df`` most similar to ``query`` under TF-IDF cosine similarity.

    The query is normalised with the same settings ``ETL.norm_data`` applies
    to titles (stopword removal, lemmatization, stemming), vectorised with
    the fitted ``etl.tfidf_vectorizer`` and compared against every row of
    ``df``.

    Args:
        query: free-text search string.
        top_k: maximum number of rows to return (default 10).

    Returns:
        A slice of ``df`` with up to ``top_k`` rows, best match first.
    """
    # imported here because the notebook's import cells never bring it in
    from sklearn.metrics.pairwise import cosine_similarity

    query_norm = etl._norm_sent(query, rm_stopwords = True, lemmatization = True, stemming = True)
    query_vec = etl.tfidf_vectorizer.transform([query_norm])
    corpus_matrix = etl.bow_transform(df, type = "tfidf")

    scores = cosine_similarity(query_vec, corpus_matrix).flatten()

    # descending by score; stable sort keeps original row order among ties
    ranking = np.argsort(-scores, kind = "stable")[:top_k]
    return df.iloc[ranking]