In [1]:
# set working directory
import os
#os.chdir(path = {your path})

In [2]:
# import basics
import pandas as pd
import json
import numpy as np
import pickle
import regex as re

In [3]:
# import nlp relevants
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# for bag-of-words (bow)
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Loading the dataset

data = pd.read_json('posts.json')
data.head()

Unnamed: 0,title,author,htmlBody
0,The Need For Work On Technical AI Alignment (I...,,
1,"Why I’m not working on {debate, RRM, ELK, natu...",,<p>[For background &amp; spelling out the acro...
2,EIS II: What is “Interpretability”?,,"<p>Part 2 of 12 in the&nbsp;<a href=""https://w..."
3,The Engineer’s Interpretability Sequence (EIS)...,,"<p><br>Part 1 of 12 in the <a href=""https://ww..."
4,Notes on the Mathematics of LLM Architectures,,<blockquote><p><i>From a mathematical point of...


In [5]:
class ETL:

    # text normalization - stemming, lemmatization, stopwords
    ps = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer() 
    s_words = stopwords.words()
    
    
    # normalization of question sentences
    def _norm_sent(self, sent, rm_stopwords = False, stemming = True, lemmatization = False):
        
        # tokenize - sentence to word
        words = word_tokenize(sent)
        
        # take if all characters in the string are alphabets and then decapitalize
        sent = [w.lower() for w in words if w.isalpha()] 

        # remove stopwords
        if rm_stopwords:
          sent = [w for w in sent if w not in self.s_words]    

        # apply lemmatization 
        if lemmatization:
          sent = [self.wordnet_lemmatizer.lemmatize(w, pos = "n") for w in sent]
          sent = [self.wordnet_lemmatizer.lemmatize(w, pos = "v") for w in sent]
          sent = [self.wordnet_lemmatizer.lemmatize(w, pos = ("a")) for w in sent]

        # apply stemming 
        if stemming:
          sent = [self.ps.stem(w) for w in sent]

        sent = " ".join(sent)
        return sent  
    
    
    def norm_data(self, data):   
        data.loc[:, "title_processed"] = data["title"].apply(lambda x: self._norm_sent(x, rm_stopwords = True, lemmatization = True, stemming = True))
        return data   
    
    
    def bow_fit(self, corpus, type = "tfidf", max_features = 10000, ngram_range = (1,2)):
        
        if type == "tfidf": 
            self.tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features = max_features, ngram_range = ngram_range)
            self.tfidf_vectorizer.fit(corpus["title"])

            # create a reverse mapping for the vocab
            self.inv_tfidf_vectorizer_vocab = {}
            
            for label, ind in self.tfidf_vectorizer.vocabulary_.items():
                self.inv_tfidf_vectorizer_vocab[ind] = label

        else:
            return NotImplementedError
        
        
    def bow_transform(self, data, type = "tfidf"):
        
        if type == "tfidf":
            return self.tfidf_vectorizer.transform(data["title"])
        
        else:
            return NotImplementedError

    # save output
    def save_vectorizers(self, path):

        # make sure directory exists
        os.makedirs(exist_ok= True, name=path)

        if self.tfidf_vectorizer != None:
            with open(os.path.join(path, "tfidf_vectorizer.pkl"), "wb") as tfidf_file:
                pickle.dump(self.tfidf_vectorizer, tfidf_file) 


In [6]:
etl = ETL()

In [7]:
df = etl.norm_data(data)

In [8]:
df.head()

Unnamed: 0,title,author,htmlBody,title_processed
0,The Need For Work On Technical AI Alignment (I...,,,work technic align intro explain
1,"Why I’m not working on {debate, RRM, ELK, natu...",,<p>[For background &amp; spelling out the acro...,work debat rrm elk natur abstract
2,EIS II: What is “Interpretability”?,,"<p>Part 2 of 12 in the&nbsp;<a href=""https://w...",ei interpret
3,The Engineer’s Interpretability Sequence (EIS)...,,"<p><br>Part 1 of 12 in the <a href=""https://ww...",engin interpret sequenc ei intro
4,Notes on the Mathematics of LLM Architectures,,<blockquote><p><i>From a mathematical point of...,note mathemat llm architectur


In [9]:
# vectorization - bag of words model
etl.bow_fit(corpus = df, type = "tfidf")

In [10]:
etl.save_vectorizers(path="sklearn_objects")