In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

import nltk
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import dendrogram

import warnings
warnings.filterwarnings('ignore')

In [2]:
#nltk.download('stopwords')

In [2]:
filename = 'data/nlp_nltk_stemmed_preproc.csv'

In [3]:
df = pd.read_csv(filename).drop(columns='Unnamed: 0')

In [4]:
df.head()

Unnamed: 0,claps,text
0,489,ultim guid ace code interview data scientist d...
1,139,shakespear versu eminem— who’ better lyricist ...
2,133,implement visualttransform pytorch hi guy happ...
3,92,stock price analysi panda altair practic guid ...
4,58,optim threshold imbalanc classif handson tutor...


In [5]:
X = df['text']
y = df['claps']

### Latent Dirichlet Allocation

In [13]:
#import
from sklearn.decomposition import LatentDirichletAllocation

In [43]:
stop_words = stopwords.words('english')
stop_words.extend(['use', 'used', 'using', 'one', 'like', 'user', 'make', 'also', 'get', 'point', 'let', 'go'])

In [31]:
#custom class
class SKTopics(BaseEstimator, TransformerMixin):
    
    def __init__(self, n_topics = 20):
        self.n_topics = n_topics
        self.model = make_pipeline(CountVectorizer(ngram_range=(1,3), stop_words= stop_words, max_features=100), 
                                   LatentDirichletAllocation(n_components = self.n_topics))
    #fit_transform method
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    
    #get_topics method
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['countvectorizer']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n-1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

  and should_run_async(code)


In [44]:
#instantiate
topics = SKTopics(n_topics=10)

In [45]:
topics.fit_transform(X)


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=100, ngram_range=(1, 3),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('latentdirichletallocation', LatentDirichletAllocation())])

In [46]:
#get topics
topics.get_topics(n = 10)

{0: ['data',
  'scienc',
  'data scienc',
  'learn',
  'work',
  'project',
  'need',
  'time'],
 1: ['need', 'time', 'run', 'work', 'would', 'test', 'file', 'want'],
 2: ['distribut',
  'probabl',
  'mean',
  'learn',
  'machin',
  'valu',
  'number',
  'machin learn'],
 3: ['model',
  'train',
  'learn',
  'predict',
  'data',
  'machin',
  'machin learn',
  'dataset'],
 4: ['data', 'plot', 'analysi', 'variabl', 'valu', 'visual', 'time', 'differ'],
 5: ['ai',
  'system',
  'dataset',
  'process',
  'learn',
  'develop',
  'articl',
  'gener'],
 6: ['column', 'word', 'creat', 'function', 'data', 'valu', 'name', 'file'],
 7: ['imag', 'code', 'python', 'object', 'creat', 'list', 'articl', 'gener'],
 8: ['function',
  'network',
  'class',
  'input',
  'output',
  'method',
  'valu',
  'number'],
 9: ['featur',
  'algorithm',
  'valu',
  'predict',
  'dataset',
  'model',
  'import',
  'perform']}

In [47]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
warnings.filterwarnings('ignore')

In [48]:
tf_vect = TfidfVectorizer(stop_words = stop_words, max_features = 100, ngram_range = (1,3))
dtm_tf = tf_vect.fit_transform(X)

In [49]:
lda = LatentDirichletAllocation(n_components = 5, random_state = 42)

In [50]:
lda.fit(dtm_tf)

LatentDirichletAllocation(n_components=5, random_state=42)

In [51]:
pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vect)

In [53]:
pyLDAvis.save_html(pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vect), 'sklearn_LDAvis_5.html')


### Latent Semantic Analysis

In [54]:
from sklearn.decomposition import TruncatedSVD, NMF

In [58]:
class SKTopics(BaseEstimator, TransformerMixin):
    def __init__(self, estimator = 'LDA', n_topics = 20):
        self.n_topics = n_topics
        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components = self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_components = self.n_topics)
        self.model = make_pipeline(CountVectorizer(stop_words = 'english', max_features = 100, ngram_range = (1,2)), self.estimator)
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['countvectorizer']
        model = self.model.steps[-1][1]
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n-1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

In [59]:
topics = SKTopics(estimator = 'LSA', n_topics=10)
topics.fit_transform(X)
topics.get_topics(n = 10)

{0: ['data', 'use', 'model', 'learn', 'need', 'valu', 'train', 'function'],
 1: ['data',
  'scienc',
  'data scienc',
  'scientist',
  'tool',
  'column',
  'project',
  'work'],
 2: ['model',
  'data',
  'train',
  'predict',
  'learn',
  'machin learn',
  'machin',
  'featur'],
 3: ['learn',
  'machin',
  'machin learn',
  'scienc',
  'data scienc',
  'work',
  'ai',
  'project'],
 4: ['imag',
  'train',
  'network',
  'gener',
  'data',
  'dataset',
  'object',
  'input'],
 5: ['valu',
  'learn',
  'function',
  'featur',
  'distribut',
  'algorithm',
  'sampl',
  'number'],
 6: ['function', 'model', 'code', 'python', 'train', 'learn', 'data', 'list'],
 7: ['featur',
  'use',
  'learn',
  'machin',
  'machin learn',
  'dataset',
  'column',
  'creat'],
 8: ['featur',
  'scienc',
  'imag',
  'data scienc',
  'code',
  'valu',
  'project',
  'work'],
 9: ['word',
  'scienc',
  'featur',
  'function',
  'data scienc',
  'languag',
  'use',
  'network']}