In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Run in terminal or command prompt
# python3 -m spacy download en
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from unidecode import unidecode
!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Run in terminal or command prompt
# python3 -m spacy download en
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from unidecode import unidecode
!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS

# Finding the best tags for news with LDA

The idea here is to find the best tags for this news database by applying an LDA model to the description column. LDA (Latent Dirichlet Allocation) is a generative statistical model that reduces the dimensionality of text data by grouping documents into topics.

First, I'll fit the LDA model and get the topic probabilities for each document; then I'll calculate the importance of each word in the document and check whether it's relevant enough to be a tag.

I chose the description column instead of content for two reasons: first, it's faster to train on; second, the content column seems to be the same as the description, but with the newspaper name, location, and other things that are useless for this model.

# Data Cleaning

In [None]:
# Load the articles dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/internet-articles-data-with-users-engagement/articles_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Drop the leftover 'Unnamed: 0' column. Reassigning with errors='ignore'
# (instead of inplace=True) keeps this cell safe to re-run: a second run no
# longer raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Column dtypes of the frame.
df.dtypes

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Keep only title/description, and only the rows that actually have a description.
has_description = df['description'].notna()
df_model = df.loc[has_description, ['title', 'description']]

In [None]:
# Replace the gappy index left by the row filter with a fresh 0..n-1 index.
df_model = df_model.reset_index(drop=True)

In [None]:
# Preview the modelling frame (title and description only).
df_model.head()

# Data Preprocessing

First, I'll follow these steps to clean the data and make it ready to apply the model:

- Split dataset
- Remove special characters
- Tokenize data
- Lemmatize data

I'll split the data just to have some news to test on, so the test set is only about 10% of the rows.

In [None]:
# Hold out roughly 10% of the rows for testing. A fixed seed makes the split,
# and everything computed from it, reproducible across kernel restarts.
rng = np.random.default_rng(42)
msk = rng.random(len(df_model)) < 0.9

# Select by boolean mask and give each part a fresh 0..n-1 index. Reassigning
# the result avoids calling an in-place method on a slice of df_model.
df_train = df_model[msk].reset_index(drop=True)
df_test = df_model[~msk].reset_index(drop=True)

In [None]:
# Clean every training description in a single pass, applying the steps in
# this order: collapse whitespace, drop single quotes, replace runs of
# non-alphanumeric characters with a space, then transliterate with unidecode.
# NOTE(review): by the time unidecode runs, the previous substitution has
# already replaced every non-ASCII character with a space, so it presumably has
# nothing left to transliterate. Consider running it first (and mirroring the
# change wherever this cleaning is repeated).
cleaned = []
for text in df_train['description'].values.tolist():
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\'", "", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    cleaned.append(unidecode(text))
data = cleaned
pprint(data[:1])

Tokenizing the data:

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Yields one list of tokens per input sentence, in input order.
    """
    for text in sentences:
        # deacc=True strips accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

# Materialize the token lists for the cleaned training documents.
data_words = list(sent_to_words(data))
print(data_words[:1])

Lemmatizing the data:

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only the selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        One token list per document.
    allowed_postags : collection of str
        Coarse POS tags (compared against ``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per document, in input order.

    Relies on the module-level spaCy pipeline ``nlp`` being loaded before the
    first call.
    """
    # nlp.pipe processes documents in batches, which is typically faster than
    # one nlp() call per document, and yields one Doc per input in order.
    docs = nlp.pipe(" ".join(sent) for sent in texts)
    texts_out = []
    for doc in docs:
        # A '-PRON-' lemma is replaced by an empty string, as before.
        texts_out.append(" ".join(
            token.lemma_ if token.lemma_ not in ['-PRON-'] else ''
            for token in doc if token.pos_ in allowed_postags
        ))
    return texts_out

# Load the English spaCy pipeline without the parser and NER components.
# The 'en' shortcut was removed in spaCy 3, so try the full package name first
# and fall back to the shortcut for older installs.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser', 'ner'])
# Lemmatize the training documents, keeping only nouns and verbs.
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:2])

# Vectorizing and Creating the Model
I'll create the vector with the bag of words, and then apply TF-IDF to measure the importance of each word to the document. After that, I'll create the LDA Model.

In [None]:
# Bag-of-words counts over the lemmatized training documents.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # drop terms found in fewer than 10 documents
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens are runs of 3 or more alphanumeric chars
                             # max_features=50000,             # max number of uniq words
)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [None]:
# TF-IDF: learn the IDF weights from the count matrix and re-weight that same
# matrix in one step.
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
data_tfidf = tfidf.fit_transform(data_vectorized)

Finally, I'll build the LDA model. I won't do a grid search, due to the computational cost, but I'll try several values to find the best number of topics. After that, I'll print the model to check that lda_model contains the best one from the for loop.

In [None]:
# Build LDA Model: fit one model per candidate topic count and keep the one
# with the lowest perplexity. Perplexity is measured on the same matrix the
# model was fitted on.

lw_perp = np.inf                    # lowest perplexity seen so far
lda_model = None                    # model that achieved lw_perp
perps = []                          # perplexity per candidate, aligned with n_topics
n_topics = list(range(10, 55, 5))   # candidates: 10, 15, ..., 50

for topic in n_topics:
    lda = LatentDirichletAllocation(n_components=topic,               # Number of topics
                                          max_iter=10,               # Max learning iterations
                                          learning_method='online',   
                                          random_state=100,          # Random state
                                          batch_size=128,            # n docs in each learning iter
                                          evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                          n_jobs = -1,               # Use all available CPUs
                                         )
    lda.fit(data_tfidf)
    
    perp = lda.perplexity(data_tfidf)
    perps.append(perp)
    
    # "<=" means a later candidate wins a tie.
    if perp <= lw_perp:
        lw_perp = perp
        lda_model = lda

# The selected model is already fitted, so only project the documents onto its
# topics. fit_transform here would retrain the whole model a second time.
lda_output = lda_model.transform(data_tfidf)

plt.plot(n_topics, perps)
plt.xlabel('Number of Topics')
plt.ylabel('Perplexity')
plt.title('Perplexity for each number of topics')
plt.show()

In [None]:
# Show the selected estimator and its parameters.
print(lda_model)

Yes, we now have the LDA model with 10 components (topics).

Now, I want to see two things: first, which words are the most important for each topic (shown below using word clouds), and second, which topic each document is placed in.

In [None]:
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary, in the column order of ``components_``.
    lda_model : fitted LDA model
        Supplies ``components_`` (one row of term weights per topic).
    n_words : int
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement when available and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so argsort's ascending order lists the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Tabulate the 15 strongest keywords per topic: one row per topic, one column
# per keyword rank.
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)
df_topic_keywords = pd.DataFrame(topic_keywords)
n_rows, n_cols = df_topic_keywords.shape
df_topic_keywords.columns = [f'Word {rank}' for rank in range(n_cols)]
df_topic_keywords.index = [f'Topic {num}' for num in range(n_rows)]
df_topic_keywords

`df_topic_keywords` stores every topic together with its most important words. Let's see that in word clouds:

In [None]:
# Draw one word cloud per topic from that topic's top keywords.
stopwords = set(STOPWORDS)  # same for every topic, so build it once
for topic in df_topic_keywords.index.values:
    summ = df_topic_keywords.loc[topic, :].values
    all_summary = " ".join(summ)
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color='black', width=1600,
                          height=800).generate(all_summary)

    fig, ax = plt.subplots(figsize=(10, 4))
    # Render the image exactly once: a second plt.imshow() on the same axes
    # would redraw it without the bilinear interpolation.
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_axis_off()
    ax.set_title(topic)
    plt.show()

Now, I'll build a dataframe containing each title of news (document) with its respective topic:

In [None]:
# One column per topic, one row per training document (indexed by its title).
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

docnames = df_train['title']

# Probabilities are rounded to two decimals for display only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Pick the dominant topic from the unrounded probabilities: rounding can turn
# two close values into a tie, and argmax would then return the first column
# rather than the truly largest one.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic


# Finding the tags

The function I'll create follows this logic:
- 1: Do the data preprocessing
- 2: Apply the already trained vectorizer and tfidf models
- 3: Find which tags are best: compute a score for each word as (topic probability of the document) * (importance of the word for the topic). Words whose score reaches the threshold (0.01 by default; the examples below pass 0.001 and 0.0001) become tags.

In [None]:
def find_tags(data, vect=vectorizer, tfidf=tfidf, lda_model=lda_model, threshold=0.01):
    """Suggest tags for the first document in ``data``.

    A token of the document becomes a tag when, for at least one topic,
    (topic probability of the document) * (normalized weight of the token in
    that topic) is greater than or equal to ``threshold``.

    Parameters
    ----------
    data : list of str
        Raw document texts. Every text is preprocessed, but only the first
        one is scored.
    vect : fitted CountVectorizer
        Turns the lemmatized text into counts and supplies the vocabulary.
    tfidf : fitted TfidfTransformer
        Re-weights the counts before they reach the topic model.
    lda_model : fitted LatentDirichletAllocation
        Supplies the document-topic probabilities and the topic-word weights.
    threshold : float
        Minimum score for a token to be returned as a tag.

    Returns
    -------
    list of str
        Unique tags in alphabetical order; empty when ``data`` is empty.
    """
    if not data:
        return []

    # Same cleaning steps, in the same order, as the training data received.
    # Remove new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub(r"\'", "", sent) for sent in data]
    # Remove special characters
    data = [re.sub('[^A-Za-z0-9]+', ' ', sent) for sent in data]
    # Remove accentuation
    data = [unidecode(text) for text in data]

    data_words = list(sent_to_words(data))
    # NOTE(review): the training data was lemmatized with
    # allowed_postags=['NOUN', 'VERB'], while this call uses the function's
    # default POS set - confirm whether the difference is intended.
    lemmatized = lemmatization(data_words)

    ## TOPIC MODELLING
    counts = vect.transform(lemmatized)
    data_ready = tfidf.transform(counts)

    # Topic probabilities of the first document only.
    doc_topic_probs = lda_model.transform(data_ready)[0]

    # Normalize each topic's word weights so that they sum to 1.
    lda_components = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

    # Term -> column index of the vectorizer that was passed in (not the
    # notebook-level one). One dict lookup per word replaces rebuilding the
    # feature-name list and searching it for every (topic, word) pair.
    vocabulary = vect.vocabulary_

    tags = set()
    # Candidates are the raw tokens of the document, not their lemmas.
    for word in set(data_words[0]):
        column = vocabulary.get(word)
        # A word outside the vocabulary scores 0 for every topic.
        word_scores = lda_components[:, column] if column is not None else 0.0
        if np.any(doc_topic_probs * word_scores >= threshold):
            tags.add(word)

    return sorted(tags)

In [None]:
# Try the tagger on held-out description #1.
test = [df_test['description'].iloc[1]]
print('text:', test)
print('recommended tags:', find_tags(test, threshold=0.001))


In [None]:
# Try the tagger on held-out description #5, with a lower threshold.
test = [df_test['description'].iloc[5]]
print('text:', test)
print('recommended tags:', find_tags(test, threshold=0.0001))

In [None]:
# Try the tagger on held-out description #15.
test = [df_test['description'].iloc[15]]
print('text:', test)
print('recommended tags:', find_tags(test, threshold=0.001))

The recommended tags are usually good ones. I thought about other methods that would remove the dependency on the text (this model will always recommend words that are in the text), but I think that, since articles are usually long texts, this model would work well in production.