## Import delle librerie

In [1]:
import ast
import nltk
import pickle
import json

import pandas as pd
import matplotlib.pyplot as plt

import gensim
from gensim import corpora
from gensim import models

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from collections import Counter

## Lemmatizzazione

In [2]:
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma(token):
    return WordNetLemmatizer().lemmatize(token)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Eliminazione delle stopword

In [3]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def is_stopwords(token):
    if token in en_stop:
        return True
    return False

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenizzazione

In [4]:
nltk.download('punkt')

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Lettura e trasformazione dei dati

In [5]:
def transform(text):
    tokens = tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [6]:
data = pd.read_csv("../data/train_data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12549 entries, 0 to 12548
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         12549 non-null  int64 
 1   category           12549 non-null  object
 2   headline           12549 non-null  object
 3   authors            9792 non-null   object
 4   link               12549 non-null  object
 5   short_description  10752 non-null  object
 6   date               12549 non-null  object
dtypes: int64(1), object(6)
memory usage: 686.4+ KB


In [7]:
text_list = data["headline"].tolist()

cleaned_summary_list = [transform(element) for element in text_list]

In [8]:
print(cleaned_summary_list[0])

['shooting', 'texas']


## Vettorizzazione del corpus

In [9]:
dictionary = corpora.Dictionary(cleaned_summary_list)

corpus = [dictionary.doc2bow(text) for text in cleaned_summary_list]
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]

pickle.dump(corpus, open('../models/corpus_tfidf.pkl', 'wb'))
dictionary.save('../models/dictionary.gensim')

## Addestramento del modello

In [10]:
num_topics = 5

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=30, iterations=100,
                   chunksize = 10000, eval_every = 10, random_state=20)

ldamodel.save('../models/model5.gensim')

In [11]:
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.016*"shooting" + 0.012*"police" + 0.009*"killed" + 0.006*"prison" + 0.006*"photo" + 0.005*"suspect" + 0.005*"school" + 0.005*"arrested" + 0.005*"killing" + 0.005*"olympics"')
(1, '0.023*"apple" + 0.013*"space" + 0.012*"iphone" + 0.011*"photo" + 0.009*"video" + 0.008*"planet" + 0.007*"rumor" + 0.007*"science" + 0.006*"first" + 0.006*"earth"')
(2, '0.009*"found" + 0.007*"allegedly" + 0.007*"police" + 0.007*"woman" + 0.006*"study" + 0.006*"video" + 0.005*"player" + 0.004*"arrested" + 0.004*"football" + 0.004*"google"')
(3, '0.010*"facebook" + 0.010*"state" + 0.008*"video" + 0.006*"people" + 0.006*"world" + 0.006*"google" + 0.006*"final" + 0.005*"tournament" + 0.005*"player" + 0.004*"score"')
(4, '0.053*"video" + 0.015*"watch" + 0.010*"youtube" + 0.007*"show" + 0.005*"photo" + 0.005*"tiger" + 0.005*"world" + 0.005*"touchdown" + 0.004*"study" + 0.004*"lebron"')


## Test su nuovi dati

In [12]:
new_headline = 'Two person murdered in New York'

new_headline = transform(new_headline)

new_headline_bow = dictionary.doc2bow(new_headline)

result = ldamodel.get_document_topics(new_headline_bow)

result

[(0, 0.39967823),
 (1, 0.06758),
 (2, 0.39855242),
 (3, 0.066705465),
 (4, 0.06748388)]

In [13]:
ldamodel.print_topic(0, topn=10)

'0.016*"shooting" + 0.012*"police" + 0.009*"killed" + 0.006*"prison" + 0.006*"photo" + 0.005*"suspect" + 0.005*"school" + 0.005*"arrested" + 0.005*"killing" + 0.005*"olympics"'

## Aggiornamento del modello

In [14]:
temp_file = "../models/model5.gensim"

# Load a potentially pretrained model from disk.
lda = models.LdaModel.load(temp_file)

new_data = pd.read_csv("../data/new_data.csv")
new_text_list = new_data["headline"].tolist()

new_cleaned_summary_list = [transform(element) for element in text_list]

dictionary = corpora.Dictionary(new_cleaned_summary_list)
new_corpus = [dictionary.doc2bow(text) for text in cleaned_summary_list]

corpus_tfidf = tfidf[new_corpus]

lda.update(corpus_tfidf)

In [15]:
topics = lda.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.010*"shooting" + 0.006*"police" + 0.006*"killed" + 0.005*"prison" + 0.004*"suspect"')
(1, '0.016*"apple" + 0.009*"iphone" + 0.008*"space" + 0.006*"photo" + 0.006*"rumor"')
(2, '0.005*"found" + 0.004*"allegedly" + 0.004*"police" + 0.004*"study" + 0.004*"woman"')
(3, '0.007*"facebook" + 0.006*"state" + 0.004*"people" + 0.004*"google" + 0.004*"world"')
(4, '0.025*"video" + 0.015*"youtube" + 0.013*"watch" + 0.004*"tiger" + 0.004*"show"')


## Visualizzazione con pyLDAvis

In [16]:
lda_display = gensimvis.prepare(lda, corpus_tfidf, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)