In [None]:
import requests, json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import nltk

In [None]:
# Path of the local JSON file used to cache the downloaded entries.
FILE_NAME = "entries.json"
# Whether the entries have to be downloaded again; recomputed by the sync
# cell whenever the cache file already exists.
SYNC_REQUIRED = True
# NOTE(review): not referenced in any visible cell — presumably toggles a
# similarity computation elsewhere; confirm or remove.
CALC_SIMILARITY = False

In [None]:
# Column layout of the entries table: article id, article text, list of tags.
columns = ['art_id', 'text', 'tags']
# Empty placeholder frame; the sync cell replaces it with cached or downloaded data.
entries = pd.DataFrame(columns=columns)
# Pagination offset (number of entries already fetched) for the API requests.
i = 0

In [None]:
# Load the cached entries when they are still in sync with the API;
# otherwise download every entry page by page and refresh the cache file.
_ENTRIES_URL = "https://cybernews.rrzatkie.xyz/api/entries/?offset={}"


def _fetch_entries_page(offset):
    """Fetch one page of the entries API.

    Parameters
    ----------
    offset : int
        Zero-based index of the first entry on the requested page.

    Returns
    -------
    dict
        Decoded JSON payload; this cell reads its "count" and "results" keys.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    resp = requests.get(_ENTRIES_URL.format(offset))
    # Fail loudly on HTTP errors instead of on a confusing JSON/Key error later.
    resp.raise_for_status()
    return json.loads(resp.text)


def _entry_to_row(entry):
    """Convert one API entry into a row laid out like `columns`."""
    return ["id-{}".format(entry['id']), entry["article"]["text"], entry["tags"]]


entries_json_file = Path(FILE_NAME)

if entries_json_file.is_file():
    # Only the server-side total is needed here, so the first page is enough.
    count = int(_fetch_entries_page(0)["count"])

    entries = pd.read_json(entries_json_file)
    cur_count = len(entries.values)

    # The cache is stale whenever its size differs from the server-side total.
    SYNC_REQUIRED = (cur_count != count)

if SYNC_REQUIRED:
    # Always start from offset 0 with a local counter: relying on a
    # notebook-level counter would make a re-run of this cell start past the
    # end and overwrite the cache with an empty table.
    offset = 0
    # Rows are collected in a list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []

    first_page = _fetch_entries_page(offset)
    count = int(first_page["count"])
    page = first_page["results"]

    with tqdm(total=count) as pbar:
        # An empty page ends the loop even if `count` has not been reached,
        # which prevents an endless loop when the server returns fewer
        # entries than announced.
        while page:
            rows.extend(_entry_to_row(entry) for entry in page)
            offset += len(page)
            # Advance by the size of the page just processed (the last page
            # may be shorter than the first one).
            pbar.update(len(page))
            if offset >= count:
                break
            page = _fetch_entries_page(offset)["results"]

    entries = pd.DataFrame(rows, columns=columns)
    print("Expected: {}, got: {}".format(count, len(entries.values)))

    # write_text opens, writes and closes the file in one step.
    entries_json_file.write_text(entries.to_json())

In [None]:
# Number of elements in each row's `tags` value (`.str.len()` also works on list cells).
entries['tags_count'] = entries.tags.str.len()

In [None]:
# Split the entries by whether they carry at least one tag.
# Explicit copies are taken because later cells add columns to
# `entries_with_tags`; assigning to a boolean-filtered frame that is not a
# copy triggers pandas' SettingWithCopyWarning and may not behave as intended.
entries_with_tags = entries[entries.tags_count != 0].copy()
entries_without_tags = entries[entries.tags_count == 0].copy()

In [None]:
# Display the entries that have at least one tag.
entries_with_tags

In [None]:
# Display the entries that have no tags.
entries_without_tags

In [None]:
import dill as pickle

In [None]:
# Download the tag catalogue and keep only the tag names.
# NOTE(review): only the "results" of this single response are read — if the
# endpoint is paginated, later pages are not fetched; confirm.
tags_response = requests.get("https://cybernews.rrzatkie.xyz/api/tags")
tags_payload = json.loads(tags_response.text)
tags = [tag_record['name'] for tag_record in tags_payload['results']]

In [None]:
# Open the pickled model pipeline for binary reading; the handle is consumed by the next cell.
f = open('svc_model_pipeline.pickle', 'rb')

In [None]:
# Deserialize the trained pipeline from the already opened handle `f`, and
# close the handle afterwards so the file descriptor is not leaked.
# NOTE(security): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
try:
    model_lsvc = pickle.load(f)
finally:
    f.close()

In [None]:
import nltk

In [None]:
# Load the pickled classification pipeline; the context manager closes the
# file as soon as deserialization is done.
# NOTE(security): unpickling can execute arbitrary code — only load trusted files.
with Path('svc_model_pipeline.pickle').open('rb') as f:
    model_lsvc = pickle.load(f)
print('model loaded OK.')

In [None]:
# Predict tags for the text of the first tagged entry (the pipeline expects a
# sequence of documents, hence the one-element list).
first_tagged_text = entries_with_tags.text.values[0]
results = model_lsvc.predict([first_tagged_text])

In [None]:
# Show the dimensions of the prediction output.
print(results.shape)

In [None]:
# tags = [[tags[idx] for idx, topic in enumerate(doc) if topic == 1] for doc in results]

In [None]:
# Map every non-zero column of the prediction to the tag name at that position.
predicted_columns = results.nonzero()[1]
results_tags = [tags[column] for column in predicted_columns]

In [None]:
# Display the tag names predicted for the first tagged entry.
results_tags

In [None]:
# Mapping tag name -> occurrence count; filled by the following cells.
distinct_tags = {}

In [None]:
# Register every known tag name with a count of zero.
distinct_tags.update(dict.fromkeys(tags, 0))

In [None]:
# Display the tag counters (all zero at this point when cells run in order).
distinct_tags

In [None]:
# Count how many tagged entries carry each tag.
# Reset the counters first so that re-running this cell does not add the
# same occurrences a second time.
for tag in distinct_tags:
    distinct_tags[tag] = 0

# The loop variable is deliberately not called `tags`: that name holds the
# list of tag names fetched from the API, and rebinding it here would break
# any later (re-)run of the cells that read that list.
for entry_tags in entries_with_tags.tags.values:
    for tag in entry_tags:
        distinct_tags[tag] += 1

In [None]:
# Display the per-tag occurrence counts.
distinct_tags

In [None]:
# Summary statistics of the number of tags per tagged entry.
entries_with_tags.tags_count.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

In [None]:
# Bar labels and heights: one bar per tag plus a final 'NONE' bar holding the
# number of entries without any tag.
names = [*distinct_tags.keys(), 'NONE']
values = [*distinct_tags.values(), len(entries_without_tags.tags.values)]

In [None]:
# Bar chart of how many entries carry each tag ('NONE' = entries without tags).
# The explicit figure/axes interface is used and the chart is given a title
# and axis labels so that it can be understood on its own.
fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(names, values)
ax.set_title("Number of entries per tag")
ax.set_xlabel("Tag")
ax.set_ylabel("Number of entries")
# Tag names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# First step of the loaded pipeline (`steps` holds (name, estimator) pairs).
preprocessor = model_lsvc.steps[0][1]

In [None]:
# Display the preprocessor's `lemmatize` attribute.
# NOTE(review): the preprocessor class is not defined in this notebook — whether
# this is a flag or a method cannot be told from here.
preprocessor.lemmatize

In [None]:
# Display all steps of the loaded pipeline.
model_lsvc.steps

In [None]:
# Run the pipeline's first step over the article texts and store the result in a new column.
# NOTE(review): `entries_with_tags` comes from boolean filtering of `entries`;
# assigning a column to it can raise pandas' SettingWithCopyWarning unless it
# is an explicit copy.
entries_with_tags['lemmatized_text'] = preprocessor.transform(entries_with_tags['text'].values)

In [None]:
# Second step of the loaded pipeline.
# NOTE: `vectorizer` is rebound to `cv_vectorizer` a few cells below before it is used.
vectorizer = model_lsvc.steps[1][1]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer that is fitted on the lemmatized article text further below.
cv_vectorizer = CountVectorizer(
    analyzer='word',       
    min_df=10,                        # ignore words that occur in fewer than 10 documents
    stop_words='english',             # remove English stop words
    lowercase=True,                   # convert all words to lowercase
    token_pattern='[a-zA-Z0-9]{3,}',  # tokens are runs of at least 3 alphanumeric characters
    # max_features=50000             ,# max number of unique words
)

# NOTE(review): presumably converts the preprocessor's output back into plain
# strings that CountVectorizer can tokenize — confirm against the preprocessor
# class, which is not defined in this notebook.
entries_with_tags['lemmatized_text'] = preprocessor.inverse_transform(entries_with_tags['lemmatized_text'].values)

In [None]:
# Use the freshly configured CountVectorizer instead of the pipeline's own vectorizer step.
vectorizer = cv_vectorizer

In [None]:
# Learn the vocabulary and build the document-term count matrix from the lemmatized texts.
data_vectorized = vectorizer.fit_transform(entries_with_tags['lemmatized_text'].values)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Init the Model
# Hyper-parameters are passed to the constructor (the scikit-learn convention)
# instead of being patched onto the instance afterwards, and the seed is fixed
# so that Restart & Run All reproduces the same topics.
lda_model = LatentDirichletAllocation(
    n_components=10,     # number of topics to extract
    learning_decay=0.9,  # learning-rate decay; per the scikit-learn docs it applies to the online learning method
    random_state=0,      # fixed seed for reproducible topics
)

# Create Document - Topic Matrix
lda_output = lda_model.fit_transform(data_vectorized)


In [None]:
# Vocabulary of the fitted vectorizer, indexed by feature column.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Print the ten highest-weighted words of every LDA topic.
for topic_idx, topic in enumerate(lda_model.components_):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending, so reverse it and keep the first ten indices.
    top_word_indices = topic.argsort()[::-1][:10]
    top_words = [feature_names[word_idx] for word_idx in top_word_indices]
    print(" ".join(top_words))
    print()

In [None]:
from gensim.parsing.porter import PorterStemmer

In [None]:
# Instantiate gensim's Porter stemmer for a quick experiment.
p = PorterStemmer()

In [None]:
# Display the stem the Porter stemmer produces for a sample word.
p.stem('achieve')