In [1]:
import pandas as pd
from time import time
import re
import string
import joblib

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [3]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from cleantext import clean
import pyLDAvis
import matplotlib.pyplot as plt
import pyLDAvis.sklearn

In [4]:
RANDOM_SEED=123
N_FEATURES = 1000

In [5]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'same', "hadn't", 'doesn', 'again', 'between', 'the', 'more', 'hers', 'to', 'my', 'both', 'has', "you're", 'or', 're', 'needn', 'over', 'that', 'have', 'because', 'won', 'here', 'd', 'doing', 's', 'don', 'i', 'theirs', 'he', 'be', 'nor', 'each', 'where', 'not', 'ourselves', 'what', "you'd", 'been', 'who', 'll', 'yourself', 'should', "don't", "should've", 'myself', "shan't", 'for', "couldn't", 'but', 'above', 'as', 'shan', 'when', 'by', 'after', 'were', "it's", 'his', 'isn', 'o', "doesn't", 'ma', 'me', 'why', 'of', 'they', 'it', "she's", 'about', "shouldn't", 'up', 'and', 'an', "wouldn't", 'she', 'did', 'weren', "won't", 'will', 'most', 'further', 'only', 'at', 'themselves', 'you', "that'll", 'into', 'too', "weren't", 'out', 'aren', 'with', 'if', 'these', 'herself', 'now', "you've", 'few', 'other', 'their', 'in', 'its', 'there', 'mustn', 'while', 'below', 'itself', 'such', 'yours', 'couldn', 'are', 'himself', 'being', 'all', 'am', 'having', 'which', 'during', 'y', 'own', 'we', 'is', 't

## Очистка текстов

In [6]:
def tolower(s):
    return s.lower()

def remove_code_blocks(s):
    return re.sub(r"```[^\S\r\n]*[a-z]*\n.*?\n```", '', s, 0, re.DOTALL)

def remove_long_words(s, good_len=15):
    return " ".join([word for word in s.split() if len(word) < good_len])

def remove_stopw(s):
    return " ".join([word for word in s.split() if word not in stop_words])

def remove_custom_stopw(s):
    stopw = ["line", "file", "jetbrains", "idea", "pycharm", "type", "import", "class"]
    stopw += ["type", "int", "float", "python", "run", "traceback", "version", ""]
    
    return " ".join([word for word in s.split() if word not in stopw])

def remove_unicode(s):
    text_encode = s.encode(encoding="ascii", errors="ignore")
    text_decode = text_encode.decode()
    clean_text = " ".join([word for word in text_decode.split()])
    return clean_text

def remove_stuff(s):
    s = re.sub(r"@\S+", "", s)
    s = re.sub(r"\$", "", s)
    s = re.sub(r"https?:\/\/.*[\r\n]*", "", s)
    s = re.sub(r"#", "", s)
    punct = set(string.punctuation) 
    s = "".join([ch for ch in s if ch not in punct])
    return s

def stem_lem(s):
    stemmer = PorterStemmer()
    s = " ".join([stemmer.stem(word) for word in s.split()])
    return s

def remove_java_packages(s):
     return " ".join([word for word in s.split() if not len(word.split('.')) > 2])
    
def clear_str(s):
    s = remove_code_blocks(s)
    s = tolower(s)
    s = remove_java_packages(s)
    s = remove_stopw(s)
    s = remove_custom_stopw(s)
    s = remove_unicode(s)
    s = remove_stuff(s)
    s = stem_lem(s)
    s = remove_long_words(s)
    return s

In [7]:
data = pd.read_json('pycharm_issues.json', lines=True)

In [8]:
def clean_data_descr(s):
    if not s:
        return ""
    s = clear_str(s)
    
    s = clean(f"{s}",
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # remove punctuations
    no_emoji=True,
    
    replace_with_punct="",          # instead of removing punctuations you may replace them
    replace_with_url="",
    replace_with_email="",
    replace_with_phone_number="",
    replace_with_number="",
    replace_with_digit="",
    replace_with_currency_symbol="",
    lang="en"                       # set to 'de' for German special handling
)
    return s

In [9]:
cleaning_lambda = lambda d: clean_data_descr(d)

In [10]:
t0 = time()

data["str_data"] = data["summary"].astype(str) + data["description"].astype(str)
data["str_data"] = data["str_data"].apply(cleaning_lambda)


time() - t0

25.19657301902771

### Получим данные про версии 2020.2 и 2020.3

In [11]:
v_2020_2 = data['Affected versions'].apply(pd.Series).eq('2020.2').any(1)
data_v2020_2_str = data[v_2020_2]["str_data"]

v_2020_3 = data['Affected versions'].apply(pd.Series).eq('2020.3').any(1)
data_v2020_3_str = data[v_2020_3]["str_data"]

  v_2020_2 = data['Affected versions'].apply(pd.Series).eq('2020.2').any(1)
  v_2020_3 = data['Affected versions'].apply(pd.Series).eq('2020.3').any(1)


In [12]:
t0 = time()

data["str_data"] = data["summary"].astype(str) + data["description"].astype(str)
data_docs_cleaned = data["str_data"].apply(cleaning_lambda)


time() - t0

29.442588806152344

In [13]:
data_samples = list(data_docs_cleaned)

### Векторизуем текст

In [14]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95, min_df=2, max_features=N_FEATURES, stop_words="english"
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 0.923s.



In [15]:
def grid_search_lda(search_params):
    lda = LatentDirichletAllocation(random_state=RANDOM_SEED)
    model = GridSearchCV(lda, param_grid=search_params, verbose=10)
    model.fit(tf)
    
    best_lda_model = model.best_estimator_

    print("Best Model's Params: ", model.best_params_)

    print("Best Log Likelihood Score: ", model.best_score_)

    print("Model Perplexity: ", best_lda_model.perplexity(tf))
    
    joblib.dump(best_lda_model, 'best_lda.pkl')
    return best_lda_model

In [16]:
def visualise_topics(data, model, tf_vectorizer, name):
    data_tf = tf_vectorizer.transform(data)
    pyLDAvis.enable_notebook()
    visualization = pyLDAvis.sklearn.prepare(model, data_tf, tf_vectorizer)
    pyLDAvis.save_html(visualization, name)
    return visualization

In [17]:
# search_params = {'n_components': [5, 6, 7, 8, 9, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
search_params = {'n_components': [5, 8, 10, 12, 15, 20]}
model = grid_search_lda(search_params)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START n_components=5..............................................
[CV 1/5; 1/6] END .........n_components=5;, score=-850971.094 total time= 1.0min
[CV 2/5; 1/6] START n_components=5..............................................
[CV 2/5; 1/6] END .........n_components=5;, score=-892233.097 total time= 1.1min
[CV 3/5; 1/6] START n_components=5..............................................
[CV 3/5; 1/6] END .........n_components=5;, score=-887515.008 total time= 1.1min
[CV 4/5; 1/6] START n_components=5..............................................
[CV 4/5; 1/6] END .........n_components=5;, score=-928375.728 total time= 1.0min
[CV 5/5; 1/6] START n_components=5..............................................
[CV 5/5; 1/6] END .........n_components=5;, score=-904276.174 total time=  59.7s
[CV 1/5; 2/6] START n_components=8..............................................
[CV 1/5; 2/6] END .........n_components=8;, score

In [18]:
visualise_topics(data_v2020_2_str, model, tf_vectorizer, "v2020_2.html")

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [19]:
visualise_topics(data_v2020_3_str, model, tf_vectorizer, "v2020_3.html")

  default_term_info = default_term_info.sort_values(


## Вывод

Если не фильтровать текст кода, то токены из обрывков кода начинают заполнять все топики, поэтому приходится его полностью удалить.  

При обучении на количестве топиков от 5 до 20, лучший результат обучения по perplexity получился на 15 топиках.  
Однако, если посмотреть на визуализацию топиков для 2 разных релизов, то, во-первых, разделение на топики практически одинаковое, а, во-вторых, топики сложно интерпретировать и отделить друг от друга.  
Хотя, можно сделать вывод, что 12-я тема из второй визуализации ответственна за UI, тогда как темы в группе слева состоят из каких-то самых популярных слов и терминов для любого проекта.