ML

Загрузим и нормализуем тренировочные и тестовые данные

In [1]:
import pandas as pd

In [2]:
# функция загрузки датасета

def load(path):
    comment_data = pd.read_json(path, lines=True, orient="records", encoding="windows-1251")
    comment_data.rename({"text":"post"}, inplace=True, axis=1)
    comment_data.set_index("post", inplace=True)
    comment_data = comment_data.explode(column="comments")
    comments = pd.json_normalize(comment_data["comments"])
    comments.set_index(comment_data.index, inplace=True)
    comment_data = pd.concat([comment_data, comments], axis=1)
    comment_data.drop("comments", axis=1, inplace=True)
    comment_data.reset_index(inplace=True)
    comment_data['text'] = comment_data['text'].str.replace("&#x27;", "'")
    del comments
    return comment_data

In [3]:
train_df = load("data/ranking_train.jsonl")
test_df = load("data/ranking_test.jsonl")

Обработаем текст комментариев следующим образом:

- уберем стоп-слова
- уберем пунктуацию
- уберем цифры
- понизим регистр всех слов
- приведем слова в начальную форму

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
import string

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    text = w_tokenizer.tokenize(text)
    pos_tags = pos_tag(text)
    return ' '.join([lemmatizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in pos_tags])

def transform(df):
    stop_words = stopwords.words('english')
    pattern = r'\b(?:{})\b'.format('|'.join(stop_words))
    punc_pattern = re.compile(f'[{string.punctuation}]')
    num_pattern = re.compile('\d')
    df.loc[:, "modified_text"] = df.loc[:, 'text'].str.replace(pattern, '', regex=True)
    df.loc[:, "modified_text"] = df.loc[:, 'modified_text'].str.replace(punc_pattern, '', regex=True)
    df.loc[:, "modified_text"] = df.loc[:, 'modified_text'].str.replace(num_pattern, '', regex=True)
    df.loc[:, "modified_text"] = df.loc[:, 'modified_text'].str.lower()
    df.loc[:, "modified_text"] = df.loc[:, 'modified_text'].apply(lemmatize_text)

In [5]:
transform(train_df)

In [6]:
transform(test_df)

Отложим 88100 строк на тест(нужно число, кратное 5, так как у нас гарантированы группы по 5 элементов, и довольно большое, поэтому выбрал такое)

In [None]:
test_pred = train_df.iloc[0:88100]
train_df.drop(range(88101), inplace=True)

Векторизируем нормализованные комментарии для обучения с помощью `TfidfVectorizer()` (его преимушество - учет частоты слов: чем чаще встречается слово, тем менее значимым оно становится)

Затем разделим данные на тренировочные и валидационные

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

vect = TfidfVectorizer()

X = vect.fit_transform(train_df.loc[:, "modified_text"])

x_train, x_val, y_train, y_val = train_test_split(X, train_df[["post", "score"]], test_size=0.2, random_state=7)

# подсчет групп по постам в тренировочных и валидационных данных
query_train = y_train.groupby("post")["post"].count().to_numpy()
query_val = y_val.groupby("post")["post"].count().to_numpy()


Используемая модель - `LGBMRanker()` из [библиотеки](https://lightgbm.readthedocs.io/en/latest/Python-API.html) lightgbm

Это ранжирующая модель, на основе на градиентного бустинга

Натренируем её

In [42]:
from lightgbm import LGBMRanker

model_return = LGBMRanker(n_estimators=100,
                          random_state=7,
                          num_leaves=41,
                          learning_rate=0.002,
                          colsample_bytree=0.7)

model_return.fit(x_train, y_train.loc[:, "score"],
             group = query_train,
             eval_set=[(x_val, y_val.loc[:, "score"])],
             eval_group=[query_val],
                )

[1]	valid_0's ndcg@1: 0.891006	valid_0's ndcg@2: 0.951045	valid_0's ndcg@3: 0.960688	valid_0's ndcg@4: 0.961527	valid_0's ndcg@5: 0.96156
[2]	valid_0's ndcg@1: 0.893764	valid_0's ndcg@2: 0.952172	valid_0's ndcg@3: 0.961598	valid_0's ndcg@4: 0.962484	valid_0's ndcg@5: 0.962522
[3]	valid_0's ndcg@1: 0.894147	valid_0's ndcg@2: 0.952294	valid_0's ndcg@3: 0.961707	valid_0's ndcg@4: 0.962601	valid_0's ndcg@5: 0.962633
[4]	valid_0's ndcg@1: 0.895151	valid_0's ndcg@2: 0.952622	valid_0's ndcg@3: 0.962082	valid_0's ndcg@4: 0.96294	valid_0's ndcg@5: 0.962986
[5]	valid_0's ndcg@1: 0.895363	valid_0's ndcg@2: 0.952736	valid_0's ndcg@3: 0.962117	valid_0's ndcg@4: 0.963014	valid_0's ndcg@5: 0.963062
[6]	valid_0's ndcg@1: 0.895984	valid_0's ndcg@2: 0.952983	valid_0's ndcg@3: 0.962371	valid_0's ndcg@4: 0.963232	valid_0's ndcg@5: 0.963279
[7]	valid_0's ndcg@1: 0.895207	valid_0's ndcg@2: 0.952768	valid_0's ndcg@3: 0.962109	valid_0's ndcg@4: 0.962992	valid_0's ndcg@5: 0.963039
[8]	valid_0's ndcg@1: 0.89539

LGBMRanker(colsample_bytree=0.7, learning_rate=0.002, num_leaves=41,
           random_state=7)

In [43]:
model_return.best_score_

defaultdict(collections.OrderedDict,
            {'valid_0': OrderedDict([('ndcg@1', 0.89674952006269),
                          ('ndcg@2', 0.9535211635234987),
                          ('ndcg@3', 0.9627557103056913),
                          ('ndcg@4', 0.9635996044674128),
                          ('ndcg@5', 0.9636424010376505)])})

ndcg@k везде 0.9 и выше, что является очень хорошим знаком

Протестируем модель на отложенных данных и измерим ndcg из библиотеки scikit-learn

In [45]:
# предсказываем
predictions = model_return.predict(vect.transform(test_pred.loc[:, "modified_text"]))
print(predictions)

[-0.08808458 -0.01940743 -0.02276948 ... -0.01271672  0.00357423
 -0.02010641]


Модель вернула не готовые категории популярности, а абстрактные рейтинги

Теперь задача простая - пройтись по пятеркам предсказанных значений, ранжировать по рейтингу и сохранить рейтинги в отдельном списке

In [89]:
# ранжируем
def get_ranks(arr):
    predicted_scores = []
    for i in range(0, len(arr), 5):
        group = predictions[i:i+5]
        sorted_group = sorted(group, reverse=True)
        ranking_dict = {v:ind for ind,v in enumerate(sorted_group)}
        for j in group:
            predicted_scores.append(ranking_dict[j])
    return predicted_scores

predicted_scores = get_ranks(predictions)

In [90]:
# замеряем результат
from sklearn.metrics import ndcg_score

ndcg_score([test_pred.loc[:, "score"]], [predicted_scores])

0.9187150671938966

Скор - 0.92. Модель довольно успешна в ранжировании

Предскажем тестовые данные, которые были даны кейсом

In [91]:
# предсказываем
real_predictions = model_return.predict(vect.transform(test_df.loc[:, "modified_text"]))
print(real_predictions)

[ 0.04482772  0.04334655  0.04086538 ...  0.03680324 -0.02722701
  0.01779177]


In [92]:
# помещаем результаты в фрейм
test_df["score"] = get_ranks(real_predictions)
test_df

Unnamed: 0,post,text,score,modified_text
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",I'm still waiting for them to stabilize wifi o...,4,i still wait stabilize wifi ipad sith ios thei...
1,"iOS 8.0.1 released, broken on iPhone 6 models,...","For those who upgraded, no need to do a restor...",2,for upgraded need restore you optionclick quot...
2,"iOS 8.0.1 released, broken on iPhone 6 models,...",Upgraded shortly after it was released and suf...,3,upgrade shortly release suffered consequence j...
3,"iOS 8.0.1 released, broken on iPhone 6 models,...",I think they were under a lot of pressure on t...,1,i think lot pressure healthkit front that one ...
4,"iOS 8.0.1 released, broken on iPhone 6 models,...",Fix for those who already updated: http:&#x2F...,0,fix already updated httpxfxfwwwimorecomxfioski...
...,...,...,...,...
70015,Why does Gmail hate my domain?,I send a LOT of emails each month (email newsl...,4,i send lot email month email newsletter busine...
70016,Why does Gmail hate my domain?,I hit a similar problems when sending automate...,2,i hit similar problem send automate internal e...
70017,Why does Gmail hate my domain?,That's all a bit presumptive and inflammatory ...,0,that bite presumptive inflammatory amount pure...
70018,Why does Gmail hate my domain?,If the domain is bitbin.de and the mail server...,1,if domain bitbinde mail server host server pro...


Выберем нужные колонки и преобразуем фрейм в .jsonl файл с такой же структурой, как ranking_test.jsonl

In [93]:
# извлекаем
solution = test_df[["post", "text", "score"]]
solution

Unnamed: 0,post,text,score
0,"iOS 8.0.1 released, broken on iPhone 6 models,...",I'm still waiting for them to stabilize wifi o...,4
1,"iOS 8.0.1 released, broken on iPhone 6 models,...","For those who upgraded, no need to do a restor...",2
2,"iOS 8.0.1 released, broken on iPhone 6 models,...",Upgraded shortly after it was released and suf...,3
3,"iOS 8.0.1 released, broken on iPhone 6 models,...",I think they were under a lot of pressure on t...,1
4,"iOS 8.0.1 released, broken on iPhone 6 models,...",Fix for those who already updated: http:&#x2F...,0
...,...,...,...
70015,Why does Gmail hate my domain?,I send a LOT of emails each month (email newsl...,4
70016,Why does Gmail hate my domain?,I hit a similar problems when sending automate...,2
70017,Why does Gmail hate my domain?,That's all a bit presumptive and inflammatory ...,0
70018,Why does Gmail hate my domain?,If the domain is bitbin.de and the mail server...,1


In [98]:
# сделаем jsonl файл
import jsonlines
import numpy as np

future_jsonl = []
post_set, post_ind = np.unique(solution[["post"]], return_index=True)
post_set = post_set[np.argsort(post_ind)]
for element in post_set:
    df_frac = solution[solution["post"]==element]
    comments = df_frac[["text", "score"]].to_dict("records")
    json_sample = {"text":element, "comments":comments}
    future_jsonl.append(json_sample)

with jsonlines.open("data/ranking_result.jsonl", mode='w') as jsonl_writer: 
    jsonl_writer.write_all(future_jsonl)