# ДЗ3

## Импорт библиотек, задание параметров

In [None]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.4 MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 8.1 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [None]:
import re

import nltk
import numpy as np
import pandas as pd
import pymorphy2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing  import StandardScaler
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

nltk.download('stopwords')



# A token is a run of lowercase Cyrillic letters; "ё" is listed explicitly
# because it lies outside the "а-я" range.
TOKEN_PATTERN = "[а-яё]+"
# Shared pymorphy2 analyzer used for lemmatization further down.
lemmatizer = pymorphy2.MorphAnalyzer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Words added by hand on top of NLTK's Russian stop-word list.
extra_stopwords = {'это', 'который', 'весь', 'наш', 'свой', 'ещё', 'её', 'ваш', 'также', 'итак'}
# Final stop-word set: the NLTK list plus the extras above.
stopword_set = set(nltk.corpus.stopwords.words('russian')) | extra_stopwords


In [None]:
# Locations of the competition CSV files.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/alexey190900/MLIntro2022_Spring/main/lecture05/kaggle/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/alexey190900/MLIntro2022_Spring/main/lecture05/kaggle/test.csv"

# Download both tables straight into DataFrames.
train_df = pd.read_csv(TRAIN_CSV_URL)
test_df = pd.read_csv(TEST_CSV_URL)

In [None]:
# Disabled: dropping an "id" column from both frames.
# NOTE(review): the `head()` previews of the loaded frames show no "id" column
# (the first column is "Unnamed: 0"), so re-enabling these lines as written
# would presumably raise a KeyError — confirm before uncommenting.
# train_df = train_df.drop(["id"], axis=1)
# test_df = test_df.drop(["id"], axis=1)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,url,title,target
0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False


In [None]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,url,title
0,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,2gis.ru,67


## Формирование выборок

In [None]:
# Disabled experiment: stripping common URL parts ("www.", TLDs) from `url`.
# NOTE(review): `Series.replace` with a list matches whole cell values rather
# than substrings, and the result is not assigned back, so this would have no
# effect as written; `.str.replace` with a regex would be needed instead.
# stopword_url = [".com", ".ru", ".org", "www.", ".ua", ".su"];
# train_df.url.replace(stopword_url, "")

In [None]:
def _url_with_title(frame):
    """Join the `url` and `title` columns of `frame` with a single space.

    Missing values are replaced by empty strings first. Without that,
    `str + NaN` yields NaN for the whole row, which later breaks `.lower()`
    in `tokenize` and is rejected by the scikit-learn text vectorizers.

    Returns a Series with one string per row of `frame`.
    """
    return frame["url"].fillna("") + " " + frame["title"].fillna("")


# Text used by every model: domain name followed by the page title.
train_df["full"] = _url_with_title(train_df)
test_df["full"] = _url_with_title(test_df)

# Plain arrays for scikit-learn; the boolean target becomes 0/1.
x_train = train_df["full"].values
y_train = train_df["target"].astype(int).values
x_test = test_df["full"].values

## Обработка

In [None]:
def tokenize(text):
    """Lower-case `text` and return every match of TOKEN_PATTERN, in order.

    Characters the pattern does not match act as separators and are dropped.
    """
    lowered = text.lower()
    tokens = re.findall(TOKEN_PATTERN, lowered)
    return tokens

# Tokenize every training text; `docs[i]` is the token list of `x_train[i]`.
docs = list(map(tokenize, x_train))

In [None]:
# The `lemmatizer` analyzer created in the imports cell is reused here; building
# a second MorphAnalyzer only repeated the work without changing anything.

# Memo of token -> lemma, filled by `lemmatize`.
lemmatizer_cache = {}

def lemmatize(token):
    """Return the normal form of `token`, or `token` itself if it is unknown.

    A token known to the analyzer is mapped to the normal form of its first
    parse; any other token is returned unchanged. Results are memoized in
    `lemmatizer_cache`, and the cache is consulted before the analyzer, so a
    repeated token (known or not) costs one dictionary lookup.
    """
    if token not in lemmatizer_cache:
        if lemmatizer.word_is_known(token):
            lemma = lemmatizer.parse(token)[0].normal_form
        else:
            # Out-of-dictionary tokens are kept as they are.
            lemma = token
        lemmatizer_cache[token] = lemma
    return lemmatizer_cache[token]
# Lemmatize every token of every tokenized document, with a progress bar.
# `tqdm` here is `tqdm.auto.tqdm`; it replaces the deprecated `tqdm_notebook`.
x_train_lemmatized = [[lemmatize(token) for token in text] for text in tqdm(docs)]

# Drop stop words from each lemmatized document and join what is left back
# into a single space-separated string.
x_train_cleared = [
    " ".join(lemma for lemma in lemmas if lemma not in stopword_set)
    for lemmas in x_train_lemmatized
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


  0%|          | 0/135309 [00:00<?, ?it/s]

### vectorizer

In [None]:
# Baseline: default bag-of-words counts fed into multinomial naive Bayes.
model = MultinomialNB()
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the raw `x_train` strings (not `x_train_cleared`)
# and build the document-term count matrix for them.
x_train_vectorized = vectorizer.fit_transform(x_train)

In [None]:
# Fit the baseline and predict on the very same matrix, so `y_pred` holds
# training-set predictions.
y_pred = model.fit(x_train_vectorized, y_train).predict(x_train_vectorized)

In [None]:
# F1 of the baseline on the training data it was fitted on.
f1_score(y_true=y_train, y_pred=y_pred)

0.9250423968343697

### PL: CountVectorizer, SGDClassifier

In [None]:
# Word-level pipeline: counts of Cyrillic tokens (TOKEN_PATTERN) without stop
# words, classified by logistic regression trained with SGD.
count_model = make_pipeline(
    CountVectorizer(
        lowercase=True,
        token_pattern=TOKEN_PATTERN,
        # `stop_words` is documented as 'english', a list or None; recent
        # scikit-learn releases validate the type and reject a set. Sorting
        # only makes the list deterministic.
        stop_words=sorted(stopword_set),
    ),
    SGDClassifier(
        # Explicit integer seed; the former `False` is just the integer 0.
        random_state=0,
        # 'log_loss' is the current name of the logistic loss; the old alias
        # 'log' was deprecated in scikit-learn 1.1 and later removed.
        loss='log_loss',
        class_weight='balanced',
    ),
)

In [None]:
# Train on the raw url+title strings and score on the same data
# (this is a training-set F1).
count_model.fit(x_train, y_train)
raw_train_pred = count_model.predict(x_train)
f1_score(y_train, raw_train_pred)

0.6284205289142332

In [None]:
# Refit the same pipeline on the lemmatized, stop-word-free texts and score it
# on them (again a training-set F1). This overwrites the previous fit.
count_model.fit(x_train_cleared, y_train)
cleared_train_pred = count_model.predict(x_train_cleared)
f1_score(y_train, cleared_train_pred)

0.6482655136330432

### PL:CountVectorizer, LogisticRegression

In [None]:
# Disabled experiment: word counts + LogisticRegression.
# NOTE(review): `make_pipeline` takes bare estimators; the ('clf', ...) tuple
# form belongs to `Pipeline([...])`, so this snippet needs fixing before it can
# be re-enabled.
# count_lr_model = make_pipeline(
#         CountVectorizer(
#             lowercase=True, token_pattern=TOKEN_PATTERN,
#             stop_words=stopword_set
#         ),
#     ('clf', LogisticRegression(random_state=False, class_weight='balanced'))
# )

### TfidfVectorizer

#### 1

In [None]:
# Character-level pipeline: TF-IDF over 2- to 4-character n-grams, classified
# by logistic regression trained with SGD.
char_tfidf_model = Pipeline([
    (
        'vectorizer',
        TfidfVectorizer(
            lowercase=True, ngram_range=(2, 4), analyzer='char',
            # Ignore n-grams seen in fewer than 5 documents or in more than 80% of them.
            min_df=5, max_df=0.8
        )
    ),
    (
        'clf',
        SGDClassifier(
            # Explicit integer seed; the former `False` is just the integer 0.
            random_state=0,
            # 'log_loss' is the current name of the logistic loss; the old
            # alias 'log' was deprecated in scikit-learn 1.1 and later removed.
            loss='log_loss',
            class_weight='balanced',
        )
    ),
])

In [None]:
# Fit on the raw strings, then report F1 on that same training data.
f1_score(y_train, char_tfidf_model.fit(x_train, y_train).predict(x_train))

0.9579272780803769

In [None]:
# Same pipeline refitted on the lemmatized, stop-word-free texts;
# the score is again measured on the training data.
char_tfidf_model.fit(x_train_cleared, y_train)
char_cleared_pred = char_tfidf_model.predict(x_train_cleared)
f1_score(y_train, char_cleared_pred)

0.724076456118972

#### 2


In [None]:
# Character-level pipeline: TF-IDF over 2- to 4-character n-grams, classified
# by LogisticRegression.
char2_tfidf_model = Pipeline([
    (
        'vectorizer',
        TfidfVectorizer(
            lowercase=True, ngram_range=(2, 4), analyzer='char',
            # Ignore n-grams seen in fewer than 5 documents or in more than 80% of them.
            min_df=5, max_df=0.8
        )
    ),
    (
        'clf',
        LogisticRegression(
            # Explicit integer seed; the former `False` is just the integer 0.
            random_state=0,
            class_weight='balanced',
            # The default iteration budget was exhausted before convergence
            # ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give the solver
            # more room, as the warning itself recommends.
            max_iter=1000,
        )
    ),
])

In [None]:
# Fit on the lemmatized, stop-word-free texts and report the training-set F1.
char2_tfidf_model.fit(x_train_cleared, y_train)
char2_cleared_pred = char2_tfidf_model.predict(x_train_cleared)
f1_score(y_train, char2_cleared_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7696011968768992

#### Лучший результат

In [None]:
# Refit on the raw url+title strings; this fit is the one later used for the
# submission. The reported F1 is measured on the training data.
f1_score(y_train, char2_tfidf_model.fit(x_train, y_train).predict(x_train))

0.9911878535278357

In [None]:
test_df["target"] = char2_tfidf_model.predict(x_test).astype(bool)

test_df[["id", "target"]].to_csv("ml_hw_3.csv", index=False)

!cat ml_hw_3.csv | head

NameError: ignored

### Вывод
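Лучше всего себя показала связка TfidfVectorizer (символьные n-граммы) и LogisticRegression; все значения F1 здесь получены на обучающей выборке.
Лемматизация и удаление стоп-слов ухудшали результаты, откуда следует, что на коротких текстах (строка до 10 слов) их применение нежелательно. Стоит учитывать, что при такой обработке из текста также пропадают URL, цифры и латиница (TOKEN_PATTERN оставляет только кириллицу), что тоже могло снизить качество.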