# HSE 2021: Mathematical Methods for Data Analysis

## Homework 3

**Warning 1**: some problems (especially the lemmatization part) require a significant amount of time, so **it is better to start early (!)**

**Warning 2**: it is critical to describe and explain what you are doing and why, use markdown cells

In [None]:
from typing import Tuple, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

sns.set(style="darkgrid")

## PART 1: Logit model

We consider a binary classification problem. For prediction, we would like to use a logistic regression model. For regularization we add a combination of the $l_2$ and $l_1$ penalties (Elastic Net). 

Each object in the training dataset is indexed with $i$ and described by pair: features $x_i\in\mathbb{R}^{K}$ and binary labels $y_i$. The model parametrized with bias $w_0\in\mathbb{R}$ and weights $w\in\mathbb{R}^K$.

The optimization problem with respect to the $w_0, w$ is the following (Elastic Net Loss):

$$L(w, w_0) = \frac{1}{N} \sum_{i=1}^N \ln(1+\exp(-y_i(w^\top x_i+w_0))) + \gamma \|w\|_1 + \beta \|w\|_2^2$$.

#### 1. [0.5 points]  Find the gradient of the Elastic Net loss and write its formulas (better in latex format) 

$$A_i = \exp(-y_i(w^\top x_i+w_0))$$

$$\nabla_wL(w, w_0) = -\frac{1}{N} \sum_{i=1}^N y_ix_i\frac{A_i}{A_i+1} + \gamma \operatorname{sign}(w) + 2\beta w$$

$$\nabla_{w_0}L(w, w_0) = -\frac{1}{N} \sum_{i=1}^N y_i\frac{A_i}{A_i+1}$$

#### 2. [0.25 points] Implement the Elastic Net loss (as a function)

In [None]:
def loss(X, y, w: List[float], w0: float, gamma=1., beta=1.) -> float:
    """Return the Elastic Net logistic loss.

    Computes ``mean(log(1 + exp(-y_i * (w @ x_i + w0)))) + gamma * ||w||_1
    + beta * ||w||_2 ** 2``, i.e. the formula stated above the task.

    Args:
        X: feature matrix with one object per row.
        y: labels, one per row of ``X``.
        w: weight vector.
        w0: bias.
        gamma: coefficient of the L1 penalty.
        beta: coefficient of the squared L2 penalty.

    Returns:
        The scalar loss value.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)

    margins = y * (X @ w + w0)
    # log(1 + exp(-m)) evaluated without overflowing exp() for very negative m.
    data_term = np.mean(np.logaddexp(0.0, -margins))
    l1_penalty = gamma * np.sum(np.abs(w))
    # The penalty is the *squared* Euclidean norm, matching the 2 * beta * w
    # term used by the gradient.
    l2_penalty = beta * np.dot(w, w)
    return data_term + l1_penalty + l2_penalty

#### 3. [0.25 points] Implement the gradient (as a function)

In [None]:
def get_grad(X, y, w: List[float], w0: float, gamma=1., beta=1.) -> Tuple[List[float], float]:
    """Return the gradient of the Elastic Net logistic loss.

    Args:
        X: feature matrix with one object per row.
        y: labels, one per row of ``X``.
        w: weight vector.
        w0: bias.
        gamma: coefficient of the L1 penalty (contributes ``gamma * sign(w)``).
        beta: coefficient of the squared L2 penalty (contributes ``2 * beta * w``).

    Returns:
        ``(grad_w, grad_w0)``: the gradient with respect to the weights and
        with respect to the bias.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    w = np.asarray(w, dtype=float)

    margins = y * (X @ w + w0)
    # A / (A + 1) with A = exp(-m) equals 1 / (1 + exp(m)).  Going through
    # logaddexp avoids the inf / inf = NaN that the direct form produces
    # when exp(-m) overflows.
    per_object = y * np.exp(-np.logaddexp(0.0, margins))

    n_objects = len(X)
    grad_w = -(per_object @ X) / n_objects + gamma * np.sign(w) + 2 * beta * w
    grad_w0 = -np.sum(per_object) / n_objects
    return grad_w, grad_w0

#### Check yourself

In [None]:
# Self-check for get_grad on a fixed random problem.  The expected numbers
# below only match if the random draws happen in exactly this order after
# seeding, so do not reorder these statements.
np.random.seed(42)
X = np.random.multivariate_normal(np.arange(5), np.eye(5), size=10)
# Labels are drawn from {0, 1} here.
y = np.random.binomial(1, 0.42, size=10)
w, w0 = np.random.normal(size=5), np.random.normal()

# Uses the default gamma = beta = 1 of get_grad.
grad_w, grad_w0 = get_grad(X, y, w, w0)
assert(np.allclose(grad_w,
                   [-2.73262076, -1.87176281, 1.30051144, 2.53598941, -2.71198109],
                   rtol=1e-2) & \
       np.allclose(grad_w0,
                   -0.2078231418067844, 
                   rtol=1e-2)
)

####  4. [1 point]  Implement gradient descent which works for both tol level and max_iter stop criteria and plot the decision boundary of the result

The template provides basic sklearn API class. You are free to modify it in any convenient way.

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

In [None]:
from sklearn.metrics import roc_curve


class Logit(BaseEstimator, ClassifierMixin):
    """Binary logistic regression with an Elastic Net penalty.

    The model is trained by full-batch gradient descent on ``loss`` using
    ``get_grad``.  Training stops after ``max_iter`` steps or as soon as the
    Euclidean norm of the full gradient (weights and bias) drops below
    ``tolerance``, whichever happens first.

    Labels may be encoded as {-1, 1} or {0, 1}; ``predict`` answers in the
    encoding that was seen by ``fit``.

    Parameters:
        beta: coefficient of the squared L2 penalty.
        gamma: coefficient of the L1 penalty.
        learning_rate: gradient descent step size.
        tolerance: gradient-norm threshold of the stop criterion.
        max_iter: maximum number of gradient descent steps.
        random_state: kept for API compatibility; not used by the code.
        threshold: decision threshold on the positive-class probability.
            ``fit`` overwrites it with the ROC point maximising TPR - FPR.
    """

    def __init__(self, beta=1.0, gamma=1.0, learning_rate=1e-3, tolerance=0.01, max_iter=1000, random_state=42, threshold=0.5):
        self.beta = beta
        self.gamma = gamma
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.threshold = threshold

        # True when the training labels contained zeros (0/1 encoding).
        self.flag = False
        # Loss value after every gradient descent step of the last fit.
        self.loss_history = []

        # Names follow roc_curve from sklearn.metrics
        # https://habr.com/ru/company/netologyru/blog/582756/
        self.fpr, self.tpr = 0, 0

    def fit(self, X, y):
        """Fit weights and bias by gradient descent on the Elastic Net loss.

        Args:
            X: training feature matrix with one object per row.
            y: training labels in {-1, 1} or {0, 1}; the array is not modified.

        Returns:
            The fitted estimator (``self``).
        """
        X = np.asarray(X, dtype=float)
        # Work on a copy so the caller's label array is never modified.
        y = np.array(y, dtype=float)

        # The loss expects labels in {-1, 1}; remember whether the caller
        # used {0, 1} so that predict can answer in the same encoding.
        self.flag = bool(np.count_nonzero(y) < len(y))
        if self.flag:
            y[y == 0] = -1
        self.classes_ = np.array([0, 1]) if self.flag else np.array([-1, 1])

        self.w0 = 1
        self.w = np.ones(X.shape[1])
        # Start a fresh history so that refitting does not append to old runs.
        self.loss_history = []

        for _ in range(self.max_iter):
            grad_w, grad_w0 = get_grad(X, y, self.w, self.w0, self.gamma, self.beta)
            self.w = self.w - grad_w * self.learning_rate
            self.w0 = self.w0 - grad_w0 * self.learning_rate
            self.loss_history.append(loss(X, y, self.w, self.w0, self.gamma, self.beta))

            # Tolerance stop criterion: the gradient has (almost) vanished.
            if np.linalg.norm(np.append(grad_w, grad_w0)) < self.tolerance:
                break

        # Only the final weights determine the operating point, so the ROC
        # curve is computed once after training instead of on every step.
        self.fpr, self.tpr, thresholds = roc_curve(y, self.predict_proba(X)[1, :])
        self.threshold = thresholds[np.argmax(self.tpr - self.fpr)]

        return self

    def predict(self, X):
        """Return the predicted label for every object of ``X``.

        An object is labelled 1 unless its positive-class probability is below
        ``self.threshold``; in that case it gets 0 if the training labels
        contained zeros and -1 otherwise.
        """
        positive_proba = self.predict_proba(X)[1, :]
        negative_label = 0.0 if self.flag else -1.0
        return np.where(self.threshold > positive_proba, negative_label, 1.0)

    def predict_proba(self, X):
        """Return class probabilities as an array of shape (2, n_objects).

        Row 0 holds the probability of the negative class and row 1 the
        probability of the positive class.  Note that this is the transpose
        of the (n_objects, 2) layout used by scikit-learn estimators.
        """
        scores = np.dot(X, self.w) + self.w0
        # 1 / (1 + exp(z)) == exp(-log(1 + exp(z))); logaddexp keeps this
        # free of overflow for large |z|.
        return np.array([np.exp(-np.logaddexp(0.0, scores)),
                         np.exp(-np.logaddexp(0.0, -scores))])

In [None]:
# sample data to test your model
from sklearn.datasets import make_classification
# Two informative features, one cluster per class, fixed seed.
X, y = make_classification(
    n_samples=180,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

In [None]:
# a function to plot the decision boundary
def plot_decision_boundary(model, X, y):
    """Draw the regions predicted by ``model`` together with the data points.

    ``X`` must have exactly two columns.  Predictions are evaluated on a
    200 x 200 grid spanning the range of ``X`` and shown as filled contours;
    the objects of ``X`` are scattered on top, coloured by ``y``.
    """
    plt.figure()
    x1_low, x2_low = X.min(axis=0)
    x1_high, x2_high = X.max(axis=0)

    x1_axis = np.linspace(x1_low, x1_high, 200)
    x2_axis = np.linspace(x2_low, x2_high, 200)
    grid_x1, grid_x2 = np.meshgrid(x1_axis, x2_axis)

    grid_points = np.column_stack([grid_x1.ravel(), grid_x2.ravel()])
    grid_labels = model.predict(grid_points).reshape(grid_x1.shape)

    plt.contourf(grid_x1, grid_x2, grid_labels, alpha=.4)
    plt.scatter(X[:, 0], X[:, 1], c=y)

In [None]:
# Switch the labels to the {-1, 1} encoding, then fit a model without
# regularisation (beta = gamma = 0) and show its decision boundary.
y[y == 0] = -1
model = Logit(beta=0, gamma=0)
model.fit(X, y)
plot_decision_boundary(model, X, y)

#### 5. [0.25 points] Plot loss diagram for the model, i.e. show the dependence of the loss function from the gradient descent steps

In [None]:
# Loss after every gradient descent step of the model fitted above.
loss_axes = plt.gca()
loss_axes.plot(model.loss_history)
loss_axes.set_xlabel("Итерации")
loss_axes.set_ylabel("Потери")
plt.show()

In [None]:
# With a larger learning rate the loss ends up much lower.
y[y == 0] = -1
my_model = Logit(beta=0, gamma=0, learning_rate=0.1)
my_model.fit(X, y)

loss_axes = plt.gca()
loss_axes.plot(my_model.loss_history)
loss_axes.set_xlabel("Итерации")
loss_axes.set_ylabel("Потери")
plt.show()

## PART 2: Support Vector Machines

#### 6. [2 point] Using the same dataset, train SVM Classifier from Sklearn.
Investigate how different parameters influence the quality of the solution:
+ Try several kernels: Linear, Polynomial, RBF (and others if you wish). Some kernels have hyperparameters: don't forget to try different values.
+ Regularization coefficient 

Show how these parameters affect accuracy, roc_auc and f1 score. 
Make plots for the dependencies between metrics and parameters. 
Try to formulate conclusions from the observations. How sensitive are kernels to hyperparameters? How sensitive is a solution to the regularization? Which kernel is prone to overfitting?

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score


def get_score(kernel, regularization, score, degree = None, gamma = None, use_x_train = False):
    """Fit an SVC on the global train split and return one quality metric.

    Args:
        kernel: SVC kernel name.
        regularization: value of the SVC ``C`` parameter.
        score: metric to compute, one of "accuracy", "roc" or "f1".
        degree: optional polynomial degree passed to SVC.
        gamma: optional kernel coefficient passed to SVC.
        use_x_train: evaluate on the train split instead of the test split.

    Returns:
        The requested metric as a float.

    Raises:
        ValueError: if ``score`` is not a supported metric name.
    """
    label_metrics = {"accuracy": accuracy_score, "f1": f1_score}
    if score != "roc" and score not in label_metrics:
        raise ValueError(f"Unknown score {score!r}; expected 'accuracy', 'roc' or 'f1'")

    # Forward every hyperparameter that was given; previously gamma was
    # silently dropped whenever degree was given as well.
    params = {"kernel": kernel, "C": regularization}
    if degree is not None:
        params["degree"] = degree
    if gamma is not None:
        params["gamma"] = gamma

    model = SVC(**params)
    model.fit(X_train, y_train)

    if use_x_train:
        features, current_y = X_train, y_train
    else:
        features, current_y = X_test, y_test

    if score == "roc":
        # ROC AUC ranks objects by a continuous score; hard labels would
        # collapse the curve to a single point.
        return roc_auc_score(current_y, model.decision_function(features))
    return label_metrics[score](current_y, model.predict(features))


def axis_settings(axis, title, xlabel, ylabel, legend, xscale):
    """Apply optional decorations to a matplotlib axis.

    Every argument after ``axis`` is applied only when it is truthy.

    Args:
        axis: the axis object to decorate.
        title: axis title.
        xlabel: label of the x axis.
        ylabel: label of the y axis.
        legend: whether to draw the legend.
        xscale: scale name for the x axis (for example "log").
    """
    if title:
        axis.set_title(title)
    if xlabel:
        axis.set_xlabel(xlabel)
    if ylabel:
        # Use the argument; the old code read the unrelated global ``score``.
        axis.set_ylabel(ylabel)
    if legend:
        axis.legend()
    if xscale:
        axis.set_xscale(xscale)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
figsize = (24, 6)

# Overview plots with default kernel hyperparameters: one panel per metric,
# one line per kernel, regularization strength C on a log-scaled x axis.
scores = ["accuracy", "roc", "f1"]
kernels = ["linear", "poly", "rbf"]
regularizations = [0.01, 0.1, 1, 10, 100]
figures, axis = plt.subplots(1, 3, figsize = figsize)
for index, score in enumerate(scores):
    for kernel in kernels:
        current = [get_score(kernel, regularization, score) for regularization in regularizations]
        axis[index].plot(regularizations, current, label = kernel)
    # Each panel shows one metric for all kernels, so it is titled by the
    # metric; the old title used the leaked loop variable and always read
    # "Kernel = rbf".
    axis_settings(axis[index], f"Score = {score}", "regularization", score, True, "log")
plt.show()

# Left panel: metrics for the poly kernel as a function of the degree.
# Right panel: metrics for the rbf kernel as a function of gamma.
figures, axis = plt.subplots(1, 2, figsize = figsize)
degrees = [0, 1, 2, 3, 4, 5]
gammas = [0.0001, 0.001, 0.01, 0.1, 1]
colors = ["red", "yellow", "purple"]

for index, score in enumerate(scores):
    by_degree = [get_score("poly", 1, score, degree = degree) for degree in degrees]
    axis[0].plot(degrees, by_degree, label = score, color = colors[index])

for index, score in enumerate(scores):
    by_gamma = [get_score("rbf", 1, score, gamma = gamma) for gamma in gammas]
    axis[1].plot(gammas, by_gamma, label = score, color = colors[index])

axis_settings(axis[0], "Kernel = poly", "degree", "scores", True, None)
axis_settings(axis[1], "Kernel = rbf", "gamma", "scores", True, "log")
plt.show()

# Train versus test accuracy per kernel, to see which kernel is prone to
# overfitting.
score = "accuracy"
kernels = ["linear", "poly", "rbf"]
regularizations = [0.01, 0.1, 1, 10, 100]
figures, axis = plt.subplots(1, 3, figsize = figsize)
for index, kernel in enumerate(kernels):
    panel = axis[index]
    test_curve = [get_score(kernel, regularization, score) for regularization in regularizations]
    train_curve = [get_score(kernel, regularization, score, use_x_train = True)
                   for regularization in regularizations]
    panel.plot(regularizations, test_curve, label = kernel + " test")
    panel.plot(regularizations, train_curve, label = kernel + " train", linestyle = 'dashed')
    axis_settings(panel, f"Kernel = {kernel}", "regularization", score, True, "log")
plt.show()

1) Рассмотрим графики различных score для разных ядер в зависимости от 
regularization параметра.

Scores: accuracy, roc и f1. <br>
Ядра: linear, poly, rbf. <br>
Regularization: от 10^(-2) до 10^(2).

По оси x: регуляризация. <br>
По оси y: один из scores.

В целом, все 3 графика похожи. При 10^(-2) rbf показывает сильно хуже результаты, чем два остальных ядра. При 10^(-1) все три ядра начинают показывать схожие результаты. После 10^(0) все ядра достигают максимума (или практически достигают) и в дальнейшем резких изменений нет. Максимум по всем параметрам от 0,97 до 1.

---

2) Рассмотрим графики accuracy в зависимости от различных гиперпараметров для разных ядер.

Regularization: 1. <br>
Degree (poly): от 0 до 5. <br>
Gamma (rbf): от 10^(-4) до 10^(0).

По оси x: гиперпараметры. <br>
По оси y: scores.

Сначала рассмотрим график poly ядра. При degree = 0 результаты низкие. При degree = 1 метрики резко возрастают и практически достигают максимума. При degree от 1 до 4 результаты практически не меняются. При degree = 5 и далее метрики снижаются.

Теперь рассмотрим график rbf ядра. При gamma = 10^(-4) результаты низкие. Далее до gamma = 10^(-1) метрики постепенно возрастают и достигают максимума. Далее график движется практически горизонтально.

---

3) Рассмотрим графики, показывающие точность предсказаний ядер на test и на train данных.

Ядра: linear, poly, rbf. <br>
Data: train, test.

По оси x: regularization. <br>
По оси y: accuracy.

Нас интересуют ситуации, когда модель на test данных показывает результаты хуже, чем на train данных. В данном случае явно видно, что переобучена модель c linear ядром. Модели с ядрами poly и rbf показывают примерно равный accuracy для train и test данных.

Наиболее склонное к переобучению - линейное ядро.

## PART 3: Natural Language Processing

#### 7. [1.75 point] Form the dataset

We are going to form a dataset that we will use in the following tasks for binary and multiclass classification

0. Choose **six** authors that you like (specify who you've chosen) and download the <a href="https://www.kaggle.com/d0rj3228/russian-literature?select=prose">relevant data</a> from **prose** section
1. Build your own dataset for these authors: 
    * divide each text into sentences such that we will have two columns: *sentence* and *target author*, each row will contain one sentence and one target
    * drop sentences where N symbols in a sentence < 15
    * fix random state and randomly choose sentences in the following proportion "5k : 15k : 8k : 11k : 20k : 3k" for the authors respectively
    
    sample data may look like:
    
    <center> 
    <table>
        <tr>
            <th> sentence </th>
            <th> author </th>
        </tr> 
        <tr><td> Несколько лет тому назад в одном из своих поместий жил старинный русской барин, Кирила Петрович Троекуров. </td><td> Пушкин </td></tr>
        <tr><td> Уже более недели приезжий господин жил в городе, разъезжая по вечеринкам и обедам и таким образом проводя, как говорится, очень приятно время. </td><td> Гоголь </td></tr>
        <tr><td> ... </td><td> ... </td></tr>
        <tr><td> Я жил недорослем, гоняя голубей и играя в чехарду с дворовыми мальчишками. </td><td> Пушкин </td></tr>
    </table>
</center>
     
2. Preprocess (tokenize and clean) the dataset 
    * tokenize, remove all stop words (nltk.corpus.stopwords), punctuation (string.punctuation) and numbers
    * convert to lower case and apply either stemming or lemmatization of the words (on your choice)
    * vectorize words using both **bag of words** and **tf-idf** (use sklearn)
    * observe and describe the difference between vectorized output (what do numbers look like after transformations and what do they represent?)

In [None]:
!pip install pymorphy2

import os
import re
import random
import pymorphy2
import nltk
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def get_text(author):
    """Return the whole prose of ``author`` as one whitespace-normalised string.

    Reads every ``.txt`` file in ``prose/<author>`` (relative to the current
    working directory), joins the contents with spaces and collapses every
    run of whitespace, including newlines, into a single space.
    """
    folder = f"prose/{author}"
    # os.listdir returns names in arbitrary order; sorting keeps the joined
    # text, and therefore the seeded sentence sampling, reproducible.
    filenames = sorted(name for name in os.listdir(folder) if name.endswith(".txt"))

    texts = []
    for filename in filenames:
        # The context manager closes the file even if reading fails.
        with open(f"{folder}/{filename}", mode = "r", encoding="utf-8") as file:
            texts.append(file.read())
    return re.sub(r'\s+', ' ', ' '.join(texts))


def create_dataframe(authors):
    """Flatten the per-author sentence lists into a two-column DataFrame.

    ``authors`` maps an author name to a dict whose "sentences" entry is a
    list of strings.  The result has one row per sentence with the columns
    "sentence" and "author", in the iteration order of ``authors``.
    """
    pairs = [
        (sentence, name)
        for name, info in authors.items()
        for sentence in info["sentences"]
    ]
    sentence_column = [sentence for sentence, _ in pairs]
    author_column = [name for _, name in pairs]
    return pd.DataFrame({'sentence' : sentence_column, 'author' : author_column})


def get_lemmas(sentence, stopwords):
    """Return the space-joined lemmas of the meaningful tokens of ``sentence``.

    The sentence is split with the global ``punct_tokenizer``.  Tokens that
    consist only of punctuation, tokens that start with a digit and tokens
    listed in ``stopwords`` are dropped; the remaining ones are replaced by
    the normal form of their first parse from the global ``morph_analyzer``.

    Args:
        sentence: text to process (the caller lower-cases it beforehand).
        stopwords: iterable of words to drop.

    Returns:
        The lemmas joined by single spaces.
    """
    # https://www.geeksforgeeks.org/string-punctuation-in-python/
    # ASCII punctuation plus the typographic marks that occur in the corpus.
    punctuation = frozenset('"' + "!«»?,)…—-#$%&'()*+ ./:;<=>@[\\]^_`{|}~–")
    digits = frozenset("0123456789")
    # Set membership instead of scanning the stop-word list for every token.
    stopword_set = set(stopwords)

    lemmas = []
    for token in punct_tokenizer.tokenize(sentence):
        # Test per character: the old substring test against one long string
        # missed punctuation runs such as "?.." that were not spelled out.
        if all(char in punctuation for char in token) or token[0] in digits:
            continue
        if token in stopword_set:
            continue
        lemmas.append(morph_analyzer.parse(token)[0].normal_form)
    return " ".join(lemmas)


def vectorize(data, vectorizer_class, column_index, column_name, max_features = 10):
    """Vectorize ``data["lemmas"]`` and store one vector per row in a new column.

    Args:
        data: DataFrame with a "lemmas" column; it is modified in place.
        vectorizer_class: vectorizer class, called as
            ``vectorizer_class(max_features=...)``.
        column_index: position at which the new column is inserted.
        column_name: name of the new column.
        max_features: vocabulary size passed to the vectorizer.

    Prints the vocabulary kept by the vectorizer.
    """
    vectorizer = vectorizer_class(max_features = max_features)
    dense = vectorizer.fit_transform(data["lemmas"]).toarray()

    # One 1-D vector per row of the frame.
    data.insert(column_index, column_name, list(dense))

    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is its replacement.  tolist() keeps the printed
    # value a plain list of strings.
    feature_names = vectorizer.get_feature_names_out().tolist()
    print(f'Самые популярные токены {column_name}: {feature_names}')


# Chosen authors with the number of sentences to sample for each of them
# (the required 5k : 15k : 8k : 11k : 20k : 3k proportion).
authors = {"Dostoevsky": {"len": 5000, "sentences": []},
           "Gogol": {"len": 15000, "sentences": []},
           "Gorky": {"len": 8000, "sentences": []},
           "Turgenev": {"len": 11000, "sentences": []},
           "Tolstoy": {"len": 20000, "sentences": []},
           "Pushkin": {"len": 3000, "sentences": []}}

# Sentence terminators.  Inside a character class "|" is a literal pipe, not
# an alternation, so it must not be listed.  https://qna.habr.com/q/393616
expression = r"[.!?…]"

# Fix the random state once; the authors are then processed in dict order.
random.seed(83267)
for author in authors:
    # divide each text into sentences
    sentences = re.split(expression, get_text(author))
    # drop sentences where N symbols in a sentence < 15
    sentences = [x for x in sentences if len(x.strip()) >= 15]
    # randomly choose the requested number of sentences for this author;
    # random.sample raises ValueError if fewer sentences are available
    authors[author]["sentences"] = random.sample(sentences, authors[author]["len"])
 

# tokenize, remove all stop words (nltk.corpus.stopwords), punctuation (string.punctuation) and numbers
# convert to lower case and apply either stemming or lemmatization of the words (on your choice)
nltk.download('stopwords')
punct_tokenizer = nltk.WordPunctTokenizer()
morph_analyzer = pymorphy2.MorphAnalyzer()
stopwords = nltk.corpus.stopwords.words('russian')

# Build the sentence/author frame once (it used to be created twice) and
# add the lemmatized text as the third column.
data = create_dataframe(authors)
lemmas = [get_lemmas(sentence.lower(), stopwords) for sentence in data["sentence"]]
data.insert(2, 'lemmas', lemmas)

# vectorize words using both bag of words and tf-idf (use sklearn)
vectorize(data, CountVectorizer, 3, "CountVectorizer", max_features = 10)
vectorize(data, TfidfVectorizer, 4, "TfidfVectorizer", max_features = 10)
data.head(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Самые популярные токены CountVectorizer: ['весь', 'говорить', 'знать', 'который', 'один', 'рука', 'свой', 'сказать', 'человек', 'это']
Самые популярные токены TfidfVectorizer: ['весь', 'говорить', 'знать', 'который', 'один', 'рука', 'свой', 'сказать', 'человек', 'это']


Unnamed: 0,sentence,author,lemmas,CountVectorizer,TfidfVectorizer
0,"Тушар в ответ ей пожал плечами, что, конечно,...",Dostoevsky,тушар ответ пожать плечо означать недаром деск...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"Вот почему, и только поэтому, я обращал на не...",Dostoevsky,почему поэтому обращать внимание следить она э...,"[0, 0, 2, 0, 0, 0, 0, 0, 0, 1]","[0.0, 0.0, 0.9243488761648889, 0.0, 0.0, 0.0, ..."
2,"тогда как – все призрак, и мираж, и ложь, и с...",Dostoevsky,призрак мираж ложь стыд неестественность мера ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"Наполеон вздрогнул, подумал и сказал мне: «Ты...",Dostoevsky,наполеон вздрогнуть подумать сказать напомнить...,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0]","[0.0, 0.0, 0.0, 0.7191985373641532, 0.0, 0.0, ..."
4,Смятение души его вдруг прошло,Dostoevsky,смятение душа пройти,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,"Кто ж я на земле, как не приживальщик",Dostoevsky,земля приживальщик,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,") и проявляться не может иначе как сурово, гор...",Dostoevsky,проявляться иначе сурово горячо круто часто по...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,"Ну что, если б каждый из них вдруг узнал весь...",Dostoevsky,б каждый узнать весь секрет,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,"В заключение скажу, что вами означенный долг ...",Dostoevsky,заключение сказать вы означенный долг рубль се...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0.77854435946222, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
9,— с достоинством и благородством заключил наш...,Dostoevsky,достоинство благородство заключить наш герой,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


1) Самые популярные токены для двух vectorizer вычислились одинаково (я сначала подумал, что у меня ошибка в коде). Ими оказались слова ['говорить', 'знать', 'который', 'мочь', 'один', 'рука', 'свой', 'сказать', 'человек', 'это']. Выглядит логично.

2) Что означают числа для CountVectorizer? Объясню на примере: <br>
Леммы = "посетить мой душа час говорить твёрдый вера..." <br>
Числа = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0] <br>
Популярные токены = ['говорить', 'знать', 'который' ...]

В массиве чисел на первом же месте стоит единица. Это означает, что токен, стоящий на первом месте ("говорить"), встречается в предложении один раз.
Если бы вместо единицы в массиве на первом месте стояло число 3 - это означало бы, что токен "говорить" встречается три раза. Думаю, логика понятна :)

3) Что означают числа для TfidfVectorizer? <br>
TF IDF VECTORIZER -> **term frequency / inverse document frequency** VECTORIZER <br>
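После расшифровки названия становится понятно, что речь идет о частоте встречаемого слова в тексте. То есть чем чаще токен встречается в предложении, тем больше его вес; при этом вес дополнительно умножается на IDF (чем реже токен встречается в остальных предложениях, тем он весомее), а вектор каждого предложения нормируется (по умолчанию по L2-норме), поэтому значения лежат в отрезке от 0 до 1. Расположение числа в массиве означает то же самое, что и у CountVectorizer.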

Пользовался вот этим для TfidfVectorizer <br>
https://ru.wikipedia.org/wiki/TF-IDF

###  Binary classification

#### 8. [2 point] Train model using Logistic Regression (your own) and SVC (SVM can be taken from sklearn) 

* choose *two* authors from the dataset that you have formed in the previous task
* check the balance of the classes
* divide the data into train and test samples with 0.7 split rate (don't forget to fix the random state)
* using GridSearchCV - find the best parameters for the models (by F1 score) and use it in the next tasks
* make several plots to address the dependence between F1 score and parameters
* plot confusion matrix for train and test samples
* compute some relevant metrics for test sample (useful to check the seminars 5 and 6, use sklearn) 
* make conclusions about the performance of your models


In [None]:
from sklearn.model_selection import GridSearchCV


# choose two authors from the dataset that you have formed in the previous task
limit = 1500
data1 = data[(data["author"] == "Gogol")][:limit]
data2 = data[(data["author"] == "Gorky")][:limit]

# check the balance of the classes: cut both classes to the same size
# (matters if limit is removed)
balanced_size = min(len(data1), len(data2))
data1 = data1[:balanced_size]
data2 = data2[:balanced_size]
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
data_local = pd.concat([data1, data2])

# divide the data into train and test samples with 0.7 split rate (don't forget to fix the random state)
x = np.vstack(data_local["TfidfVectorizer"])
# Gorky is the negative class (-1), Gogol the positive class (1).
y = np.where(data_local["author"] == "Gorky", -1, 1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 83267)

logit = Logit(0, 0, learning_rate = 0.1)
logit_fit = logit.fit(X_train, y_train)
logit_prediction_1 = logit_fit.predict(X_train)
logit_prediction_2 = logit_fit.predict(X_test)

svc = SVC(kernel = "linear", C = 1)
svc_fit = svc.fit(X_train, y_train)
svc_prediction_1 = svc_fit.predict(X_train)
svc_prediction_2 = svc_fit.predict(X_test)

# using GridSearchCV - find the best parameters for the models (by F1 score) and use it in the next tasks
# https://stats.stackexchange.com/questions/437072/use-f1-score-in-gridsearchcv
grid_logit = {"learning_rate": [0.001, 0.01, 0.1], "tolerance": [1e-9, 1e-8, 1e-7]}
searcher_logit = GridSearchCV(Logit(), param_grid = grid_logit, scoring = "f1", cv = 5)
searcher_logit.fit(X_train, y_train)

grid_logit_2 = {"beta": [0.1, 1, 2], "gamma": [0.1, 1, 2]}
searcher_logit_2 = GridSearchCV(Logit(), param_grid = grid_logit_2, scoring = "f1", cv = 5)
searcher_logit_2.fit(X_train, y_train)

grid_svc = {"kernel": ["linear", "poly", "rbf"], "C": [0.01, 0.1, 1]}
searcher_svc = GridSearchCV(SVC(), param_grid = grid_svc, scoring = "f1", cv = 5)
searcher_svc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.01, 0.1, 1],
                         'kernel': ['linear', 'poly', 'rbf']},
             scoring='f1')

In [None]:
def graphic_3d(x_array, y_array, z_array, x_best, y_best, title, x_label, y_label,
               x_ticks = None, x_ticks_labels = None,
               y_ticks = None, y_ticks_labels = None):
    """Plot scores over a two-parameter grid as a 3-D surface.

    Args:
        x_array: values of the first parameter.
        y_array: values of the second parameter.
        z_array: ``len(x_array) * len(y_array)`` scores, ordered so that the
            second parameter varies fastest.
        x_best, y_best: coordinates of the point to highlight; it is drawn at
            the height of the maximum score.
        title: plot title.
        x_label, y_label: axis labels.
        x_ticks, x_ticks_labels, y_ticks, y_ticks_labels: optional tick
            positions and tick labels.
    """
    # x[i][j] = x_array[i] and y[i][j] = y_array[j] for a grid of any size
    # (the grid used to be hard-coded to exactly three x values).
    x, y = np.meshgrid(x_array, y_array, indexing = "ij")
    z = np.reshape(z_array, x.shape)

    plt.figure(figsize = (16, 8))
    ax = plt.axes(projection = "3d")
    ax.scatter3D(x, y, z, alpha = 0.2)
    ax.plot_surface(x, y, z, alpha = 0.2)
    ax.scatter3D(x_best, y_best, np.max(z), color = "purple")

    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    if x_ticks is not None:
        ax.set_xticks(x_ticks)
    if x_ticks_labels is not None:
        ax.set_xticklabels(x_ticks_labels)
    if y_ticks is not None:
        ax.set_yticks(y_ticks)
    if y_ticks_labels is not None:
        ax.set_yticklabels(y_ticks_labels)

    plt.show()


# Best hyperparameters found by the three grid searches.
best_params = {"learning_rate": searcher_logit.best_params_["learning_rate"],
               "tolerance": searcher_logit.best_params_["tolerance"],
               "beta": searcher_logit_2.best_params_["beta"],
               "gamma": searcher_logit_2.best_params_["gamma"],
               "kernel": searcher_svc.best_params_["kernel"],
               "C": searcher_svc.best_params_["C"]}
print(f"Best params = {best_params}")

# make several plots to address the dependence between F1 score and parameters
graphic_3d(grid_logit["learning_rate"], grid_logit["tolerance"], searcher_logit.cv_results_["mean_test_score"],
           best_params["learning_rate"], best_params["tolerance"],
           title = "Logit", x_label = "Learning rate", y_label = "Tolerance")

graphic_3d(grid_logit_2["beta"], grid_logit_2["gamma"], searcher_logit_2.cv_results_["mean_test_score"],
           best_params["beta"], best_params["gamma"],
           title = "Logit", x_label = "Beta", y_label = "Gamma",
           x_ticks = grid_logit_2["beta"], y_ticks = grid_logit_2["gamma"])

# Kernels are categorical, so they are plotted at integer positions.  The
# highlighted point uses the position of the kernel that actually won the
# search instead of a hard-coded index.
kernel_names = grid_svc["kernel"]
kernel_positions = list(range(len(kernel_names)))
graphic_3d(grid_svc["C"], kernel_positions, searcher_svc.cv_results_["mean_test_score"],
           best_params["C"], kernel_names.index(best_params["kernel"]),
           title = "SVC", x_label = "C", y_label = "Kernel",
           y_ticks = kernel_positions, y_ticks_labels = kernel_names)

1) Logit (Learning rate, tolerance) <br>
Лучшие значения: 'learning_rate': 0.1, 'tolerance': 1e-09 <br>
Сразу заметим, что tolerance вообще не влияет на f1 score. При learning rate = 0.001 f1 score резко падает, при значениях 0.01 и 0.1 показывает неплохой уровень f1 score. Максимум находится на уровне learning_rate = 0.1 с любым tolerance.

---

2) Logit (Beta, Gamma) <br>
Лучшие значения: 'beta': 2, 'gamma': 1 <br>
График получился интересным. Экстремум находится в точке с гамма = 1 и бета = 2. При этом при изменении гаммы f1 score сильно падает. Но при бета, равном 0.1 или 1, и гамма, равном 0.1, f1 score показывает значения лучше среднего по остальным точкам.

---

3) SVC (Kernel, C) <br>
Лучшие значения: 'kernel': 'linear', 'C': 0.01 <br>
Сразу заметим, что poly показывает результаты хуже, чем rbf и linear. При этом linear и rbf показывают практически идентичные результаты. При уменьшении параметра C их f1 score растет. Наилучший результат достигается при linear ядре и C = 0.01.

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix


def print_metrics(train_or_test, prediction, model, dataset):
    """Print accuracy, ROC AUC and F1 of ``prediction`` against ``train_or_test``.

    ``model`` and ``dataset`` are only used for the heading line; a blank
    line is printed after the metrics.
    """
    report = (
        ("Accuracy", accuracy_score),
        ("Roc Auc", roc_auc_score),
        ("F1", f1_score),
    )
    print(f"{model} {dataset}")
    for caption, metric in report:
        print(f"{caption}: {metric(train_or_test, prediction)}")
    print()


def graphic(axis, train_or_test, prediction, model, dataset):
    """Draw the confusion matrix of ``prediction`` as an annotated heat map.

    Args:
        axis: matplotlib axis to draw on.
        train_or_test: true labels, encoded as 1 (Gogol) and -1 (Gorky).
        prediction: predicted labels in the same encoding.
        model: model name for the title.
        dataset: dataset name ("train" or "test") for the title.
    """
    names = ['Gogol','Gorky']
    # The labels were built with np.where(author == "Gorky", -1, 1).  By
    # default confusion_matrix sorts labels as [-1, 1], i.e. Gorky first,
    # which contradicts the tick names; fix the order explicitly.
    label_order = [1, -1]
    sns.heatmap(data=confusion_matrix(train_or_test, prediction, labels = label_order),
                annot = True, fmt = "d",cbar = False,
                xticklabels = names, yticklabels = names, ax = axis)
    axis.set_title(f"{model} {dataset}")


# plot confusion matrix for train and test samples:
# one figure per sample, Logit on the left and SVC on the right
samples = (("train", y_train, logit_prediction_1, svc_prediction_1),
           ("test", y_test, logit_prediction_2, svc_prediction_2))
for sample_name, truth, logit_prediction, svc_prediction in samples:
    figures, axis = plt.subplots(1, 2, figsize = (24, 6))
    graphic(axis[0], truth, logit_prediction, "Logit", sample_name)
    graphic(axis[1], truth, svc_prediction, "SVC", sample_name)
    plt.show()

# compute some relevant metrics for test sample (useful to check the seminars 5 and 6, use sklearn)
for model_name, test_prediction in (("Logit", logit_prediction_2), ("SVC", svc_prediction_2)):
    print_metrics(y_test, test_prediction, model_name, "test")

TP (TN) = true positive (negative) <br>
FP (FN) = false positive (negative) <br>

1) Сразу заметим, что результаты моей модели (logit) очень близки к результатам SVC модели - каждое число отличается буквально на несколько процентов. Что это означает? Во-первых, это успех :) Во-вторых, нет смысла рассматривать матрицы по отдельности, так как они почти идентичны, поэтому буду рассматривать только матрицы logit модели.

2) В обоих случаях TP + TN > FP + FN, то есть accuracy действительно больше 0.5 - уже неплохо. В частности TP > FN и TN > FP, то есть модель ошибается, грубо говоря, "равномерно" в обоих случаях, не перевешиваясь в одну крайность - тоже неплохо.

3) Также можно выделить достаточно низкое кол-во FP, то есть если модель выдает positive результат, то у него достаточно большая вероятность быть TP, что можно использовать.

#### 9. [1 point] Analysing ROC AUC

It is possible to control the proportion of statistical errors of different types using different thresholds for choosing a class. Plot ROC curves for Logistic Regression and SVC, show the threshold on ROC curve plots. Choose such a threshold that your models have no more than 30% of false positive errors rate. Pay attention to `thresholds` parameter in sklearn roc_curve 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, RocCurveDisplay


## https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python
def grapchic_roc(model, predict_proba = True):
    """Plot the ROC curve of ``model`` on the global test split.

    The points returned by roc_curve are scattered on top of the curve and a
    vertical line marks FPR = 0.3.

    Args:
        model: fitted classifier.
        predict_proba: score objects with ``predict_proba(...)[:, 1]`` when
            True, with ``decision_function`` otherwise.
    """
    if predict_proba:
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)

    display = RocCurveDisplay.from_estimator(model, X_test, y_test)
    display.figure_.suptitle(type(model).__name__)
    plt.scatter([fpr], [tpr])
    plt.axvline(x = 0.3, color = "purple")
    plt.show()


# ROC curve of the sklearn logistic regression, scored with predict_proba.
logit = LogisticRegression(C = 1, max_iter=1000)
logit.fit(X_train, y_train)
grapchic_roc(logit, predict_proba = True)

# ROC curve of the linear SVC, scored with decision_function.
svc = SVC(kernel = "linear", C = 1)
svc.fit(X_train, y_train)
grapchic_roc(svc, predict_proba = False)

При FPR = 0.3, TPR = 0.4 - не очень хороший результат, но как есть на этих данных.

### Multiclass logit

#### 10. [1 point] Take the One-VS-One classifier (use sklearn) and apply to Logit model (one you've made in the 4th task) in order to get multiclass linear classifier

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsOneClassifier.html">OneVsOneClassifier</a>

* use the data you got at the previous step for 6 authors
* divide the data into train and test samples with 0.7 split rate
* using GridSearchCV - find the best parameters for the models (by F1 score)
* plot confusion matrix for train and test samples
* compute all possible and relevant metrics for test sample (use sklearn)

In [None]:
from collections import Counter
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import make_scorer, f1_score

# Disabled experiment, kept as a bare string literal so the cell does not execute it:
# sample 200 rows, split them 70/30 with stratification, then grid-search
# OneVsOneClassifier(Logit()) in two passes (learning_rate/tolerance, then beta/gamma).
'''
# divide the data into train and test samples with 0.7 split rate
data_local = data.sample(200, random_state = 83267)
x = np.vstack(data_local["TfidfVectorizer"])
y = data_local["author"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 83267, stratify = y)

# using GridSearchCV - find the best parameters for the models (by F1 score)
grid_1 = {"estimator__learning_rate": [0.001, 0.01, 0.1], "estimator__tolerance": [1e-9, 1e-8, 1e-7]}
searcher_1 = GridSearchCV(OneVsOneClassifier(Logit()), param_grid = grid_1, scoring = "f1", cv = 5)
searcher_1.fit(X_train, y_train)

grid_2 = {"estimator__beta": [0.1, 1, 2], "estimator__gamma": [0.1, 1, 2]}
searcher_2 = GridSearchCV(OneVsOneClassifier(Logit()), param_grid = grid_2, scoring = "f1", cv = 5)
searcher_2.fit(X_train, y_train)
'''

# For reasons I do not understand, this refuses to work.
# NOTE(review): a likely cause is scoring="f1", which sklearn computes with binary
# averaging; a multiclass target needs e.g. "f1_macro" or "f1_weighted". Also verify
# that Logit supports get_params/set_params so GridSearchCV can clone it - TODO confirm.

'\n# divide the data into train and test samples with 0.7 split rate\ndata_local = data.sample(200, random_state = 83267)\nx = np.vstack(data_local["TfidfVectorizer"])\ny = data_local["author"]\nX_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 83267, stratify = y)\n\n# using GridSearchCV - find the best parameters for the models (by F1 score)\ngrid_1 = {"estimator__learning_rate": [0.001, 0.01, 0.1], "estimator__tolerance": [1e-9, 1e-8, 1e-7]}\nsearcher_1 = GridSearchCV(OneVsOneClassifier(Logit()), param_grid = grid_1, scoring = "f1", cv = 5)\nsearcher_1.fit(X_train, y_train)\n\ngrid_2 = {"estimator__beta": [0.1, 1, 2], "estimator__gamma": [0.1, 1, 2]}\nsearcher_2 = GridSearchCV(OneVsOneClassifier(Logit()), param_grid = grid_2, scoring = "f1", cv = 5)\nsearcher_2.fit(X_train, y_train)\n'

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

# Disabled evaluation code, kept as a bare string literal so nothing in it runs.
# It defines metric-printing and confusion-matrix helpers, reads the best parameters
# from searcher_1 / searcher_2, fits Logit, and plots train/test confusion matrices.
# NOTE(review): before re-enabling, check that (a) accuracy_score and roc_auc_score are
# in scope, (b) f1_score and roc_auc_score are given multiclass settings (`average=...`;
# ROC AUC also needs per-class scores and `multi_class=...`), and (c) the model is built
# from best_params rather than the hard-coded values - TODO confirm against the notebook.
'''
def print_metrics(train_or_test, prediction, dataset):
    print(f"{dataset}")
    print(f"Accuracy: {accuracy_score(train_or_test, prediction)}")
    print(f"Roc Auc: {roc_auc_score(train_or_test, prediction)}")
    print(f"F1: {f1_score(train_or_test, prediction)}")
    print()


# xticklabels = names, yticklabels = names, 
def graphic(axis, train_or_test, prediction, dataset):
    sns.heatmap(data=confusion_matrix(train_or_test, prediction),
                annot = True, fmt = "d",cbar = False, ax = axis)
    axis.set_title(f"{dataset}")


best_params = {"learning_rate": searcher_1.best_params_["estimator__learning_rate"],
               "tolerance": searcher_1.best_params_["estimator__tolerance"],
               "beta": searcher_2.best_params_["estimator__beta"],
               "gamma": searcher_2.best_params_["estimator__gamma"]}
print(f"Best params = {best_params}")

logit = Logit(learning_rate = 0.001, tolerance = 1e-09, beta = 0.1, gamma = 0.1)
logit_fit = logit.fit(X_train, y_train)
logit_prediction_1 = logit_fit.predict(X_train)
logit_prediction_2 = logit_fit.predict(X_test)

# plot confusion matrix for train and test samples
figures, axis = plt.subplots(1, 2, figsize = (24, 6))
graphic(axis[0], y_train, logit_prediction_1, "Train")
graphic(axis[1], y_test, logit_prediction_2, "Test")
plt.show()

# compute all possible and relevant metrics for test sample (use sklearn)
print_metrics(y_test, logit_prediction_2, "Test")
'''

'\ndef print_metrics(train_or_test, prediction, dataset):\n    print(f"{dataset}")\n    print(f"Accuracy: {accuracy_score(train_or_test, prediction)}")\n    print(f"Roc Auc: {roc_auc_score(train_or_test, prediction)}")\n    print(f"F1: {f1_score(train_or_test, prediction)}")\n    print()\n\n\n# xticklabels = names, yticklabels = names, \ndef graphic(axis, train_or_test, prediction, dataset):\n    sns.heatmap(data=confusion_matrix(train_or_test, prediction),\n                annot = True, fmt = "d",cbar = False, ax = axis)\n    axis.set_title(f"{dataset}")\n\n\nbest_params = {"learning_rate": searcher_1.best_params_["estimator__learning_rate"],\n               "tolerance": searcher_1.best_params_["estimator__tolerance"],\n               "beta": searcher_2.best_params_["estimator__beta"],\n               "gamma": searcher_2.best_params_["estimator__gamma"]}\nprint(f"Best params = {best_params}")\n\nlogit = Logit(learning_rate = 0.001, tolerance = 1e-09, beta = 0.1, gamma = 0.1)\nlogit_fi