In [None]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt

# 1. Случайные величины и гипотезы

In [None]:
from scipy.stats import norm
# Draw one value from the normal distribution with loc=0, scale=1 and print it.
print(norm.rvs(size=1, loc=0, scale=1))

In [None]:
# Compare empirical histograms of three normal distributions that differ in
# mean and/or standard deviation; a vertical segment marks each mean.
number_of_samples = 100000
plt.figure(figsize=(14, 6))
distribution_params = [(0, 1), (0, 2), (2, 1)]
for (mean, std), color in zip(distribution_params, ['C0', 'C1', 'C2']):
    # Two distributions share mean 0, so the label must include std as well.
    # alpha < 1 keeps overlapping histograms visible.
    plt.hist(norm.rvs(loc=mean, scale=std, size=number_of_samples), bins=50, density=True,
             color=color, alpha=0.5, label='mean = {}, std = {}'.format(mean, std))
    # Mean marker drawn in the same colour as its histogram (no separate legend entry).
    plt.plot([mean] * 10, np.linspace(0, 0.5, 10), color=color)
plt.legend()
plt.show()

In [None]:
# Exercise template: every `#YOUR CODE HERE` marker has to be replaced with an
# expression before this cell can run.
number_of_samples =  100000
# define the distribution parameters
mean = #YOUR CODE HERE
std = #YOUR CODE HERE

# the point that will be tested
observed_value = #YOUR CODE HERE
p_value = #YOUR CODE HERE
significance = #YOUR CODE HERE
# presumably the condition compares p_value with significance — confirm against the course material
if #YOUR CODE HERE
    print('Reject hypothesis')
else:
    print('We cannot reject or accept hypothesis')


# Draw a large sample from the normal distribution with the chosen mean/std and
# compare its histogram with the analytical density; the observed value is
# marked on the density curve.
sample = norm.rvs(loc=mean, scale=std, size=number_of_samples)

density_domain = np.linspace(sample.min(), sample.max(), 100)
# The density has to use the same loc/scale as the sample: without them
# norm.pdf evaluates the standard normal N(0, 1), which only matches the
# histogram when mean == 0 and std == 1.
density = norm.pdf(density_domain, loc=mean, scale=std)

plt.figure(figsize=(14, 6))
plt.hist(sample, bins=50, density=True, label='approximate distribution')
plt.plot(observed_value, norm.pdf(observed_value, loc=mean, scale=std), 'ro', label='observation')
plt.plot(density_domain, density, label='real distribution')
plt.legend()
plt.show()

# Корреляция

![](https://idatassist.com/wp-content/uploads/2017/04/dreamstime_m_37904189-610x461.jpg)

*Task:* Создайте 3 вектора. Первый и второй должны иметь корреляцию больше 0.5, а первый и третий — меньше -0.5.

In [None]:
# Exercise template: replace each `#YOUR CODE HERE` marker with an expression
# before running this cell.
vector_1 = #YOUR CODE HERE
vector_2 = #YOUR CODE HERE
vector_3 = #YOUR CODE HERE

# All three vectors must have the same length.
assert len(vector_1) == len(vector_2) == len(vector_3), "Векторы разной длины"

# Correlation of vector_1 with vector_2 and with vector_3 respectively; both
# values are checked against thresholds by the assertions that follow.
v1_v2_corr = #YOUR CODE HERE
v1_v3_corr = #YOUR CODE HERE

# The task requires corr(vector_1, vector_2) > 0.5 ...
assert v1_v2_corr > 0.5, "Первый и второй векторы должны иметь корреляцию больше 0.5. У вас: " + str(round(v1_v2_corr,2))
# ... and corr(vector_1, vector_3) < -0.5. The threshold is negative: comparing
# against +0.5 would accept uncorrelated or weakly positively correlated vectors.
assert v1_v3_corr < -0.5, "Первый и третий векторы должны иметь корреляцию меньше -0.5. У вас: " + str(round(v1_v3_corr,2))

The correlation coefficient captures **only linear** relationships between two variables!

In [None]:
# y is a deterministic (but non-linear) function of x; print the Pearson
# correlation coefficient between the two.
x = np.arange(start=0, stop=100, step=0.1)
y = np.sin(x)
print(np.corrcoef(x, y)[0][1])

# Линейная регрессия

$$y = \alpha + \beta_1x_1 + \beta_2x_2+...+\beta_nx_n + \varepsilon$$

$$MSE = \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

## Сгенерированные данные

In [None]:
import statsmodels.api as sm
from scipy import stats
from numpy.random import normal

# Exercise template: fit an OLS model on two synthetic regressors.
# The `# YOUR CODE` markers must be replaced before this cell can run.
number_of_samples = 250

# Two independent regressors drawn from a normal distribution (loc=0, scale=1).
X1 = normal(loc=0.0, scale=1, size=number_of_samples)
X2 = normal(loc=0.0, scale=1, size=number_of_samples)

noise = # YOUR CODE

# define any regression equation
Y = # YOUR CODE

## Adding noise for variables
# With scale=0 every draw equals loc (0.0), so X1/X2 stay unchanged here.
# Note that this happens after Y has been computed, so a non-zero scale would
# perturb only the regressors passed to the model, not Y.
X1 += normal(loc=0.0, scale=0, size=number_of_samples)
X2 += normal(loc=0.0, scale=0, size=number_of_samples)

# Design matrix: the two regressors plus the constant column added by statsmodels.
X = np.column_stack([X1, X2])
X = sm.add_constant(X)

model = sm.OLS(Y, X)
results = model.fit()

print(results.summary())


# Scatter Y against each regressor and overlay the line
# results.params[0] + slope * x, where params[0] is used as the intercept and
# params[1] / params[2] as the slopes for X1 / X2.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), sharey=True)

panels = [(ax1, X1, results.params[1], 'X1'), (ax2, X2, results.params[2], 'X2')]
for ax, regressor, slope, name in panels:
    x_axis = np.arange(min(regressor), max(regressor), 0.1)
    step = max(regressor) - min(regressor)
    ax.scatter(regressor, Y, edgecolors=(0, 0, 0))
    # Colour and line style are passed as keywords only: a 'k--' format string
    # also sets a colour (black), which conflicts with an explicit red colour
    # and makes matplotlib emit a "redundantly defined" warning.
    ax.plot(x_axis, results.params[0] + slope * x_axis, linestyle='--', lw=5, color='red')
    # The y-axis is shared, so the limits set for the last panel apply to both.
    ax.set_ylim(np.mean(Y) - step, np.mean(Y) + step)
    ax.set_xlabel(name)
ax1.set_ylabel('Y')
plt.show()

## Overfit

![](https://i.stack.imgur.com/t0zit.png)

![](https://vitalflux.com/wp-content/uploads/2015/02/fittings.jpg)

![](https://theneural.files.wordpress.com/2011/07/valid2.jpeg)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from numpy.random import normal
# Overfitting demo: Y depends only on X1, but the model is additionally given
# 100 regressors drawn independently of Y, and is fitted on the first half of
# the rows only.
np.random.seed(67)

number_of_samples = 1000
X1 = normal(loc=0.0, scale=1, size=number_of_samples)
noise = normal(loc=0.0, scale=0.1, size=number_of_samples)
Y = 1 + X1 + noise

# X1 followed by 100 regressors that do not enter the equation for Y.
variables_list = [X1] + [
    normal(loc=0.0, scale=0.1, size=number_of_samples) for _ in range(100)
]

X = np.column_stack(variables_list)
ntr = int(0.5 * X.shape[0])  # number of rows used for training (first half)

model = linear_model.LinearRegression()
model.fit(X[:ntr, :], Y[:ntr])
Y_predicted = model.predict(X)

# mean_squared_error takes (y_true, y_pred) in that order.
train_error = mean_squared_error(Y[:ntr], Y_predicted[:ntr])
val_error = mean_squared_error(Y[ntr:], Y_predicted[ntr:])
print('Training Error:  \t', train_error)
print('Validation Error:\t', val_error)
print('diff:\t\t\t', val_error - train_error)

#### Bonus task

Визуализируем эффект переобучения. Генерация данных:

In [None]:
# Generate a noisy linear relationship Y = 1 + X1 + noise and show the raw points.
number_of_samples = 1000
X1 = normal(0.0, 1, number_of_samples)
noise = normal(0.0, 1.2, number_of_samples)
Y = 1 + X1 + noise
plt.plot(X1, Y, 'o')

Попробуем приближать данные разными степенями полинома:
$$
  y(x) = a_0 + a_1x + \ldots + a_nx^n
$$

Поэкспериментируйте с разными значениями степени и проанализируйте результаты:

In [None]:
# Regenerate the noisy linear data Y = 1 + X1 + noise for the polynomial experiment.
number_of_samples = 1000
X1 = normal(loc=0.0, scale=1, size=number_of_samples)
noise = normal(loc=0.0, scale=1.2, size=number_of_samples)
Y = 1 + X1 + noise


# Exercise placeholder: controls how many powers of X1 are used as features below.
max_power = # YOUR CODE

# Polynomial feature matrix: one column per power of X1, from X1**0 (constant)
# up to and including X1**max_power, i.e. y(x) = a_0 + a_1 x + ... + a_n x^n
# with n = max_power. The upper bound is max_power + 1 because range() excludes
# its stop value; range(max_power) would stop one power short.
variables_list = [X1 ** power for power in range(max_power + 1)]

X = np.column_stack(variables_list)
ntr = int(0.5 * X.shape[0])  # number of rows used for training (first half)

# define the linear model
model = # YOUR CODE
# fit the model on the training part (the first ntr rows)
# YOUR CODE

# predict the responses for the whole sample
Y_predicted = # YOUR CODE

# Mean squared error on the training rows (first ntr) and on the remaining
# validation rows; mean_squared_error takes (y_true, y_pred) in that order.
train_error = mean_squared_error(Y[:ntr], Y_predicted[:ntr])
val_error = mean_squared_error(Y[ntr:], Y_predicted[ntr:])
print('Training Error:  \t', train_error)
print('Validation Error:\t', val_error)
print('diff:\t\t\t', val_error - train_error)


f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Left panel: the whole data set.
ax1.plot(X1, Y, 'o')
ax1.set_ylim(Y.min(), Y.max())
ax1.set_title('All data')
ax1.set_xlabel('X1')
ax1.set_ylabel('Y')

# Right panel: training points against the model's predictions for the
# validation rows. Labels and a legend make the two series distinguishable.
ax2.plot(X1[:ntr], Y[:ntr], 'o', label='training data')
ax2.plot(X1[ntr:], Y_predicted[ntr:], 'o', label='validation predictions')
# comment it to see all dots
ax2.set_ylim(Y.min(), Y.max())
ax2.set_title('Training data vs. validation predictions')
ax2.set_xlabel('X1')
ax2.legend()
plt.show()

# Реальные данные

## Регрессия

In [None]:
# Load the Boston housing data set shipped with scikit-learn and print its description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this import fails on current releases — confirm the pinned scikit-learn
# version or switch to another data source.
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.DESCR)

In [None]:
# Assemble the Boston data into a DataFrame: one column per feature plus the
# target stored under 'PRICE'; the first rows are displayed as the cell output.
boston = load_boston()
data = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
data['PRICE'] = boston['target']
data.head()

In [None]:
import statsmodels.api as sm
from scipy import stats

# Exercise template: choose the response Y and the regressors X, then fit an
# OLS model with a constant column added by statsmodels and print its summary.
Y = # YOUR CODE
X = # YOUR CODE
X = sm.add_constant(X)
ols_model = sm.OLS(Y, X)
ols_results = ols_model.fit()
print(ols_results.summary())

## Классификация

In [None]:
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer data set bundled with scikit-learn and print its description.
breast_cancer = load_breast_cancer()
print(breast_cancer['DESCR'])

In [None]:
# Build a DataFrame from the first 28 feature columns and attach the target as 'CANCER'.
# NOTE(review): the reason for keeping only 28 columns is not stated here — confirm.
# NOTE(review): in scikit-learn's data set target 1 appears to denote "benign",
# so CANCER == 1 may mean "no cancer" — verify against breast_cancer.target_names.
data = pd.DataFrame(
    data=breast_cancer['data'][:, :28],
    columns=breast_cancer['feature_names'][:28],
)
data['CANCER'] = breast_cancer['target']
data.head()

In [None]:
import statsmodels.api as sm

# Exercise template: choose the binary response Y and the regressors X, then
# fit a logistic regression with a constant column added by statsmodels and
# print its summary.
Y = # YOUR CODE
X = # YOUR CODE
X = sm.add_constant(X)
model = sm.Logit(Y, X)
results = model.fit()
print(results.summary2())