# Задание 2
## Линейная регрессия 

Практическая работа по линейной регрессии

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

import warnings
# Install a global filter that ignores all warnings.
warnings.filterwarnings("ignore")
%matplotlib inline

## Загрузка данных

Данные взяты с сайта Kaggle.com: https://www.kaggle.com/shivam2503/diamonds

In [None]:
# Read the diamonds table; the first CSV column is parsed as the index and
# then discarded in favour of a fresh 0..n-1 index.
data = pd.read_csv('diamonds.csv', index_col=0)
data = data.reset_index(drop=True)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Descriptive statistics of the frame with pandas' default settings.
data.describe()

In [None]:
# Names of the columns stored with the generic ``object`` dtype; these are
# treated as the categorical features.
cat_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']


In [None]:
# Descriptive statistics restricted to the columns listed in cat_cols.
data[cat_cols].describe()

# EDA

Посмотрим на данные внимательнее.

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Names of the columns whose dtype is exactly float64 or int64; other numeric
# widths are not matched by this filter.
num_cols = [
    name
    for name, dtype in data.dtypes.items()
    if dtype == 'float64' or dtype == 'int64'
]


In [None]:
# Plot the distribution of every column in num_cols, one figure per column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version before migrating to histplot/displot.
for col in num_cols:
    sns.distplot(data[col])
    plt.show()

In [None]:
# Replace price with log(1 + price) and plot the transformed distribution.
# The column is overwritten in place, so re-running this cell applies the
# transform to already-transformed values.
data['price'] = data['price'].map(math.log1p)
sns.distplot(data['price'])

In [None]:
# Pairwise relationship plots for the columns in num_cols.
sns.pairplot(data[num_cols])

In [None]:
# Correlation matrix of the num_cols columns as a heatmap, annotated with
# two-decimal coefficients.
sns.heatmap(data[num_cols].corr(), cmap='RdBu_r', annot=True, fmt='.2f')

In [None]:
# Overwrite 'y' values above 20 with the mean of the 'y' values below 20
# (presumably measurement outliers — confirm against the plots).
data.loc[data['y']>20, 'y'] = data.loc[data['y']<20, 'y'].mean()

In [None]:
# Overwrite 'z' values above 20 with the mean of the 'z' values below 20
# (presumably measurement outliers — confirm against the plots).
data.loc[data['z']>20, 'z'] = data.loc[data['z']<20, 'z'].mean()

In [None]:
# Overwrite 'x' values below 1 with the mean of the 'x' values above 1.
# NOTE(review): only 'x' gets a lower-bound fix; verify whether 'y' and 'z'
# need the same treatment.
data.loc[data['x']<1, 'x'] = data.loc[data['x']>1, 'x'].mean()

In [None]:
# Correlation heatmap of the num_cols columns, drawn again on the current
# state of the frame.
sns.heatmap(data[num_cols].corr(), cmap='RdBu_r', annot=True, fmt='.2f')

## Создание линейной модели

In [None]:
# Drop the target from the feature list. The list is mutated in place, so
# running this cell a second time raises ValueError.
num_cols.remove('price')

In [None]:
# Feature matrix (the columns listed in num_cols) and the target column.
X, y = data[num_cols], data['price']

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Тренировка модели

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Fit on the training split only.
lm.fit(X_train,y_train)

## Оценка модели

In [None]:
# Intercept of the fitted model, on the scale of the (log-transformed) target.
print(lm.intercept_)

In [None]:
# One row per feature: the fitted coefficient and exp(coefficient) - 1.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=num_cols)
coeff_df['Coefficient_exp'] = coeff_df['Coefficient'].map(math.expm1)
coeff_df

In [None]:
# Predict the (log-transformed) target for the held-out rows.
predictions = lm.predict(X_test)

In [None]:
# Actual (x axis) versus predicted (y axis) target on the test split.
plt.scatter(x=y_test, y=predictions)

**Residual Histogram**

In [None]:
# Distribution of test residuals (actual minus predicted) with 100 bins.
sns.distplot((y_test-predictions),bins=100);

In [None]:
def r2_adj(y_test, yhat, model=None, n_features=None):
    """Return the adjusted R² of predictions ``yhat`` against ``y_test``.

    Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), where ``n`` is the
    number of observations and ``p`` the number of predictors.

    Args:
        y_test: Observed target values (1-D array-like).
        yhat: Predicted target values, same length as ``y_test``.
        model: Unused; accepted only so existing calls keep working. The
            default is ``None`` instead of a model object captured when the
            function is defined, which would go stale once the notebook
            refits a new model.
        n_features: Number of predictors ``p``. When omitted, the column
            count of the notebook-level ``X_test`` is used (the previous
            behaviour), looked up at call time.

    Returns:
        The adjusted coefficient of determination as a float.

    Raises:
        ValueError: If the inputs differ in length, or if ``n - p - 1`` is
            not positive (the adjustment is undefined in that case).
    """
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(yhat, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_test and yhat must have the same length, got "
            f"{y_true.size} and {y_pred.size}"
        )

    if n_features is None:
        # Backward-compatible fallback: the current notebook global.
        n_features = X_test.shape[1]

    n_obs = y_true.size
    dof = n_obs - n_features - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R² needs more observations than predictors + 1 "
            f"(n={n_obs}, p={n_features})"
        )

    ss_residual = np.sum((y_true - y_pred) ** 2)
    ss_total = np.sum((y_true - y_true.mean()) ** 2)
    r_squared = 1 - ss_residual / ss_total
    return float(1 - (1 - r_squared) * (n_obs - 1) / dof)

In [None]:
from sklearn import metrics

In [None]:
# Empty results table with one row per metric; columns are added per model.
scores = pd.DataFrame(index = ['MAE', 'MSE', 'RMSE', 'R2_adj'])

In [None]:
# Undo the log1p transform so the metrics are reported on the original price
# scale. np.expm1 transforms the whole array in one call instead of looping
# over the elements in Python.
# Both names are overwritten, so re-running this cell without re-running the
# prediction cell would apply expm1 a second time.
predictions = np.expm1(np.asarray(predictions, dtype=float))
y_test = np.expm1(np.asarray(y_test, dtype=float))

# MSE is computed once and reused for RMSE (RMSE uses the unrounded value).
mse_raw = metrics.mean_squared_error(y_test, predictions)
mae = round(metrics.mean_absolute_error(y_test, predictions), 3)
mse = round(mse_raw, 3)
rmse = round(np.sqrt(mse_raw), 3)
r2 = round(r2_adj(y_test, predictions), 3)

scores['mod1'] = [mae, mse, rmse, r2]

scores

## Построим вторую модель 

С учётом категориальных признаков

In [None]:
# One-hot encode every categorical column.
#
# For each column in cat_cols:
#   1. the strings are replaced in place by integer codes 0..k-1 (sorted label
#      order, via LabelEncoder); later cells drop these code columns from the
#      feature list;
#   2. k-1 indicator columns named "<col>1" .. "<col>{k-1}" are appended (the
#      indicator for code 0 is dropped as the reference level).
#
# pd.get_dummies is used instead of OneHotEncoder(sparse=False): the ``sparse``
# keyword is not accepted by current scikit-learn releases, while get_dummies
# produces the same column names and float64 dtype on every version and aligns
# on the frame's index rather than on row position.
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    ohe_cols = pd.get_dummies(
        data[col],
        prefix=col,
        prefix_sep='',
        drop_first=True,
        dtype=float,  # float64, so the dtype-based column filter still picks them up
    )
    data = pd.concat([data, ohe_cols], axis=1)
data.head()

In [None]:
# Rebuild the list of float64 / int64 columns from the current frame.
num_cols = [
    name
    for name, dtype in data.dtypes.items()
    if dtype == 'float64' or dtype == 'int64'
]

In [None]:
# Remove the target and every cat_cols entry from the feature list, in place.
# list.remove raises ValueError if a name is absent, e.g. on a second run.
for unwanted in ['price', *cat_cols]:
    num_cols.remove(unwanted)

In [None]:
# Second model: same pipeline as the first one, fitted on the columns
# currently listed in num_cols.
X = data[num_cols]
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lm = LinearRegression()
lm.fit(X_train, y_train)

# Predict and undo the log1p transform in one vectorized call each, so the
# metrics are on the original price scale.
predictions = np.expm1(lm.predict(X_test))
y_test = np.expm1(np.asarray(y_test, dtype=float))

# MSE is computed once and reused for RMSE (RMSE uses the unrounded value).
mse_raw = metrics.mean_squared_error(y_test, predictions)
mae = round(metrics.mean_absolute_error(y_test, predictions), 3)
mse = round(mse_raw, 3)
rmse = round(np.sqrt(mse_raw), 3)
r2 = round(r2_adj(y_test, predictions), 3)

scores['mod2'] = [mae, mse, rmse, r2]

scores