## Regressão

**Tópicos:** <br>
    - O que é uma regressão? <br>
    - Como avaliar meu modelo? <br>
    - Data Science: Treinamento e Teste <br>

**Regressão** é um dos tópicos de Machine Learning que busca responder às perguntas **"Quanto?" ou "Quando?"**
<br>
Por exemplo, <br>
Dada a idade de uma pessoa, quanto é o seu salário? <br>
Dada a idade de uma pessoa, quando ela irá morrer? <br>
<br>
Essas perguntas basicamente têm o mesmo formato:
<br>
y = f(x)
<br><br>
onde: <br> 
x é a idade ~ feature <br>
f(x) é uma função que varia com x ~ modelo <br>
y é o salário, ou a idade em que a pessoa vai morrer ~ variável resposta<br>

In [44]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# Lift the column display limit so wide DataFrames are shown with every column.
pd.set_option('display.max_columns', None) 

In [107]:
# Load the training CSV into `df`; the plotting cells further down read this
# global DataFrame (including its `SalePrice` column).
df = pd.read_csv('../dataset_random/Kaggle/Datasets/Kaggle/regression/train.csv')
# Last expression of the cell: display the column labels.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [108]:
# Display 3 randomly sampled rows as a quick look at the data.
df.sample(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1163,1164,90,RL,60.0,12900,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,Feedr,Norm,Duplex,SFoyer,4,4,1969,1969,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,Av,GLQ,1198,Unf,0,0,1198,GasA,TA,Y,SBrkr,1258,0,0,1258,2,0,0,2,0,2,TA,6,Typ,0,,CarPort,1969.0,Unf,2,400,Fa,TA,Y,120,0,0,0,0,0,,,,0,1,2008,WD,Alloca,108959
1057,1058,60,RL,,29959,Pave,,IR2,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,7,6,1994,1994,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,GLQ,595,Unf,0,378,973,GasA,Ex,Y,SBrkr,979,871,0,1850,0,0,2,1,3,1,Gd,7,Typ,1,Gd,BuiltIn,1994.0,Fin,2,467,TA,TA,Y,168,98,0,0,0,0,,,,0,1,2009,WD,Normal,248000
327,328,20,RL,80.0,11600,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1960,1960,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,175.0,TA,TA,CBlock,TA,TA,No,Rec,565,Unf,0,818,1383,GasA,TA,Y,SBrkr,1383,0,0,1383,0,0,1,1,3,1,TA,7,Typ,0,,Attchd,1960.0,RFn,1,292,TA,TA,Y,0,45,0,0,0,0,,,,0,4,2006,WD,Normal,145250


In [5]:
def X_poly(array, degree):
    """Build a table of polynomial features from a 1-D sequence of numbers.

    Column ``x_{d}`` holds ``array ** (d + 1)`` for every ``d`` in
    ``range(degree)``; ``x_0`` is therefore the input itself and no constant
    (power 0) column is produced.

    Args:
        array: 1-D array-like of numbers.
        degree: Highest power to generate, i.e. the number of columns.

    Returns:
        A ``pandas.DataFrame`` with ``degree`` float columns named
        ``x_0`` .. ``x_{degree - 1}``.
    """
    # Work in float64: integer arrays wrap around silently when raised to
    # high powers (1000 ** 7 already exceeds int64), corrupting the features.
    base = np.asarray(array, dtype=float)
    return pd.DataFrame({f'x_{d}': base ** (d + 1) for d in range(degree)})

def plot_poly(df, col, degree, many, size=(12,8)):
    """Fit a polynomial regression of ``SalePrice`` on one column and plot it.

    The first ``many`` rows of ``df`` are used for training. The training
    points are drawn as a scatter and the fitted curve as a black line.

    Args:
        df: DataFrame containing ``col`` and a ``SalePrice`` column.
        col: Name of the feature column.
        degree: Polynomial degree forwarded to ``X_poly``.
        many: Number of leading rows used for training.
        size: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=size)

    # Features as float so high powers cannot overflow an integer column.
    x = df[col][:many].to_numpy(dtype=float)
    y = df['SalePrice'][:many].to_numpy()

    # Evenly spaced grid over the observed range. A fixed step of 100 would
    # collapse to a single point for columns whose range is below 100.
    x_grid = np.linspace(x.min(), x.max(), 200)

    X_plot = X_poly(x_grid, degree)
    X_train = X_poly(x, degree)

    reg = LinearRegression().fit(X_train.values, y.reshape(-1, 1))

    # Training points.
    ax.scatter(x=x, y=y, alpha=.4)
    # Fitted curve; predict on a plain array, matching how the model was fitted.
    ax.plot(x_grid, reg.predict(X_plot.values).ravel(), color='black', alpha=.7)
    plt.show()
    return None

## Equação da Reta

y = a*x + b

In [158]:
@interact
def plot_poly(col = ['LotArea', 'OverallQual', 'GrLivArea', 'GarageArea', 'YearBuilt', 'TotRmsAbvGrd'], a =100, b=300000, many=(50,500,50)):
    """Interactively compare a hand-tuned line ``y = a*x + b`` with the data.

    Reads the global ``df``. The first ``many`` rows are plotted as
    ``SalePrice`` against ``col``, with the line's predictions for the same
    ``x`` values overlaid in black.

    Args:
        col: Feature column to plot on the x axis.
        a: Slope of the line.
        b: Intercept of the line.
        many: Number of leading rows of ``df`` to plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # NOTE(review): this definition reuses the name of the `plot_poly` helper
    # defined earlier in the notebook and replaces it once this cell runs.
    fig, ax = plt.subplots(figsize=(12, 8))

    x = df[col][:many].values
    y = df['SalePrice'][:many].values

    # Observed points.
    ax.scatter(x=x, y=y, alpha=.4)

    # Points on the user-chosen line, drawn on the same axes.
    y_pred = a * x + b
    ax.scatter(x=x, y=y_pred, color='black', alpha=.3)

    plt.show()
    return None

interactive(children=(Dropdown(description='col', options=('LotArea', 'OverallQual', 'GrLivArea', 'GarageArea'…

In [106]:
@interact
def plot_poly_interactive(col = ['LotArea', 'OverallQual', 'GrLivArea', 'GarageArea', 'YearBuilt', 'TotRmsAbvGrd'], show_train = [0,1],  show_test = [0,1], degree = (1,15,1), many = (50,500,50), interval=(1,1000,10)):
    """Fit a polynomial regression and compare train vs. test error interactively.

    Reads the global ``df``. The first ``many`` rows form the training set and
    the last ``many`` rows the test set. The left panel shows the data and the
    fitted curve; the right panel shows the MSE of the model and of a constant
    mean predictor on each split.

    Args:
        col: Feature column used as the single predictor.
        show_train: Truthy to draw the training points.
        show_test: Truthy to draw the test points.
        degree: Polynomial degree forwarded to ``X_poly``.
        many: Number of rows in each of the train and test splits.
        interval: Step of the x grid on which the fitted curve is drawn.

    Returns:
        None. Metrics are printed and the figure is displayed.
    """
    fig, ax = plt.subplots(1, 2, figsize=(20, 8))

    # Features as float so high powers cannot overflow an integer column.
    x = df[col][:many].to_numpy(dtype=float)
    x_test = df[col][-many:].to_numpy(dtype=float)
    x_poly = np.arange(x.min(), x.max(), interval)

    # Targets.
    y = df['SalePrice'][:many].values
    y_test = df['SalePrice'][-many:].values

    # Polynomial design matrices for the curve grid, train and test splits.
    X_plot = X_poly(x_poly, degree)
    X_train = X_poly(x, degree)
    X_test = X_poly(x_test, degree)

    # Train.
    reg = LinearRegression().fit(X_train.values, y.reshape(-1, 1))

    # Evaluate the model and a constant mean predictor on each split.
    y_train_pred = reg.predict(X_train.values)
    y_test_pred = reg.predict(X_test.values)
    mse_train = mean_squared_error(y, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mse_mean_train = mean_squared_error(y, np.ones(y.shape[0])*y.mean())
    # NOTE(review): the test baseline uses the mean of the test targets
    # themselves rather than the training mean - confirm this is intended.
    mse_mean = mean_squared_error(y_test, np.ones(y_test.shape[0])*y_test.mean())
    print(f'Eval Metric Train: {mse_train:,.2f}')
    print(f'Eval Metric Test: {mse_test:,.2f}')
    print(f'Eval Metric Test - Mean: {mse_mean:,.2f}')

    # Right panel: one bar per metric.
    ax[1].bar('MSE Train Model', mse_train, 0.35,  color='green', alpha=.4, label='Train')
    ax[1].bar('MSE Train Mean', mse_mean_train, 0.35,  color='green', alpha=.1, label='Train')
    ax[1].bar('MSE Test Model', mse_test, 0.35, color='red', alpha=.4, label='Test')
    ax[1].bar('MSE TEST Mean', mse_mean, 0.35,  color='red', alpha=.1, label='Test')

    # Left panel: training points.
    if show_train:
        ax[0].scatter(x = x, y = y, alpha = .4, color='blue')
    # Left panel: test points.
    if show_test:
        ax[0].scatter(x = x_test, y = y_test, color='orange', alpha = .7)
    # Fitted curve; predict on a plain array, matching how the model was fitted.
    ax[0].plot(x_poly, reg.predict(X_plot.values).ravel(), color = 'black', alpha=.2)
    ax[0].set_xlabel(col)
    ax[0].set_ylabel('Sale Price')
    plt.show()
    return None

interactive(children=(Dropdown(description='col', options=('LotArea', 'OverallQual', 'GrLivArea', 'GarageArea'…