# DS-SF-25 | Codealong 07 | Introduction to Regression and Model Fit, Part 2 | Answer Key

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import feature_selection, linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A - Model's F-statistic

In [2]:
# Load the Zillow listings; the `ID` column becomes the row index.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'zillow-07.csv'),
    index_col='ID',
)

> ### `SalePrice` as a function of `Size`

In [3]:
# OLS regression of SalePrice on Size; the fitted results are kept in `model`.
model = (
    smf.ols(formula='SalePrice ~ Size', data=df)
    .fit()
)

model.summary()

> ### `SalePrice` as a function of `IsAStudio`

In [4]:
# OLS regression of SalePrice on the IsAStudio column; this rebinds `model`,
# so the cells that follow inspect this fit rather than the Size one.
model = (
    smf.ols(formula='SalePrice ~ IsAStudio', data=df)
    .fit()
)

model.summary()

### Model's F-value (with significance level of `5%`)

In [5]:
# F-value of the fit currently bound to `model` (the SalePrice ~ IsAStudio regression).
model.fvalue

### Corresponding p-value

In [6]:
# p-value attached to the F-value of the fit currently bound to `model`.
model.f_pvalue

## Part B1 - Linear Regression Modeling with `sklearn`

In [7]:
def summary(X, y, model):
    """Print per-regressor F-tests, R^2 and the coefficients of a fitted linear model.

    Parameters
    ----------
    X : pandas.DataFrame
        Regressors; ``X.columns`` supplies the name printed beside each slope.
    y : array-like
        Response values, handed together with ``X`` to ``f_regression`` and
        ``model.score``.
    model : fitted estimator
        Must expose ``score(X, y)``, ``intercept_`` and an iterable ``coef_``
        with one entry per column of ``X``.

    Returns
    -------
    None
        The report is written to standard output.

    Notes
    -----
    Every ``print`` call takes a single pre-formatted string so the function
    produces the same text under Python 2 and Python 3.
    """
    # f_regression scores each regressor separately, so these are not the
    # joint F-statistic of the whole model.
    fvalues, f_pvalues = feature_selection.f_regression(X, y)
    print('F-statistic (not joint but instead done sequentially for each regressor)')
    print('- F-value {}'.format(fvalues))
    print('- p-value {}'.format(f_pvalues))
    print('')

    print('R^2 = {}'.format(model.score(X, y)))
    print('')

    print('Coefficients')
    print('- beta_0 (Intercept) = {}'.format(model.intercept_))
    # Slopes are numbered from 1 because beta_0 is the intercept.
    for i, (name, coef) in enumerate(zip(X.columns, model.coef_), 1):
        print('- beta_{} ({}) = {}'.format(i, name, coef))

> ### Remove samples with `NaN` in `IsAStudio`, `Size`, or `LotSize`

In [8]:
# Discard every row that is missing IsAStudio, Size or LotSize.
df = df.dropna(axis=0, subset=['IsAStudio', 'Size', 'LotSize'])

### SalePrice ~ IsAStudio with `statsmodels`

In [9]:
# statsmodels fit of SalePrice on IsAStudio, displayed directly without keeping the results.
(
    smf.ols(formula='SalePrice ~ IsAStudio', data=df)
    .fit()
    .summary()
)

> ### SalePrice ~ IsAStudio with `sklearn`

In [10]:
# Regressor frame (the list selector keeps it a DataFrame) and the response column.
X = df.loc[:, ['IsAStudio']]
y = df['SalePrice']

# Build the estimator, then fit it in place.
model = linear_model.LinearRegression()
model.fit(X, y)

summary(X, y, model)

### SalePrice ~ Size + LotSize with `statsmodels`

In [11]:
# statsmodels fit of SalePrice on Size and LotSize together, displayed directly.
(
    smf.ols(formula='SalePrice ~ Size + LotSize', data=df)
    .fit()
    .summary()
)

> ### SalePrice ~ Size + LotSize with `sklearn`

In [12]:
# Two-column regressor frame and the response column.
X = df.loc[:, ['Size', 'LotSize']]
y = df['SalePrice']

# Build the estimator, then fit it in place.
model = linear_model.LinearRegression()
model.fit(X, y)

summary(X, y, model)

## Part B2 - Linear Regression Modeling with `sklearn` (cont.)

In [13]:
# Switch datasets: `df` is rebound to the advertising data from here on.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'advertising.csv')
)

In [14]:
# Display the advertising frame; the `display.max_rows` option set to 20 in the
# import cell applies to this rendering.
df

## Plots

> ### Sales ~ TV

In [15]:
# Scatter of Sales against TV with a fitted regression line.
# x, y and data are named explicitly: seaborn >= 0.12 rejects them positionally.
sns.lmplot(x = 'TV', y = 'Sales', data = df)

> ### Sales ~ Radio

In [16]:
# Scatter of Sales against Radio with a fitted regression line.
# x, y and data are named explicitly: seaborn >= 0.12 rejects them positionally.
sns.lmplot(x = 'Radio', y = 'Sales', data = df)

> ### Sales ~ Newspaper

In [17]:
# Scatter of Sales against Newspaper with a fitted regression line.
# x, y and data are named explicitly: seaborn >= 0.12 rejects them positionally.
sns.lmplot(x = 'Newspaper', y = 'Sales', data = df)

## Simple linear regressions

> ### Sales ~ TV

In [18]:
# Simple OLS fit of Sales on TV; kept as `model_tv` for the residual plots later on.
model_tv = (
    smf.ols(formula='Sales ~ TV', data=df)
    .fit()
)

model_tv.summary()

> ### Sales ~ Radio

In [19]:
# Simple OLS fit of Sales on Radio; kept as `model_radio` for the residual plots later on.
model_radio = (
    smf.ols(formula='Sales ~ Radio', data=df)
    .fit()
)

model_radio.summary()

> ### Sales ~ Newspaper

In [20]:
# Simple OLS fit of Sales on Newspaper; kept as `model_newspaper` for the residual plots later on.
model_newspaper = (
    smf.ols(formula='Sales ~ Newspaper', data=df)
    .fit()
)

model_newspaper.summary()

## Residuals

> ### Sales ~ TV

In [21]:
# Q-Q plot of the Sales ~ TV residuals. The trailing semicolon stops the cell
# from echoing the object returned by qqplot.
sm.qqplot(model_tv.resid, line = 's');

In [22]:
# Regression diagnostic plots for the TV regressor of the Sales ~ TV fit.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model_tv, 'TV');

> ### Sales ~ Radio

In [23]:
# Q-Q plot of the Sales ~ Radio residuals. The trailing semicolon stops the
# cell from echoing the object returned by qqplot.
sm.qqplot(model_radio.resid, line = 's');

In [24]:
# Regression diagnostic plots for the Radio regressor of the Sales ~ Radio fit.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model_radio, 'Radio');

> ### Sales ~ Newspaper

In [25]:
# Q-Q plot of the Sales ~ Newspaper residuals. The trailing semicolon stops the
# cell from echoing the object returned by qqplot.
sm.qqplot(model_newspaper.resid, line = 's');

In [26]:
# Regression diagnostic plots for the Newspaper regressor of the Sales ~ Newspaper fit.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper');

> ### Sales ~ TV + Radio + Newspaper

In [27]:
# Multiple regression of Sales on all three advertising channels; rebinds `model`.
model = (
    smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=df)
    .fit()
)

model.summary()

> ### Sales ~ TV + Radio

In [28]:
# Multiple regression of Sales on TV and Radio only; rebinds `model`, which the
# three plotting cells that follow read.
model = (
    smf.ols(formula='Sales ~ TV + Radio', data=df)
    .fit()
)

model.summary()

In [29]:
# Q-Q plot of the residuals of the fit currently bound to `model`.
# The trailing semicolon stops the cell from echoing the returned object.
sm.qqplot(model.resid, line = 's');

In [30]:
# Regression diagnostic plots for the TV regressor of the fit bound to `model`.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model, 'TV');

In [31]:
# Regression diagnostic plots for the Radio regressor of the fit bound to `model`.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model, 'Radio');

## Part C - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [32]:
# Fit with the TV-by-Radio interaction; rebinds `model`. The interaction
# regressor is addressed as 'TV:Radio' by the plotting cell further down.
model = (
    smf.ols(formula='Sales ~ TV + Radio + TV * Radio', data=df)
    .fit()
)

model.summary()

In [33]:
# Q-Q plot of the residuals of the interaction fit bound to `model`.
# The trailing semicolon stops the cell from echoing the returned object.
sm.qqplot(model.resid, line = 's');

In [34]:
# Regression diagnostic plots for the TV regressor of the interaction fit.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model, 'TV');

In [35]:
# Regression diagnostic plots for the Radio regressor of the interaction fit.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model, 'Radio');

In [36]:
# Regression diagnostic plots for the 'TV:Radio' interaction regressor.
# The trailing semicolon stops the cell from echoing the returned object.
sm.graphics.plot_regress_exog(model, 'TV:Radio');

## Part D - One-hot encoding for categorical variables

In [37]:
# Reload the Zillow listings from disk, rebinding `df` (it held the advertising
# data until now); `ID` becomes the row index again.
df = pd.read_csv(
    os.path.join('..', 'datasets', 'zillow-07.csv'),
    index_col='ID',
)

In [38]:
# Remove, in place, every row whose IsAStudio equals 1.
df.drop(labels = df.loc[df.IsAStudio == 1].index, axis = 'index', inplace = True)

In [39]:
# BathCount enters this fit as a single numeric regressor (no dummy coding yet).
(
    smf.ols(formula='SalePrice ~ BathCount', data=df)
    .fit()
    .summary()
)

> ### What's the distribution of bathroom counts in the dataset?

In [40]:
# Frequency of each BathCount value (NaN included), ordered by the value itself.
df['BathCount'].value_counts(dropna=False).sort_index()

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [41]:
# Keep only the rows whose BathCount is 1, 2, 3 or 4.
df = df.loc[df['BathCount'].isin([1, 2, 3, 4])]

In [42]:
# Re-check the BathCount frequencies (NaN included) after the filter.
df['BathCount'].value_counts(dropna=False).sort_index()

> ### Let's use `pandas`'s `get_dummies` to create our one-hot encoding

In [43]:
# One indicator column per distinct BathCount value, each named with a 'Bath' prefix.
baths_df = pd.get_dummies(df['BathCount'], prefix='Bath')

In [44]:
# Inspect the freshly created indicator columns.
baths_df

In [45]:
# Drop the '.0' suffix from the indicator names (e.g. 'Bath_1.0' -> 'Bath_1').
baths_df = baths_df.rename(columns = {
    'Bath_1.0': 'Bath_1',
    'Bath_2.0': 'Bath_2',
    'Bath_3.0': 'Bath_3',
    'Bath_4.0': 'Bath_4',
})

In [46]:
# Inspect the indicator columns again after the rename.
baths_df

In [47]:
# Attach the Bath_* indicator columns to the listings. `baths_df` was derived
# from `df.BathCount`, so both frames carry the same row index.
# NOTE(review): this rebinds `df` to a frame that already contains Bath_*;
# running the cell a second time will presumably clash on those column names.
df = df.join([baths_df])

In [48]:
# List the column labels to confirm the Bath_* indicators are now part of `df`.
df.columns

## Activity | One-hot encoding for categorical variables

> ### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [49]:
# Bath_1 is the omitted indicator here. Only bathroom indicators enter the
# formula, matching this cell's heading and the three other activity fits,
# each of which leaves out a different Bath_* column and has no other regressor.
smf.ols(formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4', data = df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [50]:
# Bath_2 is the omitted indicator in this fit.
(
    smf.ols(formula='SalePrice ~ Bath_1 + Bath_3 + Bath_4', data=df)
    .fit()
    .summary()
)

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [51]:
# Bath_3 is the omitted indicator in this fit.
(
    smf.ols(formula='SalePrice ~ Bath_1 + Bath_2 + Bath_4', data=df)
    .fit()
    .summary()
)

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [52]:
# Bath_4 is the omitted indicator in this fit.
(
    smf.ols(formula='SalePrice ~ Bath_1 + Bath_2 + Bath_3', data=df)
    .fit()
    .summary()
)