# Non-Linear Regression

In [None]:
import numpy as np
import pandas as pd

import os

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import axes3d

# For regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf  #Provides a formula-based interface

%matplotlib inline
#plt.style.use('seaborn-white')

# Locate the auto-mpg CSV under "<home>/datasets/".
# os.path.expanduser("~") resolves the home directory even when the HOME
# environment variable is not set (os.environ['HOME'] would raise KeyError
# there), and os.path.join inserts the platform's path separator.
# Joining with a trailing "" keeps the trailing separator on the two
# directory strings, so code that concatenates onto them keeps working.
homedir = os.path.join(os.path.expanduser("~"), "")
datapath = os.path.join(homedir, "datasets", "")
autompgfile = os.path.join(datapath, "auto-mpg.csv")

## Load dataset, read metadata

In [None]:
# Read the CSV file named by `autompgfile` into the raw DataFrame.
dataset_org = pd.read_csv(filepath_or_buffer=autompgfile)
# dataset_org   # uncomment to display the raw table

In [None]:
# Work on an independent copy of the raw frame. A plain assignment would only
# bind a second name to the same object, so any in-place modification of
# `dataset` would silently change `dataset_org` as well.
dataset = dataset_org.copy()
# Column dtypes as inferred while reading the CSV.
dataset.dtypes

In [None]:
# Show the column labels of the working DataFrame.
dataset.columns

### Quick basics check

In [None]:
import pandas as pd
import numpy as np

# Small 4x3 toy frame used to demonstrate row/column iteration.
data = dict(
    col1=[1, 2, 3, 4],
    col2=[5, 6, 7, 8],
    col3=[9, 10, 11, 12],
)
row_labels = ['row1', 'row2', 'row3', 'row4']
df = pd.DataFrame(data, index=row_labels)

# Header, the frame itself, then a 20-dash separator, one item per line.
print("Original DataFrame:", df, "-" * 20, sep="\n")
 

In [None]:
# Print every cell as "(row label, column label): value", one output line per
# DataFrame row; each cell is followed by a " | " separator.
for row_label, record in df.iterrows():
    rendered_cells = (
        f"({row_label}, {col_label}): {cell}" + " | "
        for col_label, cell in record.items()
    )
    print("".join(rendered_cells))

In [None]:
# Series.items() yields (column_name, value) pairs; unpack each pair so the
# value can be tested on its own. Only cells equal to 7 are printed; a row
# without a match still produces an empty output line.
for _, record in df.iterrows():
    matches = [
        f"{col_label} -> {cell} " + " | "
        for col_label, cell in record.items()
        if cell == 7
    ]
    print("".join(matches))

### Back to the auto-mpg dataset

#### Replace any invalid values with NaN

In [None]:
from numpy import nan

# Swap every '?' entry for NaN directly on the existing frame (in place),
# then report the frame's (rows, columns) shape.
dataset.replace(to_replace='?', value=nan, inplace=True)
current_shape = dataset.shape
print(f"dataset size -> {current_shape}")

#### Remove all rows containing NaN, if any

In [None]:
# Drop every row that contains at least one NaN. The explicit copy() makes
# `dataset` an independent frame rather than a possible view/derived slice,
# so assigning columns on it afterwards cannot raise pandas'
# SettingWithCopyWarning (relevant on pandas versions without copy-on-write).
dataset = dataset.dropna().copy()
print(f"dataset size -> {dataset.shape}")

#### Convert object datatype to float

In [None]:
# Cast the horsepower column to floating point, then list the dtypes to
# confirm the conversion.
horsepower_as_float = dataset["horsepower"].astype(float)
dataset["horsepower"] = horsepower_as_float
dataset.dtypes

#### Plots of `mpg` (dependent) vs. `horsepower` (independent)

In [None]:
# Scatter of the raw observations, overlaid with polynomial fits of
# increasing degree drawn by seaborn (fit line only, no confidence band).
hp = dataset.horsepower
mpg = dataset.mpg
plt.scatter(hp, mpg, facecolors='None', edgecolors='k', alpha=.5)

# (polynomial degree, legend label, line colour) for each fitted curve.
# A further entry such as (70, 'Degree 70', 'r') can be appended here.
curve_specs = (
    (1, 'Linear', 'orange'),
    (2, 'Degree 2', 'lightblue'),
    (5, 'Degree 5', 'g'),
)
for degree, legend_label, line_color in curve_specs:
    sns.regplot(x=hp, y=mpg, ci=None, label=legend_label, order=degree,
                scatter=False, color=line_color)

plt.legend()
plt.ylim(5, 55)
plt.xlim(40, 240);


#### Creating new column `(2nd Degree)`

In [None]:
# Squared horsepower, stored as its own column so it can be used as a
# second-degree regressor.
dataset['horsepower2'] = dataset["horsepower"].pow(2)
# dataset.head()   # uncomment to preview the new column

### `statsmodels` package

#### Using 1st Degree

In [None]:
# First-degree specification: ordinary least squares of mpg on horsepower.
linear_formula = 'mpg ~ horsepower '
auto_stats = smf.ols(formula=linear_formula, data=dataset).fit()

In [None]:
# Display the summary table of the fitted OLS model held in `auto_stats`.
auto_stats.summary()

#### Using 2nd Degree

In [None]:
# Second-degree specification: add the squared term (horsepower2) as a
# regressor next to horsepower, refit, and show the summary.
quadratic_formula = 'mpg ~ horsepower + horsepower2'
auto_stats = smf.ols(formula=quadratic_formula, data=dataset).fit()
auto_stats.summary()

In [None]:
# Extend the quadratic specification with weight as an additional regressor,
# refit, and show the summary.
weighted_formula = 'mpg ~ horsepower + horsepower2 + weight'
auto_stats = smf.ols(formula=weighted_formula, data=dataset).fit()
auto_stats.summary()

### Non-linear regression (NLR) using `scikit-learn`

In [None]:
# Ordinary least-squares estimator with default settings; LinearRegression is
# the same class that skl_lm.LinearRegression refers to.
regr = LinearRegression()

#### LR model using `horsepower` to predict `mpg`

In [None]:
# Linear fit: mpg = b0 + b1 * hp
# Turn the horsepower vector into a single-column 2-D array
# (n rows x 1 col) by appending a new axis.
X = dataset["horsepower"].to_numpy()[:, np.newaxis]
Y = dataset["mpg"]
regr.fit(X, Y)

#### Predictions and Residuals

In [None]:
# Fitted values of the linear model and their residuals (observed - fitted).
linear_fitted = regr.predict(X)
dataset['pred1'] = linear_fitted
dataset['resid1'] = dataset['mpg'].sub(dataset['pred1'])


#### NLR - Quadratic Fit

In [None]:
# Quadratic fit: mpg = b0 + b1 * hp + b2 * hp^2
# Two-column feature matrix: horsepower and its square.
W = dataset[['horsepower', 'horsepower2']].to_numpy()

# fit() returns the estimator itself, so the same `regr` object is refitted
# on W and used for prediction in a single chain.
dataset['pred2'] = regr.fit(W, Y).predict(W)
dataset['resid2'] = dataset['mpg'].sub(dataset['pred2'])

dataset

#### Plots - `Linear Fit` and `Quadratic Fit`

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One panel per model: (axes, fitted-value column, residual column, title).
# Left panel is the linear fit, right panel the quadratic fit.
panels = (
    (ax1, 'pred1', 'resid1', 'Residual Plot for Linear Fit'),
    (ax2, 'pred2', 'resid2', 'Residual Plot for Quadratic Fit'),
)

for panel_ax, fitted_col, resid_col, panel_title in panels:
    # Residuals against fitted values with a LOWESS smoother drawn in red.
    sns.regplot(x=dataset[fitted_col], y=dataset[resid_col], lowess=True,
                ax=panel_ax, line_kws={'color': 'r', 'lw': 1},
                scatter_kws={'facecolors': 'None', 'edgecolors': 'k', 'alpha': 0.5})

    # Dotted zero line spanning the x-range of the plotted data.
    x_low, x_high = panel_ax.xaxis.get_data_interval()
    panel_ax.hlines(0, xmin=x_low, xmax=x_high, linestyles='dotted')

    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Fitted values')
    panel_ax.set_ylabel('Residuals')