###### Let's start with importing what we need...  

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import datasets 

###### Let's load the publicly available diabetes dataset and print out a description of the dataset

###### Your task is to build the best linear regression model you can using this data to predict the 'target' field.

#### Diabetes dataset

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.
Data Set Characteristics
Number of Instances: 442
Number of Attributes
First 10 columns are numeric predictive values
Target: Column 11 is a quantitative measure of disease progression one year after baseline
Attribute Information
 - age:     age in years <br>
  - sex:     sex <br>
  - bmi:     body mass index <br>
  - bp:      average blood pressure <br>
  - s1:      tc, total serum cholesterol <br>
  - s2:      ldl, low-density lipoproteins <br>
  - s3:      hdl, high-density lipoproteins <br>
  - s4:      tch, total cholesterol / HDL <br>
  - s5:      ltg, possibly log of serum triglycerides level <br>
  - s6:      glu, blood sugar level <br>
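Note: Each of these 10 feature variables has been mean centered and scaled by the standard deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1). This describes the pre-scaled version of the dataset; the raw tab-separated file read below keeps the original units (e.g. AGE in years).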
Source URL
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499. (https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)


Source URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
Data URL: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt
Note: The Data URL mentioned above is linked from the Source URL page, which provides detailed information about the dataset and its variables, as well as reference links including the link to the dataset itself.



##### Read the data into a dataframe, then print the dataframe head.

In [9]:
 #Use this URL to read in the data into a pandas dataframe called "df".
# The file is tab-delimited, so pass sep="\t" explicitly (read_csv defaults to a comma).
df = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep="\t")

# Show the first rows, as the section heading asks, to confirm the file was
# parsed into separate columns.
df.head()

###### Basic field information

In [10]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AGE     442 non-null    int64  
 1   SEX     442 non-null    int64  
 2   BMI     442 non-null    float64
 3   BP      442 non-null    float64
 4   S1      442 non-null    int64  
 5   S2      442 non-null    float64
 6   S3      442 non-null    float64
 7   S4      442 non-null    float64
 8   S5      442 non-null    float64
 9   S6      442 non-null    int64  
 10  Y       442 non-null    int64  
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


###### Convert sex to a categorical variable

In [13]:
# SEX is a two-level label stored as an integer; casting it to the pandas
# "category" dtype lets the formula interface treat it as a categorical term.
sex_as_category = df['SEX'].astype('category')
df['SEX'] = sex_as_category


In [14]:
# Re-inspect the dtypes to confirm that SEX is now reported as "category".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   AGE     442 non-null    int64   
 1   SEX     442 non-null    category
 2   BMI     442 non-null    float64 
 3   BP      442 non-null    float64 
 4   S1      442 non-null    int64   
 5   S2      442 non-null    float64 
 6   S3      442 non-null    float64 
 7   S4      442 non-null    float64 
 8   S5      442 non-null    float64 
 9   S6      442 non-null    int64   
 10  Y       442 non-null    int64   
dtypes: category(1), float64(6), int64(4)
memory usage: 35.2 KB


###### Next, examine the dataframe

In [18]:
# Peek at the dataframe with pandas' describe().
# include="all" also reports non-numeric columns (here the categorical SEX);
# statistics that do not apply to a column's dtype come back as NaN.
# The result is kept in "dfDescription" so that it can be printed.
dfDescription = df.describe(include="all")
print(dfDescription)

               AGE    SEX         BMI          BP          S1          S2  \
count   442.000000  442.0  442.000000  442.000000  442.000000  442.000000   
unique         NaN    2.0         NaN         NaN         NaN         NaN   
top            NaN    1.0         NaN         NaN         NaN         NaN   
freq           NaN  235.0         NaN         NaN         NaN         NaN   
mean     48.518100    NaN   26.375792   94.647014  189.140271  115.439140   
std      13.109028    NaN    4.418122   13.831283   34.608052   30.413081   
min      19.000000    NaN   18.000000   62.000000   97.000000   41.600000   
25%      38.250000    NaN   23.200000   84.000000  164.250000   96.050000   
50%      50.000000    NaN   25.700000   93.000000  186.000000  113.000000   
75%      59.000000    NaN   29.275000  105.000000  209.750000  134.500000   
max      79.000000    NaN   42.200000  133.000000  301.000000  242.400000   

                S3          S4          S5          S6           Y  
count 

###### Split dataframe into train and test subsets

In [19]:
# Hold out 30% of the rows as a test set; random_state=42 seeds the shuffle
# used for the split.
# Later cells refer to the two subsets as "df_train" and "df_test".
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

###### Fit a multiple linear OLS regression model using the training dataset and save the result in the 'est_train' variable. 
Print the model summary.

In [20]:
# Regress Y on all ten predictors using the training rows only, then print
# the full statsmodels summary table.
full_formula = "Y ~ AGE + SEX + BMI + BP + S1 + S2 + S3 + S4 + S5 + S6"
est_train = ols(formula=full_formula, data=df_train).fit()
full_summary = est_train.summary()
print(full_summary)

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.524
Model:                            OLS   Adj. R-squared:                  0.508
Method:                 Least Squares   F-statistic:                     32.86
Date:                Mon, 09 Oct 2023   Prob (F-statistic):           1.37e-42
Time:                        13:05:48   Log-Likelihood:                -1671.5
No. Observations:                 309   AIC:                             3365.
Df Residuals:                     298   BIC:                             3406.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -341.2349     78.615     -4.341      0.0

###### Drop the non-significant coefficients (keep those with p < .05: SEX + BMI + S3 + S5), then rerun the model.

In [22]:
# Refit on the reduced predictor set and show only the fitted coefficients.
# The result replaces the full model in `est_train`; the later prediction
# cell reads that name.
# NOTE(review): the section heading lists S3, whereas this formula uses BP —
# confirm against the p-values in the full-model summary.
reduced_formula = "Y ~ SEX + BMI + BP + S5"
est_train = ols(formula=reduced_formula, data=df_train).fit()
print(est_train.params)

Intercept   -332.234893
SEX[T.2]     -12.866148
BMI            7.149076
BP             1.148400
S5            41.313402
dtype: float64


###### How well does it do on the test data? Let's use the model we trained on the training data to make predictions on the test data and then measure the R^2.

In [27]:
# Score the fitted model on the held-out rows: predict Y for df_test and
# compare the predictions with the observed values.
# The score must be stored in "r2".
observed_y = df_test['Y']
test_pred = est_train.predict(df_test)
r2 = r2_score(observed_y, test_pred)
print('OOS R-squared: ' + str(r2))


OOS R-squared: 0.48587882336937593
