In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib as plt
import statsmodels.api as sm
import ssl

ssl._create_default_https_context = ssl._create_unverified_context


## Exercise 1

In [2]:

df = pd.read_csv("https://github.com/nickeubank/MIDS_Data/blob/master/smoking.csv?raw=true")
results = {}

In [3]:
df.head(10)

Unnamed: 0,id,date,gestation,bwt.oz,parity,mrace,mage,med,mht,mpregwt,inc,smoke
0,4604,1598,148,116,7,7,28,1,66,135,2,0
1,7435,1527,181,110,7,7,27,1,64,133,1,0
2,7722,1563,204,55,11,7,35,3,65,140,6,0
3,2026,1503,225,132,4,7,28,2,67,148,3,0
4,3553,1638,233,105,4,7,34,3,61,130,3,0
5,3491,1705,234,85,7,7,33,1,67,130,2,0
6,6757,1444,234,97,0,6,26,5,65,112,6,0
7,6153,1405,235,129,3,7,24,4,66,135,1,0
8,8187,1669,236,63,0,5,24,5,58,99,7,0
9,8403,1669,241,128,0,7,17,1,64,126,2,0


## Exercise 2

In [4]:
df_225 = df.copy()
df_225 = df_225.loc[df_225["gestation"] > 225]

In [5]:
df_225.head(10)
df_225.rename(columns = {"bwt.oz":"birth_weight"}, inplace=True)

In [6]:
chart = (
    alt.Chart(
        df_225
    )
    .mark_point()
    .encode(x=alt.X("gestation", scale=alt.Scale(zero=False)), y=alt.Y("birth_weight"), color="smoke")
    .properties(
        title="Relationship between infant weight and gestation for smoking and non-smoking mothers"
    )
)
fit = chart.transform_regression(
    "gestation", "birth_weight", groupby=["smoke"]
).mark_line(color="Red")
plot = chart + fit
plot.display()

In [7]:
results['ex2_birth_weight_lower'] = 'yes'

## Exercise 3

In [8]:
x = df_225[['gestation', 'smoke']]
y = df_225[['birth_weight']]

In [9]:
model = sm.OLS(y, x)
mod_results = model.fit()
mod_results.params

gestation    0.438243
smoke       -8.046251
dtype: float64

In [11]:
results['ex3_smoking_coefficient'] = mod_results.params['smoke']
results['ex3_smoking_coefficient']

-8.046250694178998

## Exercise 4

In [15]:
import statsmodels.formula.api as smf

In [12]:
x = df_225[['gestation', 'smoke', 'mpregwt', 'mrace']]
y = df_225[['birth_weight']]

In [20]:
mod2 = smf.ols(formula='birth_weight ~ gestation + smoke + mpregwt + C(mrace)', data=df_225).fit()
mod2.params

Intercept       -17.320334
C(mrace)[T.1]    -1.844101
C(mrace)[T.2]    -5.629077
C(mrace)[T.3]    -0.524913
C(mrace)[T.4]     0.988394
C(mrace)[T.5]    -0.672108
C(mrace)[T.6]     4.308397
C(mrace)[T.7]    -6.456950
C(mrace)[T.8]    -7.176391
C(mrace)[T.9]    -1.915299
gestation         0.441683
smoke            -7.895835
mpregwt           0.139554
dtype: float64

In [21]:
results['ex4_smoking_coefficient'] = mod2.params['smoke']
results['ex4_smoking_coefficient']

-7.895835256029566

#### Exercise 5

In [25]:
# 1 - white
# 0 - non-white
# 0 - 5 -> 1
# others -> 0
df_225["mrace"].replace(range(0, 6), 1, inplace=True)
df_225["mrace"].replace(range(6, 10), 0, inplace=True)
df_225["mrace"].replace(99, 0, inplace=True)
df_225.head()

Unnamed: 0,id,date,gestation,birth_weight,parity,mrace,mage,med,mht,mpregwt,inc,smoke
4,3553,1638,233,105,4,0,34,3,61,130,3,0
5,3491,1705,234,85,7,0,33,1,67,130,2,0
6,6757,1444,234,97,0,0,26,5,65,112,6,0
7,6153,1405,235,129,3,0,24,4,66,135,1,0
8,8187,1669,236,63,0,1,24,5,58,99,7,0


#### Exercise 6

In [31]:
mod3 = smf.ols(formula='birth_weight ~ gestation + mpregwt + smoke*C(mrace)', data=df_225).fit()
mod3.summary()

0,1,2,3
Dep. Variable:,birth_weight,R-squared:,0.243
Model:,OLS,Adj. R-squared:,0.238
Method:,Least Squares,F-statistic:,54.81
Date:,"Tue, 15 Nov 2022",Prob (F-statistic):,1.91e-49
Time:,15:03:50,Log-Likelihood:,-3587.5
No. Observations:,862,AIC:,7187.0
Df Residuals:,856,BIC:,7216.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-21.8908,10.949,-1.999,0.046,-43.381,-0.401
C(mrace)[T.1],4.3761,1.575,2.778,0.006,1.284,7.468
gestation,0.4446,0.039,11.510,0.000,0.369,0.520
mpregwt,0.1319,0.026,5.132,0.000,0.081,0.182
smoke,-8.8601,2.077,-4.267,0.000,-12.936,-4.784
smoke:C(mrace)[T.1],1.0069,2.421,0.416,0.678,-3.746,5.759

0,1,2,3
Omnibus:,8.544,Durbin-Watson:,1.956
Prob(Omnibus):,0.014,Jarque-Bera (JB):,11.787
Skew:,0.071,Prob(JB):,0.00276
Kurtosis:,3.555,Cond. No.,6350.0


In [32]:

hypotheses = "mrace = 1, smoke = 1"
T_test = mod3.t_test(hypotheses)
print(T_test)

PatsyError: unrecognized token in constraint
    mrace = 1, smoke = 1
    ^

In [28]:
T_test = mod3.t_test(r) 
print(T_test)

                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0                  0          0          0      1.000           0           0
