In [1]:
import sys


sys.path.append('/Users/samrelins/Documents/LIDA/dental_project/src/')

from edrecs_context_data_prep import *
import os
import pandas as pd
import plotly.express as px
from patsy import dmatrices
import statsmodels.api as sm


In [2]:

# Read the dental epidemiology survey CSV (the "dsurvey_5yo" extract) into a
# DataFrame, then summarise its columns, dtypes and non-null counts.
dental_dir = "/Users/samrelins/Documents/LIDA/dental_project/data/bib_data/dental"
epi_path = os.path.join(dental_dir, "dsurvey_5yo/data.csv")
epi_data = pd.read_csv(filepath_or_buffer=epi_path)

epi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   entity_id        354 non-null    object 
 1   has_dsurvey_5yo  354 non-null    float64
 2   AgeMths          354 non-null    float64
 3   ddsdt            354 non-null    float64
 4   ddsmt            354 non-null    float64
 5   ddsft            354 non-null    float64
 6   ddsdmft          354 non-null    float64
 7   ddscareindex     354 non-null    float64
dtypes: float64(7), object(1)
memory usage: 22.2+ KB


In [3]:
# Read the education-records ("edrecs") context CSV, then pass it through the
# preparation helpers from edrecs_context_data_prep one after another.
# Each helper receives the frame returned by the previous one, so the order
# below is the order in which they are applied.
ed_dir = "/Users/samrelins/Documents/LIDA/dental_project/data/bib_data/education"
context_path = os.path.join(ed_dir, "context/data.csv")
context_data = pd.read_csv(context_path)
for prep_step in (clean_context_data,
                  convert_sen_categories,
                  convert_ethnicity_categories,
                  convert_context_numerical_categories,
                  rename_context_cols):
    context_data = prep_step(context_data)

context_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11711 entries, 0 to 11710
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   entity_id               11711 non-null  object  
 1   has_edrecs_context      11711 non-null  int16   
 2   year_started_school     11711 non-null  category
 3   birthday_academic_term  11711 non-null  category
 4   eal                     11711 non-null  category
 5   ethnicity               11711 non-null  category
 6   fsm                     11711 non-null  category
 7   gifted                  11711 non-null  category
 8   gender                  11711 non-null  category
 9   looked_after            11711 non-null  category
 10  sen                     11711 non-null  category
 11  has_edcont              11711 non-null  int16   
dtypes: category(9), int16(2), object(1)
memory usage: 241.3+ KB


In [4]:
# Left-join the context variables onto the survey records by entity_id, then
# discard every row that still contains a missing value in any column
# (this includes survey records with no matching context row).
epi_data = (epi_data
            .merge(context_data, how="left", on="entity_id")
            .dropna())
epi_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 352 entries, 0 to 353
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   entity_id               352 non-null    object  
 1   has_dsurvey_5yo         352 non-null    float64 
 2   AgeMths                 352 non-null    float64 
 3   ddsdt                   352 non-null    float64 
 4   ddsmt                   352 non-null    float64 
 5   ddsft                   352 non-null    float64 
 6   ddsdmft                 352 non-null    float64 
 7   ddscareindex            352 non-null    float64 
 8   has_edrecs_context      352 non-null    float64 
 9   year_started_school     352 non-null    category
 10  birthday_academic_term  352 non-null    category
 11  eal                     352 non-null    category
 12  ethnicity               352 non-null    category
 13  fsm                     352 non-null    category
 14  gifted                  35

In [5]:
# Columns that are not wanted as inputs to the linear regression; after this
# step only the target (ddsdmft) and the categorical context columns remain.
drop_cols = [
    "entity_id",
    "has_dsurvey_5yo",
    "ddsdt",
    "ddsmt",
    "ddsft",
    "ddscareindex",
    "has_edrecs_context",
    "has_edcont",
    "AgeMths",
]

epi_data = epi_data.drop(columns=drop_cols)

epi_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 352 entries, 0 to 353
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   ddsdmft                 352 non-null    float64 
 1   year_started_school     352 non-null    category
 2   birthday_academic_term  352 non-null    category
 3   eal                     352 non-null    category
 4   ethnicity               352 non-null    category
 5   fsm                     352 non-null    category
 6   gifted                  352 non-null    category
 7   gender                  352 non-null    category
 8   looked_after            352 non-null    category
 9   sen                     352 non-null    category
dtypes: category(9), float64(1)
memory usage: 9.6 KB


In [6]:
# Every remaining column except the target becomes a term in the formula.
features = epi_data.columns.drop("ddsdmft")

# Build a patsy formula of the form "ddsdmft ~ col_a + col_b + ...".
patsy_string = "ddsdmft ~ " + " + ".join(features)

# dmatrices returns the response (y) and design matrix (X) as DataFrames.
y, X = dmatrices(patsy_string, data=epi_data, return_type="dataframe")

# Ordinary least squares fit of ddsdmft on the design matrix.
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                ddsdmft   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.079
Method:                 Least Squares   F-statistic:                     3.501
Date:                Fri, 25 Jun 2021   Prob (F-statistic):           6.72e-05
Time:                        11:32:04   Log-Likelihood:                -856.02
No. Observations:                 352   AIC:                             1738.
Df Residuals:                     339   BIC:                             1788.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

  return np.sqrt(eigvals[0]/eigvals[-1])


In [9]:
# In-sample predictions: evaluate the fitted model on the same design matrix
# (X) that it was fitted on, then display the resulting series.
preds = res.predict(exog=X)
preds

0      1.223638
1      2.075952
2      1.953225
3      1.744088
4      1.749168
         ...   
349    3.410288
350    0.758706
351    3.410288
352    2.245691
353    3.410288
Length: 352, dtype: float64