In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

## Converts DHS Individual Recode (ir) file into sibling dataset with one row
## for each sibling reported.
create_sib_data <- function(ir, country, survyear, strata="v022"){

  ## caseid   "case identification"
  ## v000     "country code and phase"
  ## v001     "cluster number"
  ## v002     "household number"
  ## v003     "respondent's line number"
  ## v004     "ultimate area unit"
  ## v005     "women's individual sample weight (6 decimals)"
  ## v006     "month of interview"
  ## v007     "year of interview"
  ## v008     "date of interview (cmc)"
  ## v008a    "date of interview century day code (cdc)"
  ## v009     "respondent's month of birth"
  ## v010     "respondent's year of birth"
  ## v011     "date of birth (cmc)"
  ## v012     "respondent's current age"
  ## v013     "age in 5-year groups"
  ## v014     "completeness of age information"
  ## v015     "result of individual interview"
  ## v016     "day of interview"
  ## v017     "cmc start of calendar"
  ## v018     "row of month of interview"
  ## v019     "length of calendar"
  ## v019a    "number of calendar columns"
  ## v020     "ever-married sample"
  ## v021     "primary sampling unit"
  ## v022     "sample strata for sampling errors"
  ## v023     "stratification used in sample design"
  ## v024     "region"
  ## v025     "type of place of residence"
  ## v026     "na - de facto place of residence"
  ## v027     "number of visits"
  ## v028     "interviewer identification"
  ## v029     "na - keyer identification"
  ## v030     "field supervisor"

  ## mmidx_01 "index to maternal mortality"
  ## mm1_01   "sex of sibling"
  ## mm2_01   "survival status of sibling"
  ## mm3_01   "sibling's current age"
  ## mm4_01   "sibling's date of birth (cmc)"
  ## mm5_01   "na - sibling's marital status"
  ## mm6_01   "years since sibling died"
  ## mm7_01   "sibling's age at death"
  ## mm8_01   "date of death of sibling (cmc)"
  ## mm9_01   "sibling's death and pregnancy"
  ## mm10_01  "na - death and a pregnancy are related"
  ## mm11_01  "na - cause of death sibling"
  ## mm12_01  "na - amount of time between sibling's delivery and death"
  ## mm13_01  "na - place of sibling's death"
  ## mm14_01  "number of sibling's children"
  ## mm15_01  "na - sibling's year of death"


  ir$country <- country
  ir$survyear <- survyear
  ir$stratum <- do.call(paste, ir[strata])

  widevars <- c("caseid", "country", "survyear", "stratum",
                "v001", "v002", "v003", "v005", "v007", "v008", "v011", "v012", "v013", "v021", "v022", "v023", "v024", "v025",
                grep("mmc", names(ir), value=TRUE))
  longvars <- grep("^mm[idx0-9]", names(ir), value=TRUE)
  varying <- tapply(longvars, sub("(.*)_.*", "\\1", longvars), c)

  ir <- ir[c(widevars, longvars)]

  ## Reshape wide to long
  sib <- reshape(ir[c(widevars, unlist(longvars))],
                 varying = varying,
                 v.names = names(varying),
                 idvar = "caseid",
                 timevar = "mmidx",
                 direction = "long")

  ## Drop empty entries
  sib <- subset(sib, !is.na(mmidx))


  ## Recode variables

  if(is.null(sib$mm15))
    sib$mm15 <- rep(NA, nrow(sib))

  if(is.factor(sib$v007)) sib$v007 <- as.integer(as.character(sib$v007))
  sib$v007 <- with(sib, ifelse(v007 < 100, 1900+v007, v007))
  sib$v007 <- with(sib, ifelse(is.na(v007), 1900+floor(v008/12), v007))

  sib$v023 <- factor(sib$v023)

  if(class(sib$mmc5) == "integer")
    sib$mmc5 <- factor(sib$mmc5, c(10, 12, 13, 15), c("10 years", "12 years", "13 years", "15 years"))

  sib$intvy <- sib$v007
  sib$intvcmc <- sib$v008

  mm1 <- sib$mm1
  mm1 <- replace(mm1, mm1==1, "male")
  mm1 <- replace(mm1, mm1==2, "female")
  sib$sex <- factor(tolower(mm1), c("male", "female"))

  mm2 <- sib$mm2
  mm2 <- replace(mm2, mm2==0, "dead")
  mm2 <- replace(mm2, mm2==1, "alive")
  sib$deaths <- factor(tolower(mm2), c("dead", "alive")) == "dead"

  sib$weight <- sib$v005/1e6

  sib$sibdob <- sib$mm4
  sib$sibdod <- sib$mm8

  return(sib)
}


create_mics_sib_data <- function(mm, country, survyear, strata=c("hh7", "hh6")){

  ## HH1      "Cluster number"
  ## HH2      "Household number"
  ## LN       "Line number"
  ## MMLN     "Sibling's line number"
  ## MM5      "Sibling's gender"
  ## MM6      "Sibling still alive"
  ## MM7      "Age of sibling"
  ## MM8      "Years since death"
  ## MM9      "Age at death of sibling"
  ## MM10     "Pregnant when died"
  ## MM11     "Died during the childbirth"
  ## MM12     "Died within two months after the end of a pregnancy or childbirth"
  ## MM13     "Number of live born children during lifetime"
  ## HH6      "Area"
  ## HH7      "Province"
  ## WDOI     "Date of interview women (CMC)"
  ## WDOB     "Date of birth of woman (CMC)"
  ## MM7C     "Imputed date of birth"
  ## MM8C     "Imputed date of death"
  ## welevel  "Education"
  ## wmweight "Women's sample weight"
  ## wscore   "Combined wealth score"
  ## windex5  "Wealth index quintile"
  ## wscoreu  "Urban wealth score"
  ## windex5u "Urban wealth index quintile"
  ## wscorer  "Rural wealth score"
  ## windex5r "Rural wealth index quintile"

  sib <- do.call(data.frame, mm)
  names(sib) <- tolower(names(sib))  # some inconsistency in capitalisation of variable names

  sib$country <- country
  sib$survyear <- survyear
  sib$stratum <- do.call(paste, sib[strata])

  sib$intvcmc <- sib$wdoi
  sib$intvy <- floor(sib$wdoi/12 + 1900)
  sib$sex <- factor(as.integer(sib$mm5), 1:2, c("male", "female"))
  sib$sibdob <- sib$mm7c
  sib$sibdod <- sib$mm8c

  sib$deaths <- as.integer(factor(as.integer(sib$mm6), 1:2) == 2)

  sib$weight <- sib$wmweight

  return(sib)
}



#' Calculate age-specific mortality rates in period preceding survey.
#'
#' Should replicate mortality rates reported in DHS reports.
#'
#' @param sib A dataset as `data.frame`.
#' @param period Interval 'period' defined in the months before the survey.
#'
#' @importFrom survival Surv
#' @importFrom survival survSplit
calc_dhs_mx <- function(sib, period=c(0, 84)){

  ## drop if missing survival status or sex
  sib <- subset(sib, sex %in% c("male", "female") & !is.na(deaths))

  ## define episode
  sib$deaths <- with(sib, ifelse(deaths & v008-period[1] < mm8+1, FALSE, deaths))
  sib$agecens <- with(sib, pmin(v008-period[1], mm8+1, na.rm=TRUE) - mm4)  # add 1 to DOD to go to month end
  sib$agestart <- sib$v008 - period[2] - sib$mm4

  sibspl <- survSplit(Surv(agestart, agecens, deaths)~.,
                      subset(sib, agecens > agestart),
                      cut=3:10*5*12, episode="agegr")

  sibspl$agegr <- factor(sibspl$agegr, 2:8, 3:9*5)
  sibspl <- subset(sibspl, !is.na(agegr))
  sibspl$pys <- (sibspl$agecens - sibspl$agestart)/12

  mx <- aggregate(weight*cbind(deaths, pys) ~ agegr + sex + survyear + country, sibspl, sum)
  mx$mx <- mx$deaths / mx$pys

  return(mx)
}

## Calculate 35q15 from output of calc_dhs_mx().
## Should replicate 35q15 reported in DHS reports.
calc_dhs_35q15 <- function(mx){
  qx <- aggregate(cbind(px=exp(-5*mx)) ~ survyear+sex+country, mx, prod)
  qx$qx <- 1-qx$px
  return(qx)

}


#' Create episode dataset split by period, age group, and time preceding survey indicator (TIPS)
#'
#' @param dat A dataset as `data.frame`.
#' @param period Numeric vector defining calendar periods to stratify analysis, use `NULL` for no periods.
#' @param agegr Numeric vector defining ages *in years* for splits.
#' @param tips Break points for TIme Preceding Survey.
#' @param dobvar Variable name for date of birth (character string).
#' @param dodvar Variable name for date of death (character string).
#'
#' @importFrom survival Surv
#' @importFrom survival survSplit
create_tips_data <- function(dat, period=do.call(seq.int, as.list(range(dat$intvy)+c(-16, 1))), agegr = 3:12*5, tips = 0:15, dobvar="sibdob", dodvar="sibdod"){

  ## drop if missing survival status or sex
  dat <- subset(dat, sex %in% c("male", "female") & !is.na(deaths))

  ## split into annual TIPS episodes
  ## (Note: splitting TIPS first to eliminate as much person time as posdatle for each person for efficiency)
  dat$tstart <- dat[[dobvar]] # episode starts at child dob
  dat$tcens <- pmin(dat[[dodvar]]+1, dat$intvcmc, na.rm=TRUE) # episode ends at either date of death or interview
  dat$tipsstart <- with(dat, tstart - intvcmc)
  dat$tipscens <- with(dat, tcens - intvcmc)
  dat <- survSplit(Surv(tipsstart, tipscens, deaths)~., subset(dat, tipscens > tipsstart), cut=-tips*12, episode="tips", start="tipsstart", end="tipscens")
  dat$tips <- c(NA, rev(tips[-length(tips)]), NA)[dat$tips]
  dat <- subset(dat, !is.na(tips))

  dat$tstart <- with(dat, tipsstart + intvcmc)
  dat$tcens <- with(dat, tipscens + intvcmc)

  ## define time episode and split
  dat <- survSplit(Surv(tstart, tcens, deaths)~., subset(dat, tcens > tstart), cut=(period-1900)*12, episode="period", start="tstart", end="tcens")
  dat$period <- c(NA, period[-length(period)], NA)[dat$period]
  dat <- subset(dat, !is.na(period))

  ## split into age groups
  dat$agestart <- dat$tstart - dat[[dobvar]]
  dat$agecens <- dat$tcens - dat[[dobvar]]
  dat <- survSplit(Surv(agestart, agecens, deaths)~., dat, cut=agegr*12, episode="agegr", start="agestart", end="agecens")
  dat$agegr <- c(NA, agegr[-length(agegr)], NA)[dat$agegr]
  dat <- subset(dat, !is.na(agegr))

  ## calculate PYs
  dat$pys <- with(dat, (agecens - agestart)/12)

  return(dat)
}

In [None]:
#' Calculate the probability of dying between age x and x+n (nqx)
#'
#' Default arguments are configured to calculate under 5 mortality
#' from a DHS Births Recode file.
#'
#' @param data A dataset (data.frame), for example a DHS births recode (BR) dataset.
#' @param by A formula specifying factor variables by which to stratify analysis.
#' @param agegr Numeric vector defining ages *in years* for splits.
#' @param period Numeric vector defining calendar periods to stratify analysis, use `NULL` for no periods.
#' @param cohort Numeric vector defining birth cohorts to stratify analysis, use `NULL` for no cohort stratification.
#' @param tips Break points for TIme Preceding Survey.
#' @param clusters Formula or data frame specifying cluster ids from largest level to smallest level, ‘~0’ or ‘~1’ is a formula for no clusters.
#' @param strata Formula or vector specifying strata, use ‘NULL’ for no strata.
#' @param weight Formula or vector specifying sampling weights.
#' @param dob Variable name for date of birth (character string).
#' @param dod Variable name for date of death (character string).
#' @param death Variable name for event variable (character string).
#' @param intv Variable name for interview date (character string).
#' @param varmethod Method for variance calculation. Currently "lin" for Taylor
#'   linearisation or "jk1" for unstratified jackknife, or "jkn", for stratified
#'   jackknife.
#' @param origin Origin year for date arguments. 1900 for CMC inputs.
#' @param scale Scale for dates inputs to calendar years. 12 for CMC inputs.
#'
#' @examples
#'
#' data(zzbr)
#' zzbr$death <- zzbr$b5 == "no"  # b5: child still alive ("yes"/"no")
#' zzbr$dod <- zzbr$b3 + zzbr$b7 + 0.5
#'
#' ## Calculate 5q0 from birth history dataset.
#' ## Note this does NOT exactly match DHS calculation.
#' ## See calc_dhs_u5mr().
#' u5mr <- calc_nqx(zzbr)
#' u5mr
#'
#' ## Retrieve sample covariance and correlation
#' vcov(u5mr)  # sample covariance
#' cov2cor(vcov(u5mr))  # sample correlation
#'
#' ## 5q0 by sociodemographic characteristics
#' calc_nqx(zzbr, by=~v102) # by urban/rural residence
#' calc_nqx(zzbr, by=~v190, tips=c(0, 10)) # by wealth quintile, 0-9 years before
#' calc_nqx(zzbr, by=~v101+v102, tips=c(0, 10)) # by region and residence
#'
#' ## Compare unstratified standard error estiamtes for linearization and jackknife
#' calc_nqx(zzbr, varmethod = "lin")  # unstratified design
#' calc_nqx(zzbr, strata=NULL, varmethod = "lin")  # unstratified design
#' calc_nqx(zzbr, strata=NULL, varmethod = "jk1")  # unstratififed jackknife
#' calc_nqx(zzbr, varmethod = "jkn")  # stratififed jackknife
#'
#' ## Calculate various child mortality indicators (neonatal, infant, etc.)
#' calc_nqx(zzbr, agegr=c(0, 1)/12)  # neonatal
#' calc_nqx(zzbr, agegr=c(1, 3, 5, 12)/12) # postneonatal
#' calc_nqx(zzbr, agegr=c(0, 1, 3, 5, 12)/12) # infant (1q0)
#' calc_nqx(zzbr, agegr=c(12, 24, 36, 48, 60)/12) # child (4q1)
#' calc_nqx(zzbr, agegr=c(0, 1, 3, 5, 12, 24, 36, 48, 60)/12) # u5mr (5q0)
#'
#' ## Calculate annaul 5q0 by calendar year
#' calc_nqx(zzbr, period=2005:2015, tips=NULL)
#'
#' @export
#' @md
calc_nqx <- function(data,
                     by = NULL,
                     agegr = c(0, 1, 3, 5, 12, 24, 36, 48, 60)/12,
                     period = NULL,
                     cohort = NULL,
                     tips = c(0, 5, 10, 15),
                     clusters=~v021,
                     strata=~v024+v025,
                     weight= "v005",
                     dob="b3",
                     dod="dod",
                     death="death",
                     intv = "v008",
                     varmethod = "lin",
                     origin=1900,
                     scale=12){

  data$tstop <- ifelse(data[[death]], data[[dod]], data[[intv]])

  data$dob <- data[[dob]]
  data$death <- data[[death]]
  data$intv <- data[[intv]]
  data$weights <- data[[weight]] / mean(data[[weight]])

  if(is.null(by))
    by <- ~1

  vars <- unique(unlist(lapply(c(by, strata, clusters), all.vars)))
  f <- formula(paste("~", paste(vars, collapse = "+")))
  mf <- model.frame(f, data=data, na.action=na.pass, death=death,
                    weights=weights, dob=dob, intv=intv, tstop=tstop)

  aggr <- demog_pyears(f, mf, period=period, agegr=agegr, tips=tips, event="(death)",
                       tstart="(dob)", tstop="(tstop)", weights="(weights)",
                       origin=origin, scale=scale)$data

  ## All values of factor combinations that appear
  byvar <- intersect(c(all.vars(by), "agegr", "period", "cohort", "tips"),
                     names(aggr))
  aggr$byf <- interaction(aggr[byvar], drop=TRUE)
  
  ## prediction for all factor levels that appear
  pred <- data.frame(aggr[c(byvar, "byf")])[!duplicated(aggr$byf),]
  pred <- pred[order(pred$byf), ]
  pred$pyears <- 1
  
  ## Matrix to aggregate piecewise-constant rates to cumulative hazards
  dfmm <- .mm_aggr(pred[byvar], agegr)
  mm <- dfmm$mm

  if(varmethod == "lin") {
    des <- survey::svydesign(ids=clusters, strata=strata, data=aggr, weights=~1)

    ## fit model
    f <- if(length(levels(aggr$byf)) == 1)
           event ~ offset(log(pyears))
         else
           event ~ -1 + byf + offset(log(pyears))

    mod <- survey::svyglm(f, des, family=quasipoisson)

    mx <- predict(mod, pred, type="response", vcov=TRUE)

    lest <- drop(mx %*% mm)
    lv <- t(mm) %*% vcov(mx) %*% mm
    est <- 
    dF <- diag(exp(-lest), length(lest))
    v <- dF %*% lv %*% dF

    estdf <- data.frame(est  = 1 - exp(-lest),
                        se   = sqrt(diag(v)),
                        ci_l = 1 - exp(-(lest - qnorm(0.975)*sqrt(diag(lv)))),
                        ci_u = 1 - exp(-(lest + qnorm(0.975)*sqrt(diag(lv)))))
    attr(estdf, "var") <- v
    
  } else if(varmethod %in% c("jkn", "jk1")) {

    ## Convert to array with events and PYs for each cluster
    ## reshape2::acast is MUCH faster than stats::reshape
    events_clust <- reshape2::acast(aggr, update(clusters, byf ~ .), value.var="event")
    pyears_clust <- reshape2::acast(aggr, update(clusters, byf ~ .), value.var="pyears")

    if(varmethod == "jkn"){
      aggr$strataid <- as.integer(interaction(aggr[all.vars(strata)], drop=TRUE))
      strataid <- drop(reshape2::acast(unique(aggr[c(all.vars(clusters), "strataid")]),
                                       update(clusters,  1 ~ .), value.var="strataid"))
    } else
      strataid <- NULL

    estdf <- jackknife(events_clust, pyears_clust, strataid, t(dfmm$mm), function(x) 1 - exp(-x))
  } else
    stop(paste0("varmethod = \"", varmethod, "\" is not recognized."))

  val <- data.frame(dfmm$df, estdf)
  attr(val, "var") <- vcov(estdf)

  rownames(val) <- NULL
  
  val
}

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')
from collections import Counter
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

data=pd.read_csv('dataset/child-mortality-dataset )

data = data.fillna(np.nan)

# Check for Null values
data.isnull().sum()

nan_cols = [i for i in data.columns if data[i].isnull().any()]
nan_cols

an_cols20 = [i for i in data.columns if data[i].isnull().sum() > 0.2*len(data)]
nan_cols20

data.describe()

data = data.fillna(data.groupby('State_Name').transform('mean'))
data

data = data.reindex(columns = [col for col in data.columns if col != 'YY_Under_Five_Mortality_Rate_U5MR_Total_Person'] + ['YY_Under_Five_Mortality_Rate_U5MR_Total_Person'])
data

 

graph = sns.heatmap(data[["YY_Under_Five_Mortality_Rate_U5MR_Total_Person","AA_Population_Total","BB_Average_Household_Size_All_Total","BB_Dependency_Ratio_Total","CC_Sex_Ratio_0_4_Years_Total"]].corr(),annot=True, fmt = ".2f", cmap = "coolwarm")



graph = sns.heatmap(data[["YY_Under_Five_Mortality_Rate_U5MR_Total_Person","DD_Person_Total","DD_Male_Total","DD_Female_Total"]].corr(),annot=True, fmt = ".2f", cmap = "coolwarm")


graph = sns.heatmap(data[["YY_Under_Five_Mortality_Rate_U5MR_Total_Person","YY_Crude_Death_Rate_Cdr_Total_Person", "YY_Infant_Mortality_Rate_Imr_Total_Person", "YY_Neo_Natal_Mortality_Rate_Total", "YY_Post_Neo_Natal_Mortality_Rate_Total"]].corr(),annot=True, fmt = ".2f", cmap = "coolwarm")

data2 = data[['State_Name', 'AA_Population_Total', 'BB_Dependency_Ratio_Total', 'DD_Person_Total', 'FF_Children_Currently_Attending_School_Age_6_17_Years_Person_Total', 'GG_Children_Aged_5_14_Years_Engaged_In_Work_Person_Total', 'LL_Crude_Birth_Rate_Cbr_Total', 'LL_Mean_Number_Of_Children_Ever_Born_To_Women_Aged_15_49_Years_Total', 'PP_Mothers_Who_Received_Any_Antenatal_Check_Up_Total', 'TT_Children_Aged_12_23_Months_Fully_Immunized_Total', 'TT_Children_Who_Did_Not_Receive_Any_Vaccination_Total', 'YY_Crude_Death_Rate_Cdr_Total_Person', 'YY_Infant_Mortality_Rate_Imr_Total_Person', 'YY_Neo_Natal_Mortality_Rate_Total', 'YY_Post_Neo_Natal_Mortality_Rate_Total', 'YY_Under_Five_Mortality_Rate_U5MR_Total_Person']].copy()
data2





SyntaxError: unterminated string literal (detected at line 13) (<ipython-input-1-13099756d716>, line 13)