Sri R Sankaranarayanan - Final Project - DSC530 - Texas County Health Ranking Report

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from scipy.stats import shapiro, normaltest, ttest_ind, ttest_rel

In [3]:
def Compute_Cohend(mean1, mean2, ser1, ser2, var1, title, var2=None):
    """Compute and print Cohen's d, the effect size between two groups.

    The difference in means is divided by the pooled standard deviation,
    where the pooled variance weights each group's variance by its size.

    Args:
        mean1: Mean of the first group.
        mean2: Mean of the second group.
        ser1: Observations of the first group (only its length is used).
        ser2: Observations of the second group.
        var1: Variance of the first group.
        title: Label printed in front of the result.
        var2: Variance of the second group.  When omitted it is computed
            from ``ser2`` with pandas' default sample variance.

    Returns:
        Cohen's d as a float.
    """
    diff = mean1 - mean2
    n1, n2 = len(ser1), len(ser2)

    # The second group's variance was previously left out of the pooled
    # variance; derive it from the data when the caller does not supply it.
    if var2 is None:
        var2 = pd.Series(ser2).var()

    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    d = diff / math.sqrt(pooled_var)
    print(f'{title} = {d}  \n')
    return d


def Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2, bins=10):
    """Draw two matplotlib histograms side by side for comparison.

    Args:
        subtitle: Title shown above both panels.
        xlab1: x-axis label of the left panel.
        xlab2: x-axis label of the right panel.
        ser1: Values plotted in the left panel.
        ser2: Values plotted in the right panel.
        bins: Number of bins used in both panels (default 10).
    """
    fig = plt.figure(figsize=(10, 10))
    fig.suptitle(subtitle, fontsize=14, fontweight="bold")
    fig.subplots_adjust(top=0.88, wspace=0.3)

    # Histogram 1 (left panel)
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.set_xlabel(xlab1)
    ax1.set_ylabel("Frequency")
    ax1.hist(ser1, bins=bins, color='darksalmon', edgecolor='darkred', linewidth=1)

    # Histogram 2 (right panel)
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.set_xlabel(xlab2)
    ax2.set_ylabel("Frequency")
    ax2.hist(ser2, bins=bins, color='green', edgecolor='darkblue', linewidth=1)

    return





def Sns_Kde(subtitle, xlab1, xlab2, ser1, mean1, ser2, mean2):
    """Plot two seaborn KDE curves side by side for comparison.

    Each panel shows the shaded density of one series and a dashed black
    vertical line at the supplied mean.
    """
    figure = plt.figure(figsize=(10, 10))
    figure.suptitle(subtitle, fontsize=14, fontweight="bold")
    figure.subplots_adjust(top=0.88, wspace=0.3)

    # (subplot position, x label, values, mean, fill colour) for each panel
    panels = (
        (1, xlab1, ser1, mean1, 'darksalmon'),
        (2, xlab2, ser2, mean2, 'green'),
    )
    for position, label, values, centre, colour in panels:
        axis = figure.add_subplot(1, 2, position)
        axis.set_xlabel(label)
        axis.set_ylabel("Density")
        sns.kdeplot(values, shade=True, color=colour, ax=axis)
        axis.axvline(x=centre, color="black", linestyle="--")
    return


def Cdf_Plot(subtitle, data):
    """Plot the CDF of ``data`` on a new figure titled ``subtitle``.

    A cumulative, density-normalised histogram is drawn together with a
    cumulative KDE curve.
    """
    figure = plt.figure(figsize=(10, 10))
    figure.suptitle(subtitle, fontsize=14, fontweight="bold")
    sns.distplot(
        data,
        hist_kws=dict(cumulative=True, density=True),
        kde_kws=dict(cumulative=True),
    )
    return


def Pmf_plot(subtitle, data, xlab):
    """Plot the probability mass function of ``data`` as a bar chart.

    Args:
        subtitle: Title shown above the plot.
        data: pandas Series; the relative frequency of each distinct value
            is plotted.
        xlab: x-axis label.
    """
    fig = plt.figure(figsize=(10, 10))
    fig.suptitle(subtitle, fontsize=14, fontweight="bold")
    fig.subplots_adjust(top=0.88, wspace=0.3)
    ax1 = fig.add_subplot(1, 1, 1)

    probs = data.value_counts(normalize=True)
    # x/y are passed by keyword: positional x/y is not accepted by newer
    # seaborn releases, while the keyword form works in old and new ones.
    sns.barplot(x=probs.index, y=probs.values, ax=ax1)

    # Label the axes after plotting so seaborn cannot replace the labels.
    ax1.set_xlabel(xlab)
    ax1.set_ylabel("Probability")
    return


def Heatmap_Plot(subtitle, xlab1, xlab2, data1, data2):
    """Plot two correlation heatmaps side by side.

    Both panels share a fixed colour scale from -1 to 1 centred on 0, and
    their x tick labels are rotated 45 degrees.
    """
    figure = plt.figure(figsize=(13, 13))
    figure.suptitle(subtitle, fontsize=14, fontweight="bold")
    figure.subplots_adjust(top=0.88, wspace=0.3)

    for position, label, matrix in ((1, xlab1, data1), (2, xlab2, data2)):
        axis = figure.add_subplot(1, 2, position)
        axis.set_xlabel(label)
        sns.heatmap(
            matrix,
            vmin=-1, vmax=1, center=0,
            cmap=sns.diverging_palette(20, 220, n=220),
            square=True,
            ax=axis)
        axis.set_xticklabels(
            axis.get_xticklabels(),
            rotation=45,
            horizontalalignment='right')
    return


def sns_Scatter(subtitle, xlab, ylab, x_val, y_val, data):
    """Draw a seaborn scatter plot of two columns of ``data``.

    Args:
        subtitle: Title shown above the plot.
        xlab: x-axis label.
        ylab: y-axis label.
        x_val: Name of the column plotted on the x axis.
        y_val: Name of the column plotted on the y axis.
        data: DataFrame holding both columns.
    """
    fig = plt.figure(figsize=(10, 10))
    fig.suptitle(subtitle, fontsize=14, fontweight="bold")
    fig.subplots_adjust(top=0.88, wspace=0.3)

    ax1 = fig.add_subplot(1, 1, 1)
    sns.scatterplot(x=x_val, y=y_val, data=data, ax=ax1)

    # Labels are applied after plotting so they are the final ones shown.
    # ylab previously went through set_xlabel, which overwrote the x label
    # and left the y axis unlabelled.
    ax1.set_xlabel(xlab)
    ax1.set_ylabel(ylab)
    return


def sns_Lmplot(subtitle, x_val, y_val, data):
    """Plot the observations together with a fitted regression line.

    ``sns.lmplot`` is a figure-level function that creates its own figure,
    so the title is attached to that figure instead of to a separately
    created (and otherwise empty) one.

    Args:
        subtitle: Title shown above the plot.
        x_val: Name of the column plotted on the x axis.
        y_val: Name of the column plotted on the y axis.
        data: DataFrame holding both columns.
    """
    grid = sns.lmplot(x=x_val, y=y_val, data=data)
    grid.fig.suptitle(subtitle, fontsize=14, fontweight="bold")
    # Leave room above the axes so the title does not overlap the plot.
    grid.fig.subplots_adjust(top=0.88)
    return

In [None]:
# Plot styling.  Everything is set in ONE call: each sns.set() call re-applies
# the default plotting context, so a second call would silently discard the
# font sizes passed through rc in an earlier call.
sns.set(color_codes=True, rc={"font.size":15,"axes.labelsize":10})

In [None]:
# =============================================================================
# Using pg. 29 of your text as an example, compare two scenarios in your data using a
# PMF. Reminder, this isn’t comparing two variables against each other
#  – it is the same variable, but a different scenario. Almost like a filter.
#  The example in the book is first babies compared to all other babies, it
#  is still the same variable, but breaking the data out based on criteria 
#  we are exploring (Chapter 3).
# =============================================================================

In [None]:
# =============================================================================
# Read the CSV file and drop the first row, which holds the Texas-wide
# cumulative data.
# =============================================================================
tx_data = pd.read_csv("~/tx_household.csv", sep=",")
tx = pd.DataFrame(tx_data).drop(tx_data.index[0])


# Create subsets: counties with a population of at most 50,000 are treated
# as rural communities, all larger ones as urban.
rural = tx.loc[tx['Population'] <= 50000]
urban = tx.loc[tx['Population'] > 50000]

# Short names for the columns analysed below.
study_cols = ('percent_single', 'Grad_Rate', 'Math', 'Read')

# Rural
s_percent, grad, maths, read = (rural[col] for col in study_cols)

# Urban
s_percent1, grad1, maths1, read1 = (urban[col] for col in study_cols)

In [None]:
# Summary statistics (pandas describe) for the rural subset.
rural.describe()

In [None]:
# Summary statistics (pandas describe) for the urban subset.
urban.describe()

In [None]:
# =============================================================================
# 1. Is there a statistically significant difference between rural single-parent
#   households and urban single-parent households?
#   Look for counties that are considered rural (0 - 50K)
#   or urban (> 50K).
#   % single is based upon the number of households.
#   Look at the normal distribution for the percent single.
#   In Texas the rural population is greater than the urban one, and urban
#   areas also contain rural populations. These were not counted because I
#   couldn't tell from the numbers whether the rural population was counted
#   separately from the urban one.
# =============================================================================

# =============================================================================
# Create two scatter plots comparing two variables and provide your analysis
#  on correlation and causation. Remember, covariance, Pearson’s correlation, 
#  and Non- Linear Relationships should also be considered during your analysis
#  (Chapter 7).
# =============================================================================
# Rural counties: population against the share of single-parent households.
data = rural
x_val, y_val = 'Population', 'percent_single'
xlab, ylab = 'Population', 'Single Parent Household'
subtitle = 'Single Parent Households Per County Population Rural'
sns_Scatter(subtitle, xlab, ylab, x_val, y_val, data)

In [None]:

# Urban counties: population against the share of single-parent households.
data = urban
x_val, y_val = 'Population', 'percent_single'
xlab, ylab = 'Population', 'Single Parent Household'
subtitle = 'Single Parent Households Per County Population Urban'
sns_Scatter(subtitle, xlab, ylab, x_val, y_val, data)

In [None]:
# =============================================================================
# Include the other descriptive characteristics about the variables: Mean,
#  Mode, Spread, and Tails (Chapter 2)
# =============================================================================
# Mean, variance, standard deviation and mode of the share of single-parent
# households, first for rural and then for urban counties.

rs_mean, rs_var = s_percent.mean(), s_percent.var()
rs_std, rs_mode = s_percent.std(), s_percent.mode()
print(f'Rural:  Mean = {rs_mean}  Var = {rs_var}  Std = {rs_std}  Mode = {rs_mode} \n')

us_mean, us_var = s_percent1.mean(), s_percent1.var()
us_std, us_mode = s_percent1.std(), s_percent1.mode()
print(f'Urban:  Mean = {us_mean}  Var = {us_var}  Std = {us_std} Mode = {us_mode} \n')

# =============================================================================
#  Cohen's d
#
#  How large is the difference in single-parent households between rural and
#  urban counties?  The percentage of singles in rural areas within urban
#  counties could not be derived.
# =============================================================================
# These values are shared by the PDF plots and the Cohen's d computation.
ser1, ser2 = s_percent, s_percent1
mean1, mean2 = rs_mean, us_mean
var1 = rs_var
title = 'Cohens d for Single Parent Households Rural vs Urban'
cohen_d = Compute_Cohend(mean1, mean2, ser1, ser2, var1, title)

In [None]:
# =============================================================================
# Include a histogram of each of the 5 variables – in your summary and 
# analysis, identify any outliers and explain the reasoning for them being
# outliers and how you believe they should be handled (Chapter 2).   
# =============================================================================

# =============================================================================
# Plot 1 analytical distribution and provide your analysis on how it applies
#  to the dataset you have chosen (Chapter 5).
# =============================================================================

In [None]:
# Histograms and PDFs: single-parent households, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
binz = 10
xlab1 = "Single Parent Households Rural"
xlab2 = "Single Parent Households Urban"
subtitle = "Percent Single Parent Households Rural vs Urban"

Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle, xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# =============================================================================
# Create 1 CDF with one of your variables, using page 41-44 as your guide, 
# what does 
# this tell you about your variable and how does
#  it address the question you are trying to answer (Chapter 4).
# =============================================================================

# One CDF figure per group; ser1 and ser2 are the series assigned by an
# earlier cell.
cdf_inputs = (
    (" CDF -  Single Parent Households Rural", ser1),
    (" CDF -  Single Parent Households Urban", ser2),
)
for subtitle, series in cdf_inputs:
    Cdf_Plot(subtitle, series)

In [None]:
# =============================================================================
#  2. Are the high school graduation rate and average math/reading scores
#    better in urban schools than rural schools?
#     
#  There is a myth that the best schools are in suburban areas and urban and
#  rural schools are lacking in quality education.  Do the high school 
#  graduation rates and 3rd grade reading/math scores reflect this?
# =============================================================================

In [None]:
# =============================================================================
# # Compute Stats
# =============================================================================

# Graduation Rate: mean, variance, standard deviation and mode for the rural
# series (grad) and the urban series (grad1).
rg_mean = grad.mean()
rg_var = grad.var()
rg_std = grad.std()
rg_mode = grad.mode()
print(f'Rural - Grad_Rate:  Mean = {rg_mean}  Var = {rg_var}  Std = {rg_std}  Mode = {rg_mode} \n')

ug_mean = grad1.mean()
ug_var = grad1.var()
ug_std = grad1.std()
# The urban mode must come from the urban series (it was taken from the
# rural series before, so the printed urban mode was wrong).
ug_mode = grad1.mode()
print(f'Urban - Grad_Rate:  Mean = {ug_mean}  Var = {ug_var}  Std = {ug_std} Mode = {ug_mode} \n')

# Cohen's d
mean1 = rg_mean
mean2 = ug_mean
var1 = rg_var
ser1 = grad
ser2 = grad1
title = 'Cohens d for High School Graduation Rates Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

# 3rd Grade Average Reading Scores: summary statistics for the rural series
# (read) and the urban series (read1).
rr_mean, rr_var = read.mean(), read.var()
rr_std, rr_mode = read.std(), read.mode()
print('Rural - 3rd Grade Average Reading Scores')
print(f'Mean = {rr_mean}  Var = {rr_var}  Std = {rr_std} Mode = {rr_mode}\n')

ur_mean, ur_var = read1.mean(), read1.var()
ur_std, ur_mode = read1.std(), read1.mode()
print('Urban - 3rd Grade Average Reading Scores')
print(f'Mean = {ur_mean}  Var = {ur_var}  Std = {ur_std} Mode = {ur_mode} \n')

# Cohen's d
ser1, ser2 = read, read1
mean1, mean2 = rr_mean, ur_mean
var1 = rr_var
title = 'Cohens d for 3rd Grade Reading Levels Rural vs Urban'
Compute_Cohend(mean1, mean2, ser1, ser2, var1, title)

# 3rd Grade Average Math Scores: summary statistics for the rural series
# (maths) and the urban series (maths1).
rm_mean, rm_var = maths.mean(), maths.var()
rm_std, rm_mode = maths.std(), maths.mode()
print('Rural - 3rd Grade Average Math Scores')
print(f'Mean = {rm_mean}  Var = {rm_var}  Std = {rm_std} Mode = {rm_mode} \n')

um_mean, um_var = maths1.mean(), maths1.var()
um_std, um_mode = maths1.std(), maths1.mode()
print('Urban - 3rd Grade Average Math Scores')
print(f'Mean = {um_mean}  Var = {um_var}  Std = {um_std} Mode = {um_mode}\n')

# Cohen's d
ser1, ser2 = maths, maths1
mean1, mean2 = rm_mean, um_mean
var1 = rm_var
title = 'Cohens d for 3rd Grade Math Levels Rural vs Urban'
Compute_Cohend(mean1, mean2, ser1, ser2, var1, title)

In [None]:
# Descriptive statistics for the rural school measures.

hs_df_rural = rural.loc[:, ['Grad_Rate', 'Math', 'Read']]
hs_df_rural.describe()

In [None]:
# Descriptive statistics for the urban school measures.
hs_df_urban = urban.loc[:, ['Grad_Rate', 'Math', 'Read']]
hs_df_urban.describe()

In [None]:

# =============================================================================
# Include a histogram of each of the 5 variables – in your summary and 
# analysis, identify any outliers and explain the reasoning for them being
# outliers and how you believe they should be handled (Chapter 2).   
# =============================================================================
# Histogram for distribution - Placed these histograms and KDEs here because they
# use the raw data not the normalized data. Did this so I could compare the 
# actual values and not the normalized values.
# =============================================================================

# =============================================================================
# # Graduation Rates Rural vs Urban
# =============================================================================
# Raw (un-normalised) graduation rates, rural on the left, urban on the right.
ser1, ser2 = grad, grad1
mean1, mean2 = rg_mean, ug_mean
xlab1 = "Households - Rural"
xlab2 = "Households - Urban"
subtitle = "High School Graduation Rates for Rural vs Urban"
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDF
Sns_Kde(subtitle, xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# =============================================================================
# Using pg. 29 of your text as an example, compare two scenarios in your data using a
# PMF. Reminder, this isn’t comparing two variables against each other
#  – it is the same variable, but a different scenario. Almost like a filter.
#  The example in the book is first babies compared to all other babies, it
#  is still the same variable, but breaking the data out based on criteria 
#  we are exploring (Chapter 3).
# =============================================================================

In [None]:
# PMF of the graduation rate, one figure per group.
pmf_inputs = (
    ('PMF Plot for High School Graduation Rates - Rural', grad,
     'Rural High School Graduation Rates '),
    ('PMF Plot for High School Graduation Rates  - Urban', grad1,
     'Urban High School Graduation Rates '),
)
for subtitle, data, xlab in pmf_inputs:
    Pmf_plot(subtitle, data, xlab)

In [None]:
# =============================================================================
# # Histograms and PDFs 3rd grade Reading levels
# =============================================================================
# Raw 3rd grade reading scores, rural on the left, urban on the right.
ser1, ser2 = read, read1
mean1, mean2 = rr_mean, ur_mean
xlab1 = "Households - Rural"
xlab2 = "Households - Urban"
subtitle = "3rd Grade Reading Levels for Rural vs Urban"
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)


# KDE/PDF
Sns_Kde(subtitle, xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:

# PMF of the 3rd grade reading scores, one figure per group.
pmf_inputs = (
    ('PMF Plot for 3rd Grade Average Reading Scores - Rural', read,
     'Rural Reading Scores '),
    ('PMF Plot for 3rd Grade Average Reading Scores  - Urban', read1,
     'Urban Reading Scores '),
)
for subtitle, data, xlab in pmf_inputs:
    Pmf_plot(subtitle, data, xlab)

In [None]:
# =============================================================================
# # Histograms  and PDFs 3rd grade Math levels
# =============================================================================
# Raw 3rd grade math scores, rural on the left, urban on the right.
subtitle = "3rd Grade Math Levels for Rural vs Urban"
xlab1 = "Households - Rural"
xlab2 = "Households - Urban"
mean1 = rm_mean
# Urban MATH mean: the urban reading mean (ur_mean) was used here before,
# which put the mean line of the urban KDE panel in the wrong place.
mean2 = um_mean
ser1 = maths
ser2 = maths1
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# PMF of the 3rd grade math scores, one figure per group.
pmf_inputs = (
    ('PMF Plot for Math Scores - Rural', maths, 'Rural Math Scores '),
    ('PMF Plot for Math Scores - Urban', maths1, 'Urban Math Scores '),
)
for subtitle, data, xlab in pmf_inputs:
    Pmf_plot(subtitle, data, xlab)

In [None]:
# =============================================================================
# Conduct Correlation Tests - Chapter 9
# =============================================================================
# what if any is the correlation between 3rd grade levels and high school
# graduation
# need to normalize the data
# normalize the data  sklearn

# Min-max scale the three school measures separately for each group, then
# print the Pearson, Spearman and Kendall correlation matrices.
school_cols = ['Read', 'Math', 'Grad_Rate']

scaler = MinMaxScaler()
rural_school = rural[school_cols].copy()
urban_school = urban[school_cols].copy()

# The scaler is re-fitted for each group by fit_transform.
rural_school[school_cols] = scaler.fit_transform(rural_school[school_cols])
urban_school[school_cols] = scaler.fit_transform(urban_school[school_cols])

# Pearson
corr_rural1 = rural_school.corr(method="pearson")
corr_urban1 = urban_school.corr(method="pearson")
print("Pearson correlation coefficient Rural :")
print(corr_rural1, "\n")
print("Pearson correlation coefficient Urban:")
print(corr_urban1, "\n")

# Spearman
corr_rural2 = rural_school.corr(method="spearman")
corr_urban2 = urban_school.corr(method="spearman")
print("Spearman correlation coefficient Rural :")
print(corr_rural2, "\n")
print("Spearman correlation coefficient Urban :")
print(corr_urban2, "\n")

# Kendall's Tau
corr_rural3 = rural_school.corr(method="kendall")
corr_urban3 = urban_school.corr(method="kendall")
print("Kendall Tau correlation coefficient Rural :")
print(corr_rural3, "\n")
print("Kendall Tau correlation coefficient Urban:")
print(corr_urban3, "\n")

In [None]:
# Linear relationships: one regression plot per (title, x, y, group).
#  * x/y/data are passed by keyword; the positional form is not accepted by
#    newer seaborn releases.
#  * Each title is drawn on the figure that lmplot creates (the titles were
#    previously assigned but never shown).
#  * The two Grad_Rate-vs-Math plots are titled "Math Scores" (they were
#    mislabelled "Reading Scores").
lm_specs = [
    ('Graduation Rates and Math Scores - Rural', 'Grad_Rate', 'Math', rural_school),
    ('Graduation Rates and Math Scores - Urban', 'Grad_Rate', 'Math', urban_school),
    ('Graduation Rates and Reading Scores - Rural', 'Grad_Rate', 'Read', rural_school),
    ('Graduation Rates and Reading Scores - Urban', 'Grad_Rate', 'Read', urban_school),
    ('Math and Reading Scores - Rural', 'Math', 'Read', rural_school),
    ('Math and Reading Scores - Urban', 'Math', 'Read', urban_school),
]

for subtitle, x_val, y_val, data in lm_specs:
    grid = sns.lmplot(x=x_val, y=y_val, data=data)
    grid.fig.suptitle(subtitle, fontsize=14, fontweight="bold")
    # Leave room above the axes so the title does not overlap the plot.
    grid.fig.subplots_adjust(top=0.9)

In [None]:
#Include a histogram of each of the 5 variables – in your summary and analysis, identify
#any outliers and explain the reasoning for them being outliers and how you believe they
#should be handled (Chapter 2).

In [None]:
# =============================================================================
# 3. Is there a correlation between life expectancy and poor health, 
#  lack of sleep, no insurance, housing costs, access to food, and 
#  availability of doctors? 
#
# 4. Is there a correlation between poor health and availability of doctors, 
#  lack of sleep,  no insurance, housing costs, and access to food? 
# =============================================================================

In [None]:
# Histograms and PDFs: physicians per county, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
ser1, ser2 = rural['Physicians'], urban['Physicians']
mean1, mean2 = ser1.mean(), ser2.mean()
xlab1, xlab2 = "Rural", "Urban"
subtitle = "Number of Physicians per County Rural vs Urban"
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle, xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for Physicians, rural vs urban.
# mean1/mean2 must hold the group MEANS (they were assigned the variances
# before), and var1 must be the rural Physicians variance (it was never set
# in this cell, so a value left over from another variable was used).
ser1 = rural.Physicians
ser2 = urban.Physicians
mean1 = ser1.mean()
mean2 = ser2.mean()
var1 = ser1.var()
title = 'Cohens d for Physicians Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of Life_Expectancy, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "Life_Expectancy Rural vs Urban"
ser1 = rural['Life_Expectancy']

ser2 = urban.Life_Expectancy
mean1 = rural.Life_Expectancy.mean()
mean2 = urban.Life_Expectancy.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for Life_Expectancy, rural vs urban.
# Every input is derived from the Life_Expectancy columns here.  The variance
# was computed but never stored before, so var1 still held the variance of
# a different variable.
ser1 = rural.Life_Expectancy
ser2 = urban.Life_Expectancy
mean1 = ser1.mean()
mean2 = ser2.mean()
var1 = ser1.var()
title = 'Cohens d for Life Expectancy Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of Population, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "Population Rural vs Urban"
ser1 = rural['Population']

ser2 = urban.Population
mean1 = rural.Population.mean()
mean2 = urban.Population.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for Population, rural vs urban.
# Every input is derived from the Population columns here.  The variance was
# computed but never stored before, so var1 still held the variance of a
# different variable.
ser1 = rural.Population
ser2 = urban.Population
mean1 = ser1.mean()
mean2 = ser2.mean()
var1 = ser1.var()
title = 'Cohens d for Population Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of poor_health, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
# NOTE(review): the subtitle reads "poor_health vs Urban" - "Rural" looks
# like it is missing; confirm before changing the displayed text.
subtitle = "poor_health vs Urban"
ser1 = rural['poor_health']

ser2 = urban.poor_health
mean1 = rural.poor_health.mean()
mean2 = urban.poor_health.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for poor_health.  Only the rural variance is computed here;
# mean1/mean2 and ser1/ser2 are whatever the previously run cell left behind
# (the poor_health values when the notebook is run in order).
var1 = rural.poor_health.var()
title = 'Cohens d for Poor Health vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of poor_sleep, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "poor_sleep vs Urban"
ser1 = rural['poor_sleep']

ser2 = urban.poor_sleep
mean1 = rural.poor_sleep.mean()
mean2 = urban.poor_sleep.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:

# Cohen's d for poor_sleep.  Only the rural variance is computed here;
# mean1/mean2 and ser1/ser2 are whatever the previously run cell left behind
# (the poor_sleep values when the notebook is run in order).
var1 = rural.poor_sleep.var()
title = 'Cohens d for Poor Sleep  Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of Uninsured_health, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "Uninsured_health Rural vs Urban"
ser1 = rural['Uninsured_health']

ser2 = urban.Uninsured_health
mean1 = rural.Uninsured_health.mean()
mean2 = urban.Uninsured_health.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:

# Cohen's d for Uninsured_health.  Only the rural variance is computed here;
# mean1/mean2 and ser1/ser2 are whatever the previously run cell left behind
# (the Uninsured_health values when the notebook is run in order).
var1 = rural.Uninsured_health.var()
title = 'Cohens d for Uninsured  Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of Cost_Burden, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "Cost_Burden vs Urban"
ser1 = rural.Cost_Burden

ser2 = urban.Cost_Burden
mean1 = rural.Cost_Burden.mean()
mean2 = urban.Cost_Burden.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for Cost_Burden.  Only the rural variance is computed here;
# mean1/mean2 and ser1/ser2 are whatever the previously run cell left behind
# (the Cost_Burden values when the notebook is run in order).
var1 = rural.Cost_Burden.var()
# The printed label now names the variable being compared (it was a
# copy-pasted "Uninsured" label before).
title = 'Cohens d for Cost Burden Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# Histograms and KDE/PDFs of food_Insecure, rural vs urban
# =============================================================================
# Both groups are plotted side by side for comparison.
# NOTE: xlab1/xlab2 are not assigned in this cell; the values left by an
# earlier cell are used ("Rural"/"Urban" when the notebook is run in order).
subtitle = "food_Insecure vs Urban"
ser1 = rural.food_Insecure

ser2 = urban.food_Insecure
mean1 = rural.food_Insecure.mean()
mean2 = urban.food_Insecure.mean()
Hist_Plot(subtitle, xlab1, xlab2, ser1, ser2)

# KDE/PDFs
Sns_Kde(subtitle,xlab1, xlab2, ser1, mean1, ser2, mean2)

In [None]:
# Cohen's d for food_Insecure.  Only the rural variance is computed here;
# mean1/mean2 and ser1/ser2 are whatever the previously run cell left behind
# (the food_Insecure values when the notebook is run in order).
var1 = rural.food_Insecure.var()
# The printed label now names the variable being compared (it was a
# copy-pasted "Uninsured" label before).
title = 'Cohens d for Food Insecurity Rural vs Urban'
Compute_Cohend(mean1, mean2,ser1,ser2, var1, title)

In [None]:
# =============================================================================
# Run correlation for bad health habits rural vs urban.  Hypothesis: Do
#   rural residents have worse health outcomes due to poor health, lack
#   of availability of doctors, access to food due to housing costs
#   which may have an impact on poor sleep and health?
# 
# =============================================================================

# Normalize the health-related columns with sklearn's MinMaxScaler (re-fitted
# for each group), then compute the Pearson correlation matrix per group.
health_cols = ['Population', 'poor_health', 'poor_sleep', 'Uninsured_health',
               'Life_Expectancy', 'Physicians', 'food_Insecure', 'Cost_Burden']

scaler = MinMaxScaler()

rural_df = rural[health_cols].copy()
rural_df[health_cols] = scaler.fit_transform(rural_df[health_cols])

urban_df = urban[health_cols].copy()
urban_df[health_cols] = scaler.fit_transform(urban_df[health_cols])

# Pearson Correlation
corr_rural1 = rural_df.corr(method="pearson")
corr_urban1 = urban_df.corr(method="pearson")

print("Pearson correlation coefficient Rural : \n")
print(corr_rural1, "\n")
print("Pearson correlation coefficient Urban : \n")
print(corr_urban1, "\n")

In [None]:
# =============================================================================
# Display correlation
# =============================================================================
# Hand the rural and urban Pearson matrices to Heatmap_Plot (defined elsewhere
# in the notebook) together with the figure title and the two panel labels.
subtitle = "Correlation for Households Rural vs Urban"
xlab1, xlab2 = "Rural", "Urban"
data1, data2 = corr_rural1, corr_urban1
Heatmap_Plot(subtitle, xlab1, xlab2, data1, data2)

In [None]:

# =============================================================================
# Include the other descriptive characteristics about the variables: Mean,
#  Mode, Spread, and Tails (Chapter 2)
# =============================================================================
# =============================================================================
# # Display descriptive statistics
# =============================================================================
# Unscaled copies of the comparison columns, taken straight from the
# rural / urban frames.
summary_cols = ['Population', 'poor_health', 'poor_sleep', 'Uninsured_health',
                'Life_Expectancy', 'Physicians', 'food_Insecure', 'Cost_Burden']
rural_cp = rural[summary_cols].copy()
urban_cp = urban[summary_cols].copy()

# before scaling
for heading, frame in (('RURAL \n', rural_cp), ('URBAN \n', urban_cp)):
    print(heading)
    print(frame.describe(), '\n')

In [None]:
# after scaling
# Same summary as the unscaled one, but on the min-max scaled frames.
for heading, frame in (('RURAL \n', rural_df), ('URBAN \n', urban_df)):
    print(heading)
    print(frame.describe(), '\n')

In [None]:
# Mode of every comparison column, read from the unscaled rural copy.
for col in rural_df.columns:
    print(f'Mode for {col} = {rural_cp[col].mode()} \n')

In [None]:

# Mode of every comparison column, read from the unscaled urban copy.
# Iterate over urban_cp itself so this cell does not depend on the rural
# frame happening to have the same columns.
for x in urban_cp:
    print(f'Mode for {x} = {urban_cp[x].mode()} \n')

In [None]:
# Peek at the scaled urban frame; show the first rows only instead of dumping
# the whole table into the notebook output.
urban_df.head()

In [None]:
# Peek at the scaled rural frame; show the first rows only instead of dumping
# the whole table into the notebook output.
rural_df.head()

In [None]:
# =============================================================================
# Chapter 9 Test on Hypothesis.
# Normality Test
# =============================================================================
# # Shapiro-Wilk Test
# # H0: the sample has a Gaussian distribution
# =============================================================================
ALPHA = 0.05  # significance level used for the normality verdicts below

# create same sample size (the paired test at the end of this cell needs two
# inputs of equal length)
s_rural = rural_df.sample(n=60, random_state=1).reset_index(drop=True)
s_urban = urban_df.sample(n=60, random_state=1).reset_index(drop=True)


def _report_shapiro(label, frame, alpha=ALPHA):
    """Run the Shapiro-Wilk test on each column of ``frame`` and print a verdict.

    shapiro() treats its input as ONE sample, so handing it a whole DataFrame
    would pool every column into a single test.  Each variable is therefore
    tested on its own.

    Parameters
    ----------
    label : str
        Group name shown in the printed header (e.g. 'Rural').
    frame : pandas.DataFrame
        Numeric columns to test; missing values are dropped per column.
    alpha : float
        p-values above this are reported as 'Probably Gaussian'.
    """
    print(f"Shapiro-Wilk Test - {label} \n")
    for col in frame.columns:
        stat, p = shapiro(frame[col].dropna())
        print(f'{col}: stat= {stat} p = {p} \n')
        if p > alpha:
            print('Probably Gaussian \n')
        else:
            print('Probably not Gaussian \n')


_report_shapiro('Rural', rural_df)
_report_shapiro('Urban', urban_df)

# =============================================================================
# #-D'Agostinos K^2 test
# # p > 0.05 -> probably Gaussian
# =============================================================================
# normaltest works along axis 0 by default, so stat and p hold one value per
# column.
stat, p = normaltest(rural_df)

print(f"D'Agostinos K^2 test - Rural \n")
print(f'stat= {stat}\n')
print(f'p = {p} \n')

stat, p = normaltest(urban_df)

print(f"D'Agostinos K^2 test - Urban \n")
print(f'stat= {stat}\n')
print(f'p = {p} \n')

# =============================================================================
# Check the means of the samples
# H0: the two group means are equal (independent, identically distributed
# samples).  The test shows how significant the difference between the means
# is, i.e. whether the difference could have happened by chance.  The larger
# |t| is, the larger the difference between the groups relative to the
# variation within them.
# https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
# small t = the groups are more similar.
# low p = the difference is unlikely to be due to chance; threshold = 0.05
# =============================================================================
# NOTE(review): ttest_ind assumes equal variances by default, while the
# confidence interval below is computed with usevar='unequal' - confirm which
# assumption is intended.
t, p = ttest_ind(rural_df, urban_df)
print( "Student's t-test Rural and Urban \n")

print(f't= {t}\n')
print(f'p = {p} \n')

# =============================================================================
# # Calculate CI for difference of the means
# =============================================================================
ci = sms.CompareMeans(sms.DescrStatsW(rural_df), sms.DescrStatsW(urban_df))
print('Difference between the means CI')
print(ci.tconfint_diff(usevar='unequal'),'\n')

# =============================================================================
# Paired Student's t-test
# =============================================================================
# NOTE(review): s_rural and s_urban are two independent random samples whose
# rows are matched only by position after reset_index; a paired test normally
# expects matched observations - confirm this is the intended analysis.
t, p = ttest_rel(s_rural, s_urban)
print( "Paired Student's t-test Rural and Urban \n")
print(f't= {t}\n')
print(f'p = {p} \n')

In [None]:

# Paired t-test on the two 60-row samples.  The identical call is already made
# at the end of the previous cell; here t and p are stored but not displayed.
t, p = ttest_rel(s_rural, s_urban)

In [None]:
# =============================================================================
# Conduct linear regression analysis
# Using life expectancy as the response, how do
# poor health, lack of sleep, no insurance and availability of Physicians
# affect LE? Then use poor_health as the response variable: how do the
# factors affect poor_health?
# =============================================================================
# =============================================================================
# For this project, conduct a regression analysis on either one dependent and 
# one explanatory variable, or multiple explanatory variables (Chapter 10 & 11)
# =============================================================================
# =============================================================================
# Create two scatter plots comparing two variables and provide your analysis
#  on correlation and causation. Remember, covariance, Pearson’s correlation, 
#  and Non- Linear Relationships should also be considered during your analysis
#  (Chapter 7).
# =============================================================================
# =============================================================================
#  How do the factors affect life expectancy
# =============================================================================

In [None]:

# Rename variables
# Short aliases for the sampled columns.  The formula strings below use these
# alias names, and the next regression cell reuses them as well.
# Rural sample
life = s_rural['Life_Expectancy']
health = s_rural['poor_health']
sleep = s_rural['poor_sleep']
uninsured = s_rural['Uninsured_health']
docs = s_rural['Physicians']
food = s_rural['food_Insecure']
burden = s_rural['Cost_Burden']

# Urban sample (same aliases with a trailing 1)
life1 = s_urban['Life_Expectancy']
health1 = s_urban['poor_health']
sleep1 = s_urban['poor_sleep']
uninsured1 = s_urban['Uninsured_health']
docs1 = s_urban['Physicians']
food1 = s_urban['food_Insecure']
burden1 = s_urban['Cost_Burden']

# Fit the models: life expectancy as the response, all other factors as
# explanatory variables.
# Rural
rural_life_formula = 'life ~ health + sleep + uninsured + docs\
                + food + burden'
mod1 = smf.ols(rural_life_formula, data=s_rural).fit()
print('                     RURAL', '\n')
print(mod1.summary(),'\n')

# Urban
urban_life_formula = 'life1 ~ health1 + sleep1 + uninsured1 + docs1\
                 + food1 + burden1'
mod2 = smf.ols(urban_life_formula, data=s_urban).fit()
print('-                    URBAN', '\n')
print(mod2.summary(), '\n')

In [None]:

# =============================================================================
# How do the factors affect health
# =============================================================================
# poor_health is now the response; the alias names in the formulas come from
# the previous regression cell.  Both fits are stored in mod2, so after this
# cell mod2 holds the urban model.
# Rural
rural_health_formula = 'health ~ life + sleep + uninsured + docs\
                + food + burden'
mod2 = smf.ols(rural_health_formula, data=s_rural).fit()
print('                        RURAL', '\n')
print(mod2.summary(), '\n')

# Urban
urban_health_formula = 'health1 ~ life1 + sleep1 + uninsured1 + docs1\
                + food1 + burden1'
mod2 = smf.ols(urban_health_formula, data=s_urban).fit()
print('                         URBAN', '\n')
print(mod2.summary(), '\n')

In [None]:
# Linear Relationship
# Life_Expectancy against poor_health and then against Physicians, each drawn
# once for the rural sample and once for the urban sample (four figures, in
# that order).
x_val = "Life_Expectancy"
for y_val in ('poor_health', 'Physicians'):
    for subtitle, data in (('Rural', s_rural), ('Urban', s_urban)):
        # x / y / data are passed by keyword: current seaborn releases no
        # longer accept x and y as positional arguments to lmplot.
        fig = sns.lmplot(x=x_val, y=y_val, data=data).fig.suptitle(subtitle)

In [None]:
# Linear Relationship
# poor_health against food_Insecure, drawn for the rural sample and then for
# the urban sample.
x_val = 'poor_health'
y_val = 'food_Insecure'
for subtitle, data in (('Rural', s_rural), ('Urban', s_urban)):
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept x and y as positional arguments to lmplot.
    fig = sns.lmplot(x=x_val, y=y_val, data=data).fig.suptitle(subtitle)

In [None]:
# Linear Relationship
# poor_health against Uninsured_health, drawn for the rural sample and then
# for the urban sample.
x_val = 'poor_health'
y_val = 'Uninsured_health'
for subtitle, data in (('Rural', s_rural), ('Urban', s_urban)):
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept x and y as positional arguments to lmplot.
    fig = sns.lmplot(x=x_val, y=y_val, data=data).fig.suptitle(subtitle)

In [None]:
# Linear Relationship
# poor_health against Physicians, drawn for the rural sample and then for the
# urban sample.
x_val = 'poor_health'
y_val = 'Physicians'
for subtitle, data in (('Rural', s_rural), ('Urban', s_urban)):
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept x and y as positional arguments to lmplot.
    fig = sns.lmplot(x=x_val, y=y_val, data=data).fig.suptitle(subtitle)

In [None]:
# Linear Relationship
# poor_health against Cost_Burden, drawn for the rural sample and then for the
# urban sample.
x_val = 'poor_health'
y_val = 'Cost_Burden'
for subtitle, data in (('Rural', s_rural), ('Urban', s_urban)):
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept x and y as positional arguments to lmplot.
    fig = sns.lmplot(x=x_val, y=y_val, data=data).fig.suptitle(subtitle)

In [None]:
# Pass the unscaled rural frame to sns_Scatter (defined elsewhere in the
# notebook) with Population on x and Physicians on y.
data = rural
x_val, y_val = 'Population', 'Physicians'
xlab, ylab = 'Population per County', '# of Physicians'
subtitle = 'Number of Doctors Per County Population - Rural'
sns_Scatter(subtitle, xlab, ylab, x_val, y_val, data)

In [1]:
# Pass the unscaled urban frame to sns_Scatter (defined elsewhere in the
# notebook) with Population on x and Physicians on y.
# This cell needs `urban` from an earlier cell; the NameError recorded below
# it comes from running the cell before `urban` existed.
data = urban
x_val, y_val = 'Population', 'Physicians'
xlab, ylab = 'Population per County', '# of Physicians'
subtitle = 'Number of Doctors Per County Population - urban'
sns_Scatter(subtitle, xlab, ylab, x_val, y_val, data)

NameError: name 'urban' is not defined