In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')
%matplotlib inline


Lets pick up where we left.

In [2]:
medical = pd.read_pickle("data/medical.p")

In [3]:
# The file includes socio-demographic data, including 
# health insurance and various aspects of health care
# touchpoints for the respondent group of a survey
# conducted in the USA.

# The collection includes 35072 observations and 27 variables:
  
# UMARSTAT – Marital status recode
# UCUREMP – Currently has employer coverage
# UCURNINS – Currently uninsured
# USATMED – Satisfied with quality of medical care
# URELATE – Number of relatives in household
# REGION – region
# STATE - state
# HHID – Household identification number
# FHOSP – In hospital overnight last year
# FDENT – Dental visits last year
# FEMER – Number of emergency room visits last year
# FDOCT – Number of doctor visits last year
# UIMMSTAT – Immigration status
# U_USBORN – U.S.- or foreign-born
# UAGE – Age topcoded
# U_FTPT – Full-time or part-time worker this year
# U_WKSLY – Weeks worked last year
# U_HRSLY – Hours worked per week last year
# U_USHRS – Hours worked per week this year
# HEARNVAL – Earnings amount last year - Household
# HOTHVAL – Household income, total exc. earnings
# HRETVAL – Retirement amount – Household
# HSSVAL – Social Security amount - Household
# HWSVAL – Wages and salaries amount – Household
# UBRACE – race
# GENDER – gender
# UEDUC3 – education level
# CEYES - color of eyes
# CHAIR - color of hair

First we will recode UCURNINS to binary form.

In [4]:
print(medical.UCURNINS.unique())
medical["UCURNINS"] = (medical.UCURNINS=="Yes").astype(int)
print(medical.UCURNINS.unique())

['Yes' 'No']
[1 0]


Today we will work with sklearn a lot. Therefore we need to recode all variables with nominal levels to binary form. We will use pandas get_dummies method for that. Lets just get rid of HHID

In [5]:
levCols = []
numCols = []
for col in medical.columns:
    if medical[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)
levCols.remove("HHID")

In [6]:
dummLev = pd.get_dummies(medical[levCols])
dummLev.shape

(35072, 88)

In [7]:
medical = pd.concat([medical[numCols], dummLev], axis=1)

In [8]:
features = medical.columns.tolist()
features.remove("UCURNINS")

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [10]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
est = lda.fit(medical[features], medical["UCURNINS"])
preds = est.predict(medical[features])
probs = est.predict_proba(medical[features])
sum(preds == medical["UCURNINS"])/len(preds)



0.89595688868613144

In [11]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True, priors=(0.5, 0.5))
est = lda.fit(medical[features], medical["UCURNINS"])
preds1 = est.predict(medical[features])
probs1 = est.predict_proba(medical[features])
sum(preds1 == medical["UCURNINS"])/len(preds)



0.89082458941605835

In [12]:
# ldaPreds = pd.DataFrame([preds, preds1]).transpose()
pd.crosstab(preds, preds1)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26179,284
1,0,8609


In [13]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True)
est = lda.fit(medical[features], medical["UCURNINS"])
preds2 = est.predict(medical[features])
probs2 = est.predict_proba(medical[features])
sum(preds2 == medical["UCURNINS"])/len(preds1)



0.89073905109489049

In [14]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True, priors=(0.5, 0.5))
est = lda.fit(medical[features], medical["UCURNINS"])
preds3 = est.predict(medical[features])
probs3 = est.predict_proba(medical[features])
sum(preds3 == medical["UCURNINS"])/len(preds1)



0.89073905109489049

In [15]:
pd.crosstab(preds, preds3)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26174,289
1,2,8607


In [16]:
features = medical.columns.tolist()
features.remove("USATMED")

In [17]:
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
est = lda.fit(medical[features], medical["USATMED"])
preds = est.predict(medical[features])
probs = est.predict_proba(medical[features])
sum(preds == medical["USATMED"])/len(preds)



0.51653740875912413

In [18]:
lda =  QuadraticDiscriminantAnalysis(store_covariances=True)
est = lda.fit(medical[features], medical["USATMED"])
preds1 = est.predict(medical[features])
probs1 = est.predict_proba(medical[features])
sum(preds2 == medical["USATMED"])/len(preds1)



0.039262089416058396

In [19]:
medical["USATMED"].nunique()

5

In [20]:
pd.crosstab(preds, preds1)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,87,0,0,0,0
1,6,2,0,0,0
2,674,4,13,8,0
3,3944,99,21,40,0
4,29401,460,50,262,1


In [None]:

#--------------------------------------------------------------------
# Exercises 3.

# Exercise 3.1.

# Titanic passengers data – 1310 observations and 15 variables:

# passenger_id – Unique passenger id
# pclass – Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
# survived – Survival (0 = No, 1 = Yes)
# name – Name and SUrname
# sex – Sex (0 = Male, 1 = Female)
# age – Age in years
# sibsp – # of siblings / spouses aboard the Titanic
# parch – # of parents / children aboard the Titanic
# ticket – Ticket number
# fare – Passenger fare
# cabin – Cabin number
# embarked – Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# boat – Lifeboat (if survived)
# body – Body number (if did not survive and body was recovered)
# home.dest – Home/Destination

# Use linear and quadratic discriminant analysis to
# explain the probability of survival (survived = 1).
# Generate fitted values and compare them for different
# models.








In [None]:

# Exercise 3.2.
# Wine Quality Data Set: "data/wines.csv"
# source: https://archive.ics.uci.edu/ml/datasets/wine+quality
# The file contains data on samples of white and red Portuguese wine 
# Vinho Verde. 
# Various physico-chemical characteristics of individual samples
# are available as well as wine quality scores on a point scale (0-10) 
# made by specialists.

# Perform linear and quadratic discriminant analysis 
# to model the quality of wine (variable quality),
# treating the explained variable as qualitative.
# Generate fitted values and compare them for different models.

