In [None]:
from util import *
from feature_eng import *

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
import dtale
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from prince import MCA
from prince import FAMD

In [None]:
# Load the TEDS-D 2019 discharge bundle straight from the zipped TSV.
# The bare `read_csv` resolves to whatever the wildcard imports above expose.
# NOTE(review): presumably returns a pandas DataFrame — confirm in util / feature_eng.
df = read_csv('TEDS-D-2019-DS0001-bndl-data-tsv_V1.zip')

## Select Columns

##### Drop Unnecessary Columns

In [None]:
# Columns judged unnecessary for this analysis
unnecessarys = [
    'DISYR', 'CASEID', 'CBSA2010', 'DETCRIM', 'PREG', 'DAYWAIT',
    'SERVICES_D', 'EMPLOY_D', 'LIVARAG_D', 'ARRESTS_D', 'DETNLF_D',
    'SUB3', 'SUB3_D', 'ROUTE3', 'FREQ3', 'FREQ3_D', 'FRSTUSE3',
    'FREQ_ATND_SELF_HELP_D',
]

# Columns with high missing-data counts
high_missing = [
    'DETNLF', 'PRIMINC', 'SUB2', 'SUB2_D', 'ROUTE2', 'FREQ2',
    'FREQ1_D', 'FREQ2_D', 'FRSTUSE2', 'HLTHINS', 'PRIMPAY',
]

# Per-substance flag columns
substance = [
    'ALCFLG', 'COKEFLG', 'MARFLG', 'HERFLG', 'METHFLG', 'OPSYNFLG',
    'PCPFLG', 'HALLFLG', 'MTHAMFLG', 'AMPHFLG', 'STIMFLG', 'BENZFLG',
    'TRNQFLG', 'BARBFLG', 'SEDHPFLG', 'INHFLG', 'OTCFLG', 'OTHERFLG',
]

# Full drop list, in order: unnecessary, high-missing, substance flags
columns = [*unnecessarys, *high_missing, *substance]

# Remove columns + NaNs
# NOTE(review): `clean_df` is a project helper; the line above is the only description of
# its behaviour visible here — confirm against its definition.
# A copy of `df` is handed over, so the raw frame is not touched even if the helper mutates its input.
df_drop = clean_df(df.copy(), columns)

In [None]:
# Preview the first rows of the cleaned frame.
df_drop.head()

## Feature Engineering

#### Norm Mean of LOS

In [None]:
# LOS codes 32-37 stand for ranges; swap each code for the minimum of its range.
# All six substitutions are applied in one pass through a single mapping.
df_drop['LOS'] = df_drop['LOS'].replace(
    {32: 46, 33: 61, 34: 91, 35: 121, 36: 181, 37: 366}
)

# NOTE(review): `compute_LOS_norm` is a project helper; presumably it adds the LOSnorm column used later — confirm.
df_drop = compute_LOS_norm(df_drop)

#### Binary Discharge Status

In [None]:
# Binary discharge status: 1.0 where REASON equals 1, 0.0 for every other row
df_drop['REASONbinary'] = (df_drop['REASON'] == 1).astype(float)

In [None]:
# List the columns present after cleaning and feature engineering.
df_drop.columns

## Multiple Correspondence Analysis (MCA)

In [None]:
# Cast every column except LOSnorm to string so it is treated as categorical
# by the models below; LOSnorm keeps its original dtype.
# `astype(str)` is used instead of the element-wise `applymap(str)`, which is
# deprecated since pandas 2.1.
str_columns = df_drop.columns.drop('LOSnorm')
df_categories = df_drop.copy()
df_categories[str_columns] = df_categories[str_columns].astype(str)

In [None]:
# 5-component MCA, seeded for reproducibility
mca = MCA(n_components=5, n_iter=3, random_state=101)

# Model inputs: every column except the binary discharge label.
# NOTE(review): LOSnorm (the one column not cast to string) is still part of `features` — confirm that is intended for MCA.
features = df_categories.columns.drop('REASONbinary')

model, transformed = unsupervised_model(
    df_categories,
    features,
    mca,
)

In [None]:
%matplotlib inline

mca.plot_coordinates(X=df_categories[features])

## Factor Analysis of Mixed Data (FAMD)

In [None]:
# Model inputs: every column except SERVICES, which is only used to colour the plot afterwards
features = df_categories.columns.drop('SERVICES')

# 2-component FAMD, seeded for reproducibility
famd = FAMD(n_components=2, n_iter=3, random_state=101)

model, transformed = unsupervised_model(
    df_categories,
    features,
    famd,
)

In [None]:
# FAMD row coordinates, coloured by each row's SERVICES value.
# NOTE(review): the full df_categories (SERVICES included) is passed here, whereas the MCA plot
# passes df_categories[features] — confirm which input the fitted model expects.
famd.plot_row_coordinates(
    df_categories,
    color_labels=df_categories['SERVICES'].map('Service {}'.format).tolist(),
)

## PCA on Reduced Set with Dummy Variables

In [None]:
# Categorical columns to expand into dummy variables
columns = [
    'STFIPS', 'EDUC', 'MARSTAT', 'SERVICES', 'LOS', 'PSOURCE', 'NOPRIOR',
    'ARRESTS', 'EMPLOY', 'METHUSE', 'PSYPROB', 'GENDER', 'VET', 'LIVARAG',
    'REASON', 'DSMCRIT', 'AGE', 'RACE', 'ETHNIC', 'SUB1', 'SUB1_D',
    'ROUTE1', 'FREQ1', 'FRSTUSE1', 'FREQ_ATND_SELF_HELP', 'DIVISION',
    'REGION', 'IDU', 'ALCDRUG',
]

# One-hot encode the listed columns. pandas' defaults already name each dummy
# as <source column>_<value>, so no explicit prefix / separator is needed.
df_dummies = pd.get_dummies(df_drop, columns=columns)

df_dummies.head()

#### Create Dummy Variable DF for Categories

#### Binary Status as Y

In [None]:
# Scale every column of the dummy frame except the binary label.
columns = df_dummies.columns.drop('REASONbinary')

# NOTE(review): `preprocess` is a project helper. It receives one column group paired with one
# StandardScaler and remainder='drop'; presumably it wraps a column-wise transformer and
# returns the scaled matrix in the original row order — confirm against its definition.
scaled_df = preprocess(df_dummies, [columns], [StandardScaler()], remainder='drop')

In [None]:
# Project the scaled features onto the first 10 principal components.
# random_state is pinned to the same seed as the MCA / FAMD cells: for large
# inputs scikit-learn may select its randomized SVD solver, which would
# otherwise give slightly different components on every run.
pca = PCA(n_components=10, random_state=101)
pca.fit(scaled_df)
transformed = pca.transform(scaled_df)

In [None]:
# Scree plot: share of total variance captured by each principal component.
components = range(pca.n_components_)

plt.bar(components, pca.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
# explained_variance_ratio_ is a 0-1 fraction, not a percentage, so the axis is labelled accordingly
plt.ylabel('explained variance ratio')
plt.xticks(components)
plt.show()

In [None]:
# Use `transformed`, the 10-component PCA scores computed in the cells above.
# `principalComponents` is only assigned further down the notebook, so reading
# it here fails with NameError on a fresh Restart & Run All.
PCA_components_df = pd.DataFrame(transformed)

# Alpha reduced to look for clustering
plt.scatter(PCA_components_df[0], PCA_components_df[1], alpha=.01, color='black')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

In [None]:
# Elbow plot: k-means inertia on the first three principal components for k = 1..9
ks = range(1, 10)
inertias = []
for k in ks:
    # Seeded (same value as the MCA / FAMD cells) so the centroid initialisation,
    # and therefore the inertia curve, is identical on every run
    model = KMeans(n_clusters=k, random_state=101)

    # Fit on the first three component columns only
    model.fit(PCA_components_df.iloc[:, :3])

    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# 2-component PCA of the scaled features, labelled with the binary discharge status.
# `scaled_df` is the input because `x` is not assigned until the LOSnorm section
# further down, so it does not exist here on a fresh kernel.
pca = PCA(n_components=2, random_state=101)

principalComponents = pca.fit_transform(scaled_df)

principalDf = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2'])

# Attach the label by position rather than by index. principalDf carries a fresh
# 0..n-1 index while df_drop may keep gaps from earlier row removal; an
# index-aligned concat would then pair components with the wrong rows and add NaNs.
# A row-count mismatch raises here instead of passing silently.
finalDf = principalDf.copy()
finalDf['REASONbinary'] = df_drop['REASONbinary'].values

In [None]:
%matplotlib inline

fig = plt.figure(figsize =(8,8))
ax = fig.add_subplot(1,1,1)

ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = [0,1]

colors = ['r','b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['REASONbinary'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'PC1']
               , finalDf.loc[indicesToKeep, 'PC2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

ax.set_xlim([-7.5,7.5])
ax.set_ylim([-7.5,7.5])

#### LOSnorm as Y

In [None]:
# Target: LOSnorm kept as a 2-D, single-column array
y = df_dummies[['LOSnorm']].values

# Features: every other column of the dummy frame, standardised column-wise
x = StandardScaler().fit_transform(df_dummies.drop(columns='LOSnorm').values)

In [None]:
# 10-component PCA of the standardised feature matrix `x`.
# random_state is pinned to the same seed as the MCA / FAMD cells: for large
# inputs scikit-learn may select its randomized SVD solver, which would
# otherwise give slightly different components on every run.
pca = PCA(n_components=10, random_state=101)
principalComponents = pca.fit_transform(x)

# Scree plot: share of total variance captured by each component
features = range(pca.n_components_)

plt.bar(features, pca.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
# explained_variance_ratio_ is a 0-1 fraction, not a percentage, so the axis is labelled accordingly
plt.ylabel('explained variance ratio')
plt.xticks(features)
plt.show()

In [None]:
# Wrap the PCA scores in a frame, then plot the first component against the second
PCA_components_df = pd.DataFrame(principalComponents)

plt.scatter(
    x=PCA_components_df.iloc[:, 0],
    y=PCA_components_df.iloc[:, 1],
    color='black',
    alpha=.01,  # alpha reduced to look for clustering
)
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

In [None]:
# Elbow plot: k-means inertia on the first three principal components for k = 1..9
ks = range(1, 10)
inertias = []
for k in ks:
    # Seeded (same value as the MCA / FAMD cells) so the centroid initialisation,
    # and therefore the inertia curve, is identical on every run
    model = KMeans(n_clusters=k, random_state=101)

    # Fit on the first three component columns only
    model.fit(PCA_components_df.iloc[:, :3])

    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()