#### Generic Approach for Clustering 

##### Step 1 - Handling Missing values

Calculate the percentage of missing values in each column relative to the number of rows in the dataset.
Drop the columns whose missing percentage is greater than a chosen threshold.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import pprint
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA    
import operator
from sklearn.preprocessing import LabelEncoder

# magic word for producing visualizations in notebook
%matplotlib inline

In [9]:
# Toy 4x3 integer frame (contains no NaNs); NOTE(review): presumably a stand-in for the real dataset.
df = pd.DataFrame(np.arange(12).reshape(4,3))

In [10]:
# Count the NaNs of every column once and keep only the columns that have any.
null_counts = df.isnull().sum()
na_data = null_counts[null_counts != 0]

# Per affected column: absolute NaN count and its share of all rows in percent.
data_dictn = {
    'count': na_data.values,
    'pct': np.round(na_data.values * 100 / df.shape[0], 2),
}

# Most incomplete columns first.
df_null = pd.DataFrame(data=data_dictn, index=na_data.index)
df_null = df_null.sort_values(by='count', ascending=False)
df_null


Unnamed: 0,count,pct


Removing outlier columns based on Null percentage

In [None]:
# Drop the columns whose share of missing values is an outlier.

# The six columns with the highest NaN percentage found in the previous step.
drop_columns = [
    'TITEL_KZ',
    'AGER_TYP',
    'KK_KUNDENTYP',
    'KBA05_BAUMAX',
    'GEBURTSJAHR',
    'ALTER_HH',
]
df = df.drop(labels=drop_columns, axis='columns')

#### Assess Missing data in each row

Divide the data into two subsets: one for data points that are above some threshold for missing values, and a second subset for points below that threshold.

In order to know what to do with the outlier rows, we should check whether the distributions of values in columns that have no (or very little) missing data are similar or different between the two groups. Select at least five of these columns and compare the distributions of values.

If the distributions of non-missing features look similar between the data with many missing values and the data with few or no missing values, then we could argue that simply dropping those points from the analysis won't present a major issue. On the other hand, if the data with many missing values looks very different from the data with few or no missing values, then we should make a note on those data as special.

In [None]:
# Number of missing values in every row, followed by its summary statistics.
nan_rowcnt = df.isnull().sum(axis='columns')
nan_rowcnt.describe()

In [None]:
# Histogram of the per-row NaN counts, used to pick the point at which to split the dataset
import matplotlib.pyplot as plt

nan_bins = np.arange(50)  # unit-wide bin edges 0, 1, ..., 49

plt.figure(figsize=(15, 6))
plt.hist(nan_rowcnt, bins=nan_bins)
plt.xticks(nan_bins[::5])  # a tick every 5 NaNs
plt.xlabel('NaNs')
plt.ylabel('Row count');

In [None]:
# Split the rows at 25 missing values per row (the histogram above covers 0-49 NaNs per row):
# rows below the cut go to msng_small, rows at or above it go to msng_large.
many_missing = df.isnull().sum(axis=1) >= 25

msng_small = df[~many_missing].reset_index(drop=True)
msng_large = df[many_missing].reset_index(drop=True)

len(msng_large)

In [None]:
# Compare the distribution of values for at least five columns where there are
# no or few missing values, between the two subsets.

col_names_small = msng_small.columns


def print_countplot(cols, num):
    """Plot value counts of the first ``num`` columns for both row subsets.

    Draws a ``num`` x 2 grid of count plots: the left plot of each row uses
    the module-level ``msng_small`` frame ('few_missing'), the right plot
    uses ``msng_large`` ('high_missing').

    Args:
        cols: Indexable collection of column names; entries ``0..num-1`` are
            plotted and must exist in both frames.
        num: Number of columns (grid rows) to plot.
    """
    fig, axs = plt.subplots(num, 2, figsize=(15, 15), facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace=2, wspace=.2)
    axs = axs.ravel()

    for i in range(num):
        column = cols[i]
        left_ax, right_ax = axs[i * 2], axs[i * 2 + 1]

        # Pass the Series as ``x=``: newer seaborn releases treat the first
        # positional argument as ``data``, so a positional Series no longer
        # selects the variable to count. The keyword form also works on
        # older releases.
        sns.countplot(x=msng_small[column], ax=left_ax)
        left_ax.set_title('few_missing')

        sns.countplot(x=msng_large[column], ax=right_ax)
        right_ax.set_title('high_missing')


print_countplot(col_names_small, 6)

Observe the distributions above and decide whether to drop or keep the subset of rows with many missing values.

#### Filling missing values, starting with visit year, by randomly choosing a year according to its probability

In [None]:
# Empirical distribution of the known visit years (a year of 0 marks a missing value).
year_probs = df.loc[df.VisitYear != 0, 'VisitYear'].value_counts(normalize=True).sort_index()

# Rows whose year has to be filled in.
missing_year = df.VisitYear == 0
n_missing = int(missing_year.sum())

# Nothing to do when no year is missing; nothing to sample from when no year is known.
if n_missing and not year_probs.empty:
    sampled_years = np.random.choice(
        a=year_probs.index.tolist(),
        p=year_probs.values.tolist(),
        size=n_missing,
    )
    # Assign through df.loc with a boolean mask instead of
    # Series.replace(to_replace=0, value={index: year}, inplace=True):
    # recent pandas rejects a dict ``value`` combined with a scalar
    # ``to_replace`` on a Series, and an in-place call on ``df.VisitYear``
    # is not guaranteed to write back into ``df``. The mask also stays
    # correct when the index contains duplicate labels.
    df.loc[missing_year, 'VisitYear'] = sampled_years

### Re-Encode Features

Since the unsupervised learning techniques to be used will only work on data that is encoded numerically, we need to make a few encoding changes to dataset.

Typically, the variables fall into a few different types (numerical, ordinal, categorical and mixed):

For numeric and interval data, these features can be kept without changes.
For the ordinal variables - While ordinal values may technically be non-linear in spacing, make the simplifying assumption that the ordinal variables can be treated as being interval in nature (that is, kept without any changes).
Special handling may be necessary for the remaining two variable types: categorical, and 'mixed'.

In [None]:
# One-hot encode the categorical variables that stay in the analysis.
# NOTE(review): the three identical names look like placeholders - confirm and
# substitute the real categorical column names before running.
categorical_columns = [
    'CATEGORICAL_VARIABLE',
    'CATEGORICAL_VARIABLE',
    'CATEGORICAL_VARIABLE',
]

df = pd.get_dummies(df, columns=categorical_columns)

##### Age Categorization

In [None]:
def categorize_age(age):
    """Return the age-bracket label for ``age``.

    Brackets are '0-17', '18-24', '25-34', '35-44', '45-54', '55-64' and
    '65+'. Each bracket's upper bound is exclusive; any value that is not
    below 65 (including NaN, which compares false everywhere) gets '65+'.
    """
    # (exclusive upper bound, label), in ascending order.
    brackets = (
        (18, '0-17'),
        (25, '18-24'),
        (35, '25-34'),
        (45, '35-44'),
        (55, '45-54'),
        (65, '55-64'),
    )
    for upper, label in brackets:
        if age < upper:
            return label
    return '65+'

In [None]:
# Label every member with an age bracket; AGE is the age column.
df['Age_Category'] = df.AGE.apply(categorize_age)

In [None]:
# Bar chart of the number of (non-null) MBRID entries in each age category
members_per_age = df.groupby('Age_Category').MBRID.count()
members_per_age.plot.bar(align='center', figsize=(18, 9))
plt.show()

#### ICD9 Categorization

Categorize 3500+ ICD9 codes among 20 categories, referring to https://en.wikipedia.org/wiki/List_of_ICD-9_codes. Use get_dummies to get a binary table.

Run a quick analysis to find patients that were diagnosed with the same diagnosis multiple times.

In [None]:
def categorize_icd9code(code, method=1):
    """Map an ICD-9 code to the label of the chapter range it belongs to.

    Only the part before the first '.' is used. A code containing 'E' or
    'V' (case-insensitive) is labelled 'E-V'. Numeric codes are labelled
    with the first range whose inclusive upper bound they do not exceed,
    e.g. '139' -> '001-139' and '140.1' -> '140-239'.

    Args:
        code: ICD-9 code, normally a string such as '250.01' or 'V70.0'.
        method: Categorization method. Only ``1`` is implemented; any other
            value returns ``None``.

    Returns:
        A range label ('001-139', ..., '800-999', 'E-V'), or 'Unknown' when
        the code is missing, not numeric, or above 999.
    """
    # (inclusive upper bound, range label)  # chapter description
    chapters = (
        (139, '001-139'),  # infectious and parasitic
        (239, '140-239'),  # neoplasms
        (279, '240-279'),  # endocrine, nutritional and metabolic, immunity disorders
        (289, '280-289'),  # diseases of the blood and blood-forming organs
        (319, '290-319'),  # mental disorders
        (359, '320-359'),  # nervous system
        (389, '360-389'),  # sense organs
        (459, '390-459'),  # circulatory system
        (519, '460-519'),  # respiratory system
        (579, '520-579'),  # digestive system
        (629, '580-629'),  # genitourinary system
        (679, '630-679'),  # complications of pregnancy, childbirth, and the puerperium
        (709, '680-709'),  # skin and subcutaneous tissue
        (739, '710-739'),  # musculoskeletal system and connective tissue
        (759, '740-759'),  # congenital anomalies
        (779, '760-779'),  # certain conditions originating in the perinatal period
        (799, '780-799'),  # symptoms, signs, and ill-defined conditions
        (999, '800-999'),  # injury and poisoning
    )
    # 'E-V' stands for: external causes of injury and supplemental classification

    if method != 1:
        # No other method is defined; keep the historical ``None`` result.
        return None
    if code is None:
        return 'Unknown'

    major = str(code).split('.')[0].strip().upper()
    if 'E' in major or 'V' in major:
        return 'E-V'

    try:
        number = int(major)
    except ValueError:
        # Empty or otherwise non-numeric code (e.g. a NaN cell).
        return 'Unknown'

    # Bounds are inclusive so that the last code of a chapter (139, 239, ...)
    # stays in its own chapter instead of spilling into the next one.
    for upper, label in chapters:
        if number <= upper:
            return label
    return 'Unknown'

In [None]:
# Label every diagnosis row with its ICD-9 chapter range (categorization method 1).
df['ICD9CodeCategory'] = df.ICD9Code.apply(categorize_icd9code, args=(1,))

In [None]:
# Category of every diagnosis row, indexed by member id.
# set_index builds a new Series; assigning ``.index`` on the Series returned by
# df['ICD9CodeCategory'] could re-label the column object that ``df`` hands out.
diagnosis_agg = df.set_index('MBRID')['ICD9CodeCategory']

# One row per member, one 'Icd9_<range>' column per category, holding how many
# diagnosis rows of that category the member has.
diagnosis_agg = (
    pd.get_dummies(diagnosis_agg, prefix='Icd9', prefix_sep='_')
    .groupby(level='MBRID')
    .sum()
)

# Both totals are computed before any summary column is added, so the new
# columns never feed back into the sums.
s1 = diagnosis_agg.sum(axis=1)  # total number of diagnosis rows per member

s2 = (diagnosis_agg > 0).sum(axis=1)  # number of distinct categories per member


diagnosis_agg['DiagnosisCount'] = s1
# NOTE(review): this holds the number of distinct diagnosis categories, not a
# count of visits - confirm the column name is intended.
diagnosis_agg['VisitCount'] = s2
diagnosis_agg['DiagnosisFreq'] = s1 / s2  # average diagnoses per distinct category

#diagnosis_agg['AcuteCount'] = df1[['PatientGuid','Acute']].groupby('PatientGuid').sum()
#diagnosis_agg['AcuteFreq'] = df1[['PatientGuid','Acute']].groupby('PatientGuid').sum()/df1[['PatientGuid','Acute']].groupby('PatientGuid').count()



#### Imputing missing values with Mean

In [None]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. Fall back to the old
# class only on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Replace the NaNs of every column with that column's mean.
# NOTE(review): mean imputation needs numeric columns - confirm df has been fully encoded by now.
imputer = SimpleImputer(strategy='mean')
imputed_df = imputer.fit_transform(df)

Before we apply dimensionality reduction techniques to the data, we need to perform feature scaling so that the principal component vectors are not influenced by the natural differences in scale between features.

In [None]:
# Apply feature scaling to the data.

scaler = StandardScaler()
standardized_df = scaler.fit_transform(imputed_df)

### Dimensionality reduction using PCA

In [None]:
# First applying PCA to close to 50% of the features in data.
pca = PCA(90)
pca_ftr = pca.fit_transform(standardized_df)

In [None]:
# Investigate the variance accounted for by each principal component.
def pca_variance_plot(pca):
    """Draw a scree plot of the variance explained by each principal component.

    Bars show the explained-variance ratio of every component, the line shows
    the cumulative ratio, and each bar is annotated with its share in percent.

    Args:
        pca: Object exposing ``explained_variance_ratio_``, e.g. a fitted
            scikit-learn ``PCA`` instance.
    """
    vals = pca.explained_variance_ratio_
    num_cmpnt = len(vals)
    ind = np.arange(num_cmpnt)
    cumvals = np.cumsum(vals)

    plt.figure(figsize=(18, 6))
    ax = plt.subplot(111)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    for i in range(num_cmpnt):
        # Format numerically instead of slicing str(value): slicing truncates
        # rather than rounds and turns tiny ratios printed in scientific
        # notation (e.g. '1e-05') into meaningless labels such as '1e-'.
        label = "%.1f" % (vals[i] * 100)
        ax.annotate(label, (ind[i], vals[i]), va="bottom", ha="center", fontsize=4.5)

    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=10)

    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')

In [None]:
# Plot the explained-variance curve of the fitted PCA.
# (The plotting helper is named pca_variance_plot; no scree_plot is defined here.)
pca_variance_plot(pca)

In [None]:
# Re-apply PCA to the data while selecting for number of components to retain.
pca = PCA(40)  # Select the number of features after which variance explained in the plot is stagnant
pca_ftr = pca.fit_transform(standardized_df)

In [None]:
# Explained-variance plot for the refitted PCA
# (the plotting helper is named pca_variance_plot; no scree_plot is defined here).
pca_variance_plot(pca)

In [None]:
# Plot the features with the largest positive and negative weights of one PCA component
def pca_plt(data, pca, n_compo):
    """Bar-plot the most heavily weighted features of one principal component.

    Shows the six largest and the six smallest (most negative) weights of the
    component, or every weight when there are twelve features or fewer.

    Args:
        data: Object whose ``keys()`` yields the feature names, in the same
            order as the columns of ``pca.components_`` (e.g. a DataFrame).
        pca: Object exposing ``components_``, e.g. a fitted scikit-learn PCA.
        n_compo: 1-based number of the component to plot.
    """
    weights = pd.DataFrame(np.round(pca.components_, 5), columns=data.keys()).iloc[n_compo - 1]
    # Re-bind rather than sort in place: the row comes from an ``iloc``
    # selection, and in-place changes on such selections are unreliable.
    weights = weights.sort_values(ascending=False)
    # With twelve features or fewer, head(6) and tail(6) overlap and would draw
    # the same feature twice, so only trim when there is something to trim.
    if len(weights) > 12:
        weights = pd.concat([weights.head(6), weights.tail(6)])

    weights.plot(kind='bar', title='PCA for Component ' + str(n_compo))
    ax = plt.gca()
    ax.grid(linewidth=0.6, alpha=0.5)  # numeric width instead of the string '0.6'
    ax.set_axisbelow(True)
    plt.show()

In [None]:
# Plot the feature weights of the first principal component, using df's column names as labels.
pca_plt(df, pca, 1)

In [None]:
 Map weights for the second principal component to corresponding feature names
# and then print the linked values, sorted by weight.

pca_plt(df, pca, 2)


In [None]:
# Map weights for the third principal component to corresponding feature names
# and then plot the linked values, sorted by weight.

pca_plt(df, pca, 3)