In [12]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# magic word for producing visualizations in notebook
%matplotlib inline

# Load in the general demographics data.
azdias = pd.read_csv('Udacity_AZDIAS_Subset.csv', sep=';')

# Load in the feature summary file.
feat_info = pd.read_csv('AZDIAS_Feature_Summary.csv', sep=';')

#Convert these strings to list of int & string
def convert(i) :
    """
    inputs :
    i - string
    output :
    either the integer version or i itself
    -helper function
    """
    if (i != 'X' and i != 'XX' and len(i) > 0) :
        return int(i)
    return i

def create_list(s) :
    """
    inputs :
    s - string
    output :
    a list contatining either the integer components 
    or 'X' and 'XX' 
    """
    s = s[1:-1]
    s = s.split(',')
    return [convert(i) for i in s]

feat_info.missing_or_unknown = feat_info.missing_or_unknown.apply(create_list)

def get_decade(i) :
    """
    inputs :
    i - integer
    output :
    the correct decade according to the dictionary under PRAEGENDE_JUGENDJAHRE
    single digit integers representing the decade (4: 40s, 5: 50s, 6: 60s, etc.)
    -helper function
    """
    if i in [1, 2] :
        return 4
    if i in [3, 4] :
        return 5
    if i in [5, 6, 7] :
        return 6
    if i in [8, 9] :
        return 7
    if i in [10, 11, 12, 13] :
        return 8
    if i in [14, 15] :
        return 9
    else : 
        return np.nan
    
def in_avangarde(i) :
    """
    inputs :
    i - integer
    output :
    1 if the movement is avangarde and 0 if the movement is mainstream
    all NaN left as NaN
    -looked at the dictionary under PRAEGENDE_JUGENDJAHRE
    """
    mainstream = [1, 3, 5, 8, 12, 14]
    avantgarde = [2, 4, 6, 7, 9, 13, 15]
    if (i in avantgarde) :
        return 1
    if (i in mainstream) :
        return 0
    return None

def extract_poor(s) :
    """
    inputs :
    i - string type programatically but actually 2-digit number
    output :
    Extracts how poor the family is (represented by first digit)
    -looked at the dictionary under CAMEO_INTL_2015
    """
    if type(s) == str :
        return float(s[0])
    return s

def extract_age(s) :
    """
    inputs :
    i - string type programatically but actually 2-digit number
    output :
    Extracts family age (represented by second digit)
    -looked at the dictionary under CAMEO_INTL_2015
    """
    if type(s) == str :
        return float(s[1])
    return s



def clean_data(df, cthreshold=0.2):
    """
    Perform feature trimming, re-encoding, and engineering for demographics
    data
    
    INPUT: Demographics DataFrame
    OUTPUT: Trimmed and cleaned demographics DataFrame
    """
    #Convert these strings to list of int & string 
    #feat_info.missing_or_unknown = feat_info.missing_or_unknown.apply(create_list)
    missing_data_dict = dict(zip(feat_info.attribute, feat_info.missing_or_unknown))
    # Identify missing or unknown data values and convert them to NaNs.
    for col in missing_data_dict.keys(): 
        df.loc[df[col].isin(missing_data_dict[col]), col] = np.nan
    
    #GETTING OUTLIER COLUMNS (COLUMNS WITH MORE THAN 20% MISSING)
    nmissing = df.isna().sum()/df.shape[0] 
    nmissing = nmissing[nmissing > 0]
    columns_over = nmissing[nmissing > cthreshold].sort_values(ascending = False)
    #Drop outlier columns
    df.drop(columns=columns_over.index, inplace=True)
    
    #Identify rows with lots of missing (11% is the threshold I chose)
    #Any row with more got removed
    nmissing_rows = df.isnull().mean(axis=1)
    threshold = 0.11
    df = df.loc[nmissing_rows < threshold, :]
    
    #Identify categorical variables
    categorical = feat_info [ feat_info.type == 'categorical' ]
    #some were removed earlier so find the ones that were not
    set_of_cols = set(df.columns)
    cols = [ c for c in categorical.attribute if c in set_of_cols]
    #identify the columns with less number of unique values
    col_under_6 = [col for col in cols if len(df[col].value_counts()) < 6]
    #convert columns to object type
    df_cat_u6 = df[col_under_6]
    for col in col_under_6 :
        df_cat_u6[col] = df_cat_u6[col].astype(object)
    #will make the dummy variables out of the 6 and under columns
    categorical_dummy = pd.get_dummies(df_cat_u6, drop_first=True) 
    #can drop the first to save space
    #dropped the 18 categorical variablees
    df2 = df.drop(columns=cols)
    #add the dummy variables
    df = pd.concat([df2, categorical_dummy], axis=1)
    
    #these are the mixed variables
    mixed_columns = ['PRAEGENDE_JUGENDJAHRE', 'CAMEO_INTL_2015', 'LP_LEBENSPHASE_FEIN', 'LP_LEBENSPHASE_GROB', 'WOHNLAGE', 'PLZ8_BAUMAX']
    #Extract decade and avangarde columns from PRAEGENDE_JUGENDJAHRE
    df['AVANGARDE_MOVEMENT'] = df['PRAEGENDE_JUGENDJAHRE'].apply(in_avangarde)
    df['DECADE'] = df['PRAEGENDE_JUGENDJAHRE'].apply(get_decade)
    #Extract household poverty and household age from CAMEO_INTL_2015
    df['HOUSEHOLD_POVERTY'] = df['CAMEO_INTL_2015'].apply(extract_poor)
    df['HOUSEHOLD_AGE'] = df['CAMEO_INTL_2015'].apply(extract_age)
    #PLZ8_BAUMAX not too many unique values so I decided to make dummy columns
    temp_df = df[['PLZ8_BAUMAX']]
    temp_df['PLZ8_BAUMAX'] = temp_df['PLZ8_BAUMAX'].astype('object')
    temp_df = pd.get_dummies(temp_df)
    temp_df.columns = ['1to2_FAMILY_HOMES', '3to5_FAMILY_HOMES', 
                       '6to10_FAMILY_HOMES', 'OVER_10_FAMILY_HOMES', 'BUSINESS_BUILDINGS'
                      ]
    df = pd.concat([df, temp_df], axis=1)
    #drop all the mixed columns after adding the engineered ones
    df.drop(columns=mixed_columns, inplace=True)
    return df

CLEAN DATA

In [13]:
df = azdias.copy()
df = clean_data(df)
df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_u6[col] = df_cat_u6[col].astype(object)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['PLZ8_BAUMAX'] = temp_df['PLZ8_BAUMAX'].astype('object')


(771701, 82)

In [14]:
df2 = pd.read_csv('Udacity_CUSTOMERS_Subset.csv', sep=';')
df2 = clean_data(df2, cthreshold=0.35)
df2.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat_u6[col] = df_cat_u6[col].astype(object)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['PLZ8_BAUMAX'] = temp_df['PLZ8_BAUMAX'].astype('object')


(135799, 82)

IMPUTE 

In [15]:
from sklearn.impute import SimpleImputer
saved_columns = df.columns
imp = SimpleImputer(strategy="most_frequent")
fitted_imp = imp.fit(df)
imputed_values = fitted_imp.transform(df)
imputed_df = pd.DataFrame(imputed_values, columns=saved_columns)
imputed_df.shape

(771701, 82)

In [16]:
saved_columns = df2.columns
imputed_values2 = fitted_imp.transform(df2)
imputed_df2 = pd.DataFrame(imputed_values2, columns=saved_columns)
imputed_df2.shape

(135799, 82)

SCALING

In [17]:
# Before PCA one of the steps is features scaling
#modify the values of dataframe so that mean=0 and std=1
from sklearn.preprocessing import StandardScaler

scaler_fitted = StandardScaler().fit(imputed_df.values)
scaled_vals = scaler_fitted.transform(imputed_df.values)
scaled_vals.shape

(771701, 82)

In [18]:
scaled_vals2 = scaler_fitted.transform(imputed_df2.values)
scaled_vals2.shape

(135799, 82)

PCA

In [23]:
from sklearn.decomposition import PCA
n_components = 15 
pca_final = PCA(n_components)
fitted_pca = pca_final.fit(scaled_vals)
pca_15 = fitted_pca.transform(scaled_vals)
pca_15.shape

(771701, 15)

In [24]:
from sklearn.decomposition import PCA
pca_15_2 = fitted_pca.transform(scaled_vals2)
pca_15_2.shape

(135799, 15)

CLUSTERING

In [25]:
from sklearn.cluster import KMeans
k=10
kmeans = KMeans(n_clusters=k, random_state=1).fit(pca_15)
kmeans.cluster_centers_.shape

(10, 15)

CLUSTER PREDICT

In [26]:
#Here we predicted each of the points and put them in a cluster
company_results = kmeans.predict(pca_15)
customer_results = kmeans.predict(pca_15_2)
company_results.shape, customer_results.shape

((771701,), (135799,))