# 02 - Cluster Merge
This notebook combines previously built dataframes to better inform our upcoming EDA steps. It also loads and prepares a file that will be used to measure our clusters during the EDA stage.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import datetime as dt

In [None]:
# Fix the seed of NumPy's global random number generator
# NOTE(review): no random draws are visible in this part of the notebook - confirm the seed is still needed
np.random.seed(42)

In [None]:
# Load the 2018 table that carries the cluster columns and key it by state
clusters = pd.read_csv('./source_data/2018_with_clusters.csv').set_index('State')
clusters.head()

#### Function to return clusters from dataframe
Associating the states with the cluster according to the KMeans outputs

In [None]:
def get_clusters(df, column_list):
    """Group the index labels of ``df`` by cluster for each clustering column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are the members to group (this notebook
        passes a frame indexed by 'State').
    column_list : iterable of str
        Names of the columns holding cluster assignments. Each name must
        contain at least one run of digits; the first run becomes the outer
        key of the result (e.g. 'clusters-4' -> 4).

    Returns
    -------
    dict
        ``{n_cluster: {cluster_label: [index labels in that cluster]}}``.
        Cluster labels appear in order of first occurrence in the column.

    Raises
    ------
    IndexError
        If a column name contains no digits.
    KeyError
        If a column name is not present in ``df``.
    """
    all_clusters = {}

    for column in column_list:
        # Raw string so the digit class reaches the regex engine untouched;
        # in a plain literal the backslash-d is an invalid escape sequence.
        n_cluster = int(re.findall(r'\d+', column)[0])

        labels = df[column]
        all_clusters[n_cluster] = {
            cluster: list(df.index[labels == cluster])
            for cluster in labels.unique()
        }

    return all_clusters

# Build the nested lookup for the 3-, 4- and 5-cluster columns, then display
# the index labels assigned to cluster 1 of the 4-cluster column
cluster_labels = get_clusters(clusters, ['clusters-3', 'clusters-4', 'clusters-5'])
cluster_labels[4][1]

#### Loop to append state column to Unemployment data
The per-state files are combined into one CSV per cluster, for each KMeans run.

In [None]:
# Directory that holds the per-state inputs and receives the per-cluster outputs
file_path = './state_employment/'

for rank, rank_clusters in cluster_labels.items():

    for cluster, states in rank_clusters.items():

        # Collect the per-state frames and concatenate once at the end.
        # DataFrame.append was removed in pandas 2.0, and calling it inside
        # the loop re-copied the accumulated frame on every iteration.
        state_frames = []

        for state in states:
            state_df = pd.read_csv(f'./state_employment/{state}_employment.csv')
            # Build a datetime index from the 'Year' and 'Period' columns
            # NOTE(review): assumes 'Period' holds text that parses as a month - confirm against the source files
            state_df['date'] = pd.to_datetime(state_df['Year'].astype(str) + '-' + state_df['Period'])
            state_df.set_index('date', inplace=True)
            # Tag every row with its state so rows stay identifiable after merging
            state_df['State'] = state

            state_frames.append(state_df)

        # pd.concat raises on an empty list; fall back to an empty frame so a
        # cluster without members still produces a file, as before
        cluster_df = pd.concat(state_frames) if state_frames else pd.DataFrame()

        csv_name = f'KMeans-{rank}_Cluster-{cluster}_employment.csv'
        cluster_df.to_csv(f'{file_path}{csv_name}', index_label='date')

##### Read in Unemployment data with State column

In [None]:
# Spot-check one of the combined per-cluster files, using its saved 'date' column as the index
pd.read_csv('./state_employment/KMeans-5_Cluster-0_employment.csv', index_col='date')

#### Read in Quarterly Personal Income data
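The following cells transform the raw data into a usable format for modeling and EDA.
Please note: each cell modifies `personal_income` and assumes the state left by the previous cell, so re-running a single cell may raise errors (for example, setting the index on a column that has already become the index). To start over, re-run this section from its first cell.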

In [None]:
# Load the raw quarterly personal income file and discard the unnecessary columns
personal_income = pd.read_csv(
    './state_metrics/quarterly_personal_income_raw.csv'
).drop(columns=['GeoFips', 'LineCode', 'Description'])
personal_income.head()

In [None]:
# Discard every row that contains a missing value
personal_income = personal_income.dropna()

In [None]:
# Removing asterisk from Hawaii and Alaska (Noting that data prior to 1950 are not available)
# Only the leading run of word characters (up to three whitespace-separated
# words) is kept. The pattern is a raw string so the \w and \s classes reach
# the regex engine without relying on invalid string escape sequences.
personal_income['GeoName'] = personal_income['GeoName'].apply(lambda s: re.findall(r'\w+\s*\w+\s*\w+', s)[0])

In [None]:
# Use the place name as the row index
personal_income = personal_income.set_index('GeoName')

##### The following cells convert the quarter column labels into a datetime index

In [None]:
# Copy the current column labels into a Series so they can be cleaned and parsed
# NOTE(review): presumably these are quarter labels containing ':' (see the next cell) - confirm against the raw file
quarters = pd.Series(personal_income.columns)

In [None]:
# Swap the colon in each label for a dash ahead of datetime parsing
quarters = quarters.map(lambda label: label.replace(':', '-'))

In [None]:
# Parse the cleaned labels into pandas datetimes
quarters = pd.to_datetime(quarters)

In [None]:
# Flip rows and columns so each former column becomes a row that can take a datetime value
personal_income = personal_income.transpose()

In [None]:
# Attach the parsed datetimes as a new column; .values passes a bare array so
# the assignment is positional instead of being aligned on index labels
personal_income['Quarter'] = quarters.values

In [None]:
# Promote the 'Quarter' column to the row index
personal_income = personal_income.set_index('Quarter')

In [None]:
# Persist the reshaped table; the index is written under the 'Quarter' header.
# Plain string literal: the path has no placeholders, so no f-string is needed.
personal_income.to_csv('./state_metrics/Quarterly_Personal_Income.csv', index_label='Quarter')

In [None]:
# See Quarter Index
# Read back the file written by the previous cell. The name must match the
# capitalisation used when writing, otherwise the read fails on
# case-sensitive filesystems.
pd.read_csv('./state_metrics/Quarterly_Personal_Income.csv', index_col='Quarter')