# Dimensionality Reduction Analysis on Sample Accounts Data

### This notebook runs only the sample data from USC & OKLAHOMA

## Reload the saved combined_df and yearly_transac

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import seaborn as sns

import plotly.express as px

## Read the sample data from USC and OKLAHOMA to check the sample output

### Plot only USC account info (without yearly transaction info)

In [None]:
# Load the USC sample into a DataFrame. Judging by the file name this is the
# variant without yearly transaction features. The relative path is resolved
# against the notebook's working directory.
USC = pd.read_csv("../sample_data/other_samples/USC_without_transac_features.csv")

In [None]:
# Feature matrix for PCA: drop the first four columns and keep the rest
# (presumably the leading columns are identifiers/metadata -- TODO confirm).
USC_pca = USC.iloc[:, 4:].copy()

# Standardize every feature (StandardScaler defaults: zero mean, unit variance)
# so that features on large scales do not dominate the components.
sc = StandardScaler()
usc_normalized = sc.fit_transform(USC_pca)

# Project the standardized features onto the first two principal components.
pca2d = PCA(n_components=2)
principal_components2d = pca2d.fit_transform(usc_normalized)

pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principal_components2d, columns=pc_columns)

# One point per row of USC, colored by `is_donor`; `account_id` is the hover title.
scatter_options = dict(
    x=principalDf[pc_columns[0]],
    y=principalDf[pc_columns[1]],
    color=USC.is_donor,
    hover_name=USC['account_id'],
    labels={'color': 'Is Donor'},
)
fig_pca2d = px.scatter(principal_components2d, **scatter_options)

# Kept as the last expression so the notebook renders the returned figure.
layout_options = dict(
    width=800,
    xaxis_title='Principal component 1',
    yaxis_title='Principal component 2',
    title_text='PCA 2D for only USC Accounts (without yearly transacion info)',
)
fig_pca2d.update_layout(**layout_options)

In [None]:
# Loadings table: one row per input feature, one column per principal component.
# `components_` has shape (n_components, n_features), hence the transpose below.
loadings = pca2d.components_

# Label the columns by the number of fitted components. The previous
# `pca2d.n_features_` counted input features (the cell only worked because
# `zip` truncated the label list) and that attribute was removed in
# scikit-learn 1.2, so it raises AttributeError on current versions.
num_pc = pca2d.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]

loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(USC_pca.columns.values, name='variable'),
)
loadings_df

### PCA for USC's Accounts (with yearly transaction info)

In [None]:
# Load the USC sample into a DataFrame. Judging by the file name this is the
# variant that includes all features (i.e. with the yearly transaction info
# named in the section heading). The relative path is resolved against the
# notebook's working directory.
USC_new = pd.read_csv("../sample_data/other_samples/USC_all_features.csv")

In [None]:
# Feature matrix for PCA: drop the first four columns and keep the rest
# (presumably the leading columns are identifiers/metadata -- TODO confirm).
USC_pca = USC_new.iloc[:, 4:].copy()

# Standardize every feature (StandardScaler defaults: zero mean, unit variance)
# so that features on large scales do not dominate the components.
sc = StandardScaler()
usc_normalized = sc.fit_transform(USC_pca)

# Project the standardized features onto the first two principal components.
pca2d = PCA(n_components=2)
principal_components2d = pca2d.fit_transform(usc_normalized)

pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principal_components2d, columns=pc_columns)

# One point per row of USC_new, colored by `in_priority_prog`;
# `account_id` is the hover title.
scatter_options = dict(
    x=principalDf[pc_columns[0]],
    y=principalDf[pc_columns[1]],
    color=USC_new.in_priority_prog,
    hover_name=USC_new['account_id'],
    labels={'color': 'Enrolled in Priority Program'},
)
fig_pca2d = px.scatter(principal_components2d, **scatter_options)

# Kept as the last expression so the notebook renders the returned figure.
layout_options = dict(
    width=800,
    xaxis_title='Principal component 1',
    yaxis_title='Principal component 2',
    title_text='PCA 2D for only USC Accounts (with yearly transacion info)',
)
fig_pca2d.update_layout(**layout_options)

In [None]:
# Loadings table: one row per input feature, one column per principal component.
# `components_` has shape (n_components, n_features), hence the transpose below.
loadings = pca2d.components_

# Label the columns by the number of fitted components. The previous
# `pca2d.n_features_` counted input features (the cell only worked because
# `zip` truncated the label list) and that attribute was removed in
# scikit-learn 1.2, so it raises AttributeError on current versions.
num_pc = pca2d.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]

loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(USC_pca.columns.values, name='variable'),
)
# Show only the first 50 features to keep the output readable.
loadings_df.iloc[:50, :]
# Author's note: the two-component projection appears to perform poorly here
# (explained variance ~= 0.1).

### Plot only OKLAHOMA's account info (without yearly transaction info)

In [None]:
# Load the OKLAHOMA sample into a DataFrame. Judging by the file name this is
# the variant without yearly transaction features.
# NOTE(review): this path goes through "../stats170ab/" whereas the other
# read_csv calls in this notebook use "../sample_data/..." directly -- confirm
# the file really lives here, otherwise this raises FileNotFoundError.
OKL = pd.read_csv("../stats170ab/sample_data/other_samples/OKLAHOMA_without_transac_features.csv")

In [None]:
# Feature matrix for PCA: drop the first four columns and keep the rest
# (presumably the leading columns are identifiers/metadata -- TODO confirm).
OKL_pca = OKL.iloc[:, 4:].copy()

# Standardize every feature (StandardScaler defaults: zero mean, unit variance)
# so that features on large scales do not dominate the components.
sc = StandardScaler()
okl_normalized = sc.fit_transform(OKL_pca)

# Project the standardized features onto the first two principal components.
pca2d = PCA(n_components=2)
principal_components2d = pca2d.fit_transform(okl_normalized)

pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principal_components2d, columns=pc_columns)

# One point per row of OKL, colored by `is_donor`; `account_id` is the hover title.
scatter_options = dict(
    x=principalDf[pc_columns[0]],
    y=principalDf[pc_columns[1]],
    color=OKL.is_donor,
    hover_name=OKL['account_id'],
    labels={'color': 'Is Donor'},
)
fig_pca2d = px.scatter(principal_components2d, **scatter_options)

# Kept as the last expression so the notebook renders the returned figure.
layout_options = dict(
    width=800,
    xaxis_title='Principal component 1',
    yaxis_title='Principal component 2',
    title_text='PCA 2D for only OKLAHOMA Accounts (without yearly transacion info)',
)
fig_pca2d.update_layout(**layout_options)

In [None]:
# Loadings table: one row per input feature, one column per principal component.
# `components_` has shape (n_components, n_features), hence the transpose below.
loadings = pca2d.components_

# Label the columns by the number of fitted components. The previous
# `pca2d.n_features_` counted input features (the cell only worked because
# `zip` truncated the label list) and that attribute was removed in
# scikit-learn 1.2, so it raises AttributeError on current versions.
num_pc = pca2d.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]

loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(OKL_pca.columns.values, name='variable'),
)
loadings_df

### Plot only OKLAHOMA's account info (with yearly transaction info)

In [None]:
# Load the OKLAHOMA sample into a DataFrame. Judging by the file name this is
# the variant that includes all features (i.e. with the yearly transaction
# info named in the section heading). The relative path is resolved against
# the notebook's working directory.
OKL_new = pd.read_csv("../sample_data/other_samples/OKLAHOMA_all_features.csv")

In [None]:
# Feature matrix for PCA: drop the first four columns and keep the rest
# (presumably the leading columns are identifiers/metadata -- TODO confirm).
OKL_pca = OKL_new.iloc[:, 4:].copy()

# Standardize every feature (StandardScaler defaults: zero mean, unit variance)
# so that features on large scales do not dominate the components.
sc = StandardScaler()
okl_normalized = sc.fit_transform(OKL_pca)

# Project the standardized features onto the first two principal components.
pca2d = PCA(n_components=2)
principal_components2d = pca2d.fit_transform(okl_normalized)

pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(principal_components2d, columns=pc_columns)

# One point per row of OKL_new, colored by `in_priority_prog`;
# `account_id` is the hover title.
scatter_options = dict(
    x=principalDf[pc_columns[0]],
    y=principalDf[pc_columns[1]],
    color=OKL_new.in_priority_prog,
    hover_name=OKL_new['account_id'],
    labels={'color': 'Enrolled in Priority Program'},
)
fig_pca2d = px.scatter(principal_components2d, **scatter_options)

# Kept as the last expression so the notebook renders the returned figure.
layout_options = dict(
    width=800,
    xaxis_title='Principal component 1',
    yaxis_title='Principal component 2',
    title_text='PCA 2D for only OKLAHOMA Accounts (with yearly transacion info)',
)
fig_pca2d.update_layout(**layout_options)

In [None]:
# Loadings table: one row per input feature, one column per principal component.
# `components_` has shape (n_components, n_features), hence the transpose below.
loadings = pca2d.components_

# Label the columns by the number of fitted components. The previous
# `pca2d.n_features_` counted input features (the cell only worked because
# `zip` truncated the label list) and that attribute was removed in
# scikit-learn 1.2, so it raises AttributeError on current versions.
num_pc = pca2d.n_components_
pc_list = ["PC" + str(i) for i in range(1, num_pc + 1)]

loadings_df = pd.DataFrame(
    loadings.T,
    columns=pc_list,
    index=pd.Index(OKL_pca.columns.values, name='variable'),
)
loadings_df