# Dimensionality Reduction on Organization-Level Data

## This notebook runs only on the sample data that we selected from the completed datasets

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import seaborn as sns

# Load Data

In [None]:
# Read the three sample CSV files, in order, into `transaction`, `ticket`
# and `donor`.
transaction, ticket, donor = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../sample_data/other_samples/yearlyorgpayment_perperson.csv",
        "../sample_data/other_samples/org_yearly_tk_pperson_v2.csv",
        "../sample_data/other_samples/cleaned_year_donation.csv",
    )
)

# Data Cleaning 

## Ticket

In [None]:
# Keep only the ticket rows that have a value in every column.
ticket = ticket.dropna()

## Transaction

In [None]:
# Reshape the long (org_id, year) transaction table into one row per org_id,
# with one column per (measure, year) pair.
# The year is stored as text so it can be joined into the flattened column names.
transaction["year"] = transaction["year"].astype(str)
transaction = transaction.pivot(
    index=["org_id"],
    columns=["year"],
    values=[c for c in transaction.columns if c not in ["org_id", "year"]],
).reset_index()
# Flatten the two-level column index to "<measure>_<year>" and keep "org_id"
# as is. to_flat_index() yields the (measure, year) tuples directly and avoids
# Index.ravel(), whose return type has changed between pandas releases.
transaction.columns = [
    col[0] if col[0] == "org_id" else "_".join(col)
    for col in transaction.columns.to_flat_index()
]

In [None]:
# Cell output: display the reshaped transaction table for inspection.
transaction

In [None]:
# Keep org_id plus the columns whose last four characters are a year from
# 2015 through 2020, then drop every organization with a missing value.
trans_kept_year = [str(year) for year in range(2015, 2021)]
trans_kept_years_cols = [
    col for col in transaction.columns if col[-4:] in trans_kept_year
]
transaction_v2 = transaction[["org_id", *trans_kept_years_cols]].dropna()
transaction_v2

## Donor

In [None]:
# Reshape the long (org_id, year) donation table into one row per org_id,
# with one column per (measure, year) pair.
# The year is stored as text so it can be joined into the flattened column names.
donor["year"] = donor["year"].astype(str)
donor = donor.pivot(
    index=["org_id"],
    columns=["year"],
    values=[c for c in donor.columns if c not in ["org_id", "year"]],
).reset_index()
# Flatten the two-level column index to "<measure>_<year>" and keep "org_id"
# as is. to_flat_index() yields the (measure, year) tuples directly and avoids
# Index.ravel(), whose return type has changed between pandas releases.
donor.columns = [
    col[0] if col[0] == "org_id" else "_".join(col)
    for col in donor.columns.to_flat_index()
]

In [None]:
# Cell output: display the reshaped donor table for inspection.
donor

In [None]:
# Keep org_id plus the columns whose last four characters are a year from
# 2018 through 2020, then drop every organization with a missing value.
donor_kept_year = [str(year) for year in range(2018, 2021)]
donor_kept_years_cols = [
    col for col in donor.columns if col[-4:] in donor_kept_year
]
donor_v2 = donor[["org_id", *donor_kept_years_cols]].dropna()
donor_v2

# Join Data

In [None]:
# Join the transaction and ticket features on org_id (merge's default join type).
org_features = transaction_v2.merge(ticket, on="org_id")

In [None]:
# Put the donor features in front of the already-joined table, matching on org_id.
org_features = donor_v2.merge(org_features, on="org_id")

In [None]:
# Collect the joined-table columns whose last four characters are a year
# from 2015 through 2020, and show them.
kept_years = [str(year) for year in range(2015, 2021)]
kept_years_cols = [
    col for col in org_features.columns if col[-4:] in kept_years
]
kept_years_cols

In [None]:
# Cell output: list the column labels of the joined feature table.
org_features.columns

In [None]:
# Cell output: number of columns in the joined feature table.
len(org_features.columns)

In [None]:
# Reduce the joined table to org_id and the year-suffixed columns chosen above.
org_features = org_features[["org_id", *kept_years_cols]].copy()

In [None]:
# Discard every column that contains a missing value for at least one organization.
org_features = org_features.dropna(axis="columns")

In [None]:
# Save the feature table used for the all-data PCA to pca_all_cols.csv.
org_features.to_csv("pca_all_cols.csv")

# Generate PCA

In [None]:
def label_point(x, y, val, ax):
    """Write each point's label just above and to the right of it on ``ax``.

    Parameters
    ----------
    x, y : pandas.Series
        Point coordinates.
    val : pandas.Series
        Label for each point; converted with ``str`` before drawing.
    ax : object with a ``text(x, y, s, fontsize=...)`` method
        Target axes.

    The three series are combined column-wise (aligned on their index), and
    one ``ax.text`` call is made per row at ``(x + 0.1, y + 0.1)`` with
    font size 15.
    """
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    # itertuples keeps each column's own dtype. iterrows would coerce a whole
    # row to a single dtype, so an integer label next to float coordinates
    # would be drawn as "7.0" instead of "7".
    for px, py, label in points.itertuples(index=False, name=None):
        ax.text(px + .1, py + .1, str(label), fontsize=15)

def pca_plot(df_plot, figsize, xlim, ylim, title, figtitle):
    """Scatter PC1 against PC2, label each point with its org_id, and save a PNG.

    Parameters
    ----------
    df_plot : DataFrame
        Must provide ``PC1``, ``PC2`` and ``org_id`` columns.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    xlim, ylim : tuple
        ``(min, max)`` axis limits; also the extents of the dotted zero lines.
    title : str
        Axes title.
    figtitle : str
        File stem; the figure is written to ``PCA/<figtitle>.png``.
    """
    import os  # local import so this cell does not depend on the import cell

    fig, ax1 = plt.subplots(figsize=figsize)

    ax1.set_xlim(xlim[0], xlim[1])
    ax1.set_ylim(ylim[0], ylim[1])

    # Plot Principal Components 1 and 2 on ax1 explicitly instead of relying
    # on matplotlib's "current axes" state.
    sns.scatterplot(x="PC1", y="PC2", data=df_plot, s=100, ax=ax1)

    # Dotted reference lines through the origin.
    ax1.hlines(0, xlim[0], xlim[1], linestyles='dotted', colors='grey')
    ax1.vlines(0, ylim[0], ylim[1], linestyles='dotted', colors='grey')
    ax1.tick_params(axis='x', labelsize=15)
    ax1.tick_params(axis='y', labelsize=15)

    ax1.set_xlabel('First Principal Component', fontsize=15)
    ax1.set_ylabel('Second Principal Component', fontsize=15)
    ax1.set_title(title, fontsize=20)
    label_point(df_plot.PC1, df_plot.PC2, df_plot.org_id, ax1)

    # savefig does not create missing folders, so make sure the target exists.
    out_file = f'PCA/{figtitle}.png'
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    fig.savefig(out_file, bbox_inches='tight')

## All Data

### PCA Plot

In [None]:
# Standardize every feature column, run PCA on the result, and plot the
# organizations on the first two principal components.
pca = PCA()
data = org_features.drop("org_id", axis=1)
data = pd.DataFrame(scale(data), index=data.index, columns=data.columns)
# With default settings PCA() keeps min(n_rows, n_columns) components, so the
# score columns are named from the data's shape rather than a hard-coded 8
# (which raises as soon as the table does not yield exactly 8 components).
df_plot = pd.DataFrame(pca.fit_transform(data),
                       columns=[f"PC{i + 1}" for i in range(min(len(data), len(data.columns)))],
                       index=data.index)
# Attach the scores to the original table so org_id is available for labels.
sim_df = pd.concat([org_features, df_plot], axis=1)
pca_plot(sim_df,
         figsize=(10, 10),
         xlim=(-15, 15),
         ylim=(-15, 15),
         title="PCA of 8 Organizations with All Features by Year",
         figtitle="PCA_all_data")

### PCA Loadings

In [None]:
# Loadings table: one row per feature, one column (V1, V2, ...) per component.
# The column count follows the data's shape (default PCA() keeps
# min(n_rows, n_columns) components) instead of a hard-coded 8.
pca_loadings = pd.DataFrame(
    PCA().fit(data).components_.T,
    index=data.columns,
    columns=[f"V{i + 1}" for i in range(min(len(data), len(data.columns)))],
)

## Transaction 

In [None]:
def label_point_trans(x, y, val, ax):
    """Draw the label of every point on ``ax`` with per-organization offsets.

    ``x``, ``y`` and ``val`` are combined column-wise into one table; each row
    produces one ``ax.text`` call at font size 15. Labels listed in the offset
    table below are shifted by their own (dx, dy); every other label is placed
    at (+0.1, +0.1) from its point.
    """
    # (dx, dy) applied to the point position for specific labels.
    offsets = {
        'NCSU': (-1, .1),
        'BAYLOR': (.2, -0.15),
        'ARMY': (-2, -0.05),
        'COLORADO': (.2, -0.01),
    }
    table = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in table.iterrows():
        dx, dy = offsets.get(row['val'], (.1, .1))
        ax.text(row['x'] + dx, row['y'] + dy, str(row['val']), fontsize=15)

def pca_plot_trans(df_plot, figsize, xlim, ylim, title, figtitle):
    """Scatter PC1 against PC2 for the transaction PCA, label points, save a PNG.

    Parameters
    ----------
    df_plot : DataFrame
        Must provide ``PC1``, ``PC2`` and ``org_id`` columns.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    xlim, ylim : tuple
        ``(min, max)`` axis limits; also the extents of the dotted zero lines.
    title : str
        Axes title.
    figtitle : str
        File stem; the figure is written to ``PCA/<figtitle>.png``.

    Labels are placed by ``label_point_trans``.
    """
    import os  # local import so this cell does not depend on the import cell

    fig, ax1 = plt.subplots(figsize=figsize)

    ax1.set_xlim(xlim[0], xlim[1])
    ax1.set_ylim(ylim[0], ylim[1])

    # Plot Principal Components 1 and 2 on ax1 explicitly instead of relying
    # on matplotlib's "current axes" state.
    sns.scatterplot(x="PC1", y="PC2", data=df_plot, s=100, ax=ax1)

    # Dotted reference lines through the origin.
    ax1.hlines(0, xlim[0], xlim[1], linestyles='dotted', colors='grey')
    ax1.vlines(0, ylim[0], ylim[1], linestyles='dotted', colors='grey')
    ax1.tick_params(axis='x', labelsize=15)
    ax1.tick_params(axis='y', labelsize=15)

    ax1.set_xlabel('First Principal Component', fontsize=20)
    ax1.set_ylabel('Second Principal Component', fontsize=20)
    ax1.set_title(title, fontsize=20)
    label_point_trans(df_plot.PC1, df_plot.PC2, df_plot.org_id, ax1)

    # savefig does not create missing folders, so make sure the target exists.
    out_file = f'PCA/{figtitle}.png'
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    fig.savefig(out_file, bbox_inches='tight')

In [None]:
# PCA on the transaction features only: standardize the columns, project the
# organizations onto the principal components, and plot PC1 against PC2.
pca = PCA()
data_trans = transaction_v2.drop(columns="org_id")
data_trans = pd.DataFrame(
    scale(data_trans), index=data_trans.index, columns=data_trans.columns
)
# One score column per component: PC1 .. PCk with k = min(rows, columns).
df_plot_trans = pd.DataFrame(
    pca.fit_transform(data_trans),
    columns=[f"PC{k}" for k in range(1, min(data_trans.shape) + 1)],
    index=data_trans.index,
)
# Scores are placed next to the original table so org_id is available for labels.
sim_df_trans = pd.concat([transaction_v2, df_plot_trans], axis="columns")
pca_plot_trans(
    sim_df_trans,
    figsize=(13, 13),
    xlim=(-15, 15),
    ylim=(-6, 10),
    title="PCA of 21 Organizations with \nTransaction-Related Features by Year",
    figtitle="PCA_transaction",
)

In [None]:
# Save the table used for the transaction PCA to pca_transaction_cols.csv.
transaction_v2.to_csv("pca_transaction_cols.csv")

In [None]:
# Cell output: number of columns in transaction_v2 (org_id included).
len(transaction_v2.columns)

## Donor

In [None]:
def label_point_donor(x, y, val, ax):
    """Draw the label of every point on ``ax`` for the donor PCA plot.

    ``x``, ``y`` and ``val`` are combined column-wise into one table; each row
    produces one ``ax.text`` call. The 'FRESNO' label is shifted 1 unit to the
    left and drawn at font size 18; every other label is shifted 0.1 to the
    right at font size 15. All labels are raised by 0.1.
    """
    table = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, row in table.iterrows():
        name = row['val']
        if name == 'FRESNO':
            x_pos, size = row['x'] - 1, 18
        else:
            x_pos, size = row['x'] + .1, 15
        ax.text(x_pos, row['y'] + .1, str(name), fontsize=size)

def pca_plot_donor(df_plot, figsize, xlim, ylim, title, figtitle):
    """Scatter PC1 against PC2 for the donor PCA, label points, save a PNG.

    Parameters
    ----------
    df_plot : DataFrame
        Must provide ``PC1``, ``PC2`` and ``org_id`` columns.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    xlim, ylim : tuple
        ``(min, max)`` axis limits; also the extents of the dotted zero lines.
    title : str
        Axes title.
    figtitle : str
        File stem; the figure is written to ``PCA/<figtitle>.png``.

    Labels are placed by ``label_point_donor``.
    """
    import os  # local import so this cell does not depend on the import cell

    fig, ax1 = plt.subplots(figsize=figsize)

    ax1.set_xlim(xlim[0], xlim[1])
    ax1.set_ylim(ylim[0], ylim[1])

    # Plot Principal Components 1 and 2 on ax1 explicitly instead of relying
    # on matplotlib's "current axes" state.
    sns.scatterplot(x="PC1", y="PC2", data=df_plot, s=100, ax=ax1)

    # Dotted reference lines through the origin.
    ax1.hlines(0, xlim[0], xlim[1], linestyles='dotted', colors='grey')
    ax1.vlines(0, ylim[0], ylim[1], linestyles='dotted', colors='grey')
    ax1.tick_params(axis='x', labelsize=15)
    ax1.tick_params(axis='y', labelsize=15)

    ax1.set_xlabel('First Principal Component', fontsize=20)
    ax1.set_ylabel('Second Principal Component', fontsize=20)
    ax1.set_title(title, fontsize=20)
    label_point_donor(df_plot.PC1, df_plot.PC2, df_plot.org_id, ax1)

    # savefig does not create missing folders, so make sure the target exists.
    out_file = f'PCA/{figtitle}.png'
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    fig.savefig(out_file, bbox_inches='tight')

In [None]:
# Cell output: list the column labels of donor_v2.
donor_v2.columns

In [None]:
# Cell output: number of columns in donor_v2 (org_id included).
len(donor_v2.columns)

In [None]:
# PCA on the donor features only: standardize the columns, project the
# organizations onto the principal components, and plot PC1 against PC2.
pca = PCA()
data_donor = donor_v2.drop(columns="org_id")
data_donor = pd.DataFrame(
    scale(data_donor), index=data_donor.index, columns=data_donor.columns
)
# One score column per component: PC1 .. PCk with k = min(rows, columns).
df_plot_donor = pd.DataFrame(
    pca.fit_transform(data_donor),
    columns=[f"PC{k}" for k in range(1, min(data_donor.shape) + 1)],
    index=data_donor.index,
)
# Scores are placed next to the original table so org_id is available for labels.
sim_df_donor = pd.concat([donor_v2, df_plot_donor], axis="columns")
pca_plot_donor(
    sim_df_donor,
    figsize=(10, 13),
    xlim=(-8, 10),
    ylim=(-7, 8),
    title="PCA of 12 Organizations with \n Donation-Related Features by Year",
    figtitle="PCA_donor",
)

In [None]:
# Save the table used for the donor PCA to pca_donor_col.csv.
donor_v2.to_csv("pca_donor_col.csv")

In [None]:
# Cell output: list the column labels of donor_v2.
donor_v2.columns


## Ticket

In [None]:
# PCA on the ticket features only: standardize the columns, project the
# organizations onto the principal components, and plot PC1 against PC2.
pca = PCA()
data_ticket = ticket.drop(columns="org_id")
data_ticket = pd.DataFrame(
    scale(data_ticket), index=data_ticket.index, columns=data_ticket.columns
)
# One score column per component: PC1 .. PCk with k = min(rows, columns).
df_plot_ticket = pd.DataFrame(
    pca.fit_transform(data_ticket),
    columns=[f"PC{k}" for k in range(1, min(data_ticket.shape) + 1)],
    index=data_ticket.index,
)
# Scores are placed next to the original table so org_id is available for labels.
sim_df_ticket = pd.concat([ticket, df_plot_ticket], axis="columns")
pca_plot(
    sim_df_ticket,
    figsize=(15, 10),
    xlim=(-10, 10),
    ylim=(-5, 10),
    title="PCA of 15 Organizations with Ticketing-Related Features by Year",
    figtitle="PCA_ticket",
)

In [None]:
# Save the table used for the ticket PCA to pca_ticket_col.csv.
ticket.to_csv("pca_ticket_col.csv")

In [None]:
# Cell output: number of columns in ticket (org_id included).
len(ticket.columns)