Beginning of analysis

 

In [None]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, datetime
import os, sys 
import io
import ee


In [None]:
# Have pandas warn whenever an assignment may be landing on a copy instead of the original dataframe
pd.options.mode.chained_assignment = 'warn'

In [None]:
# Raise the display limits so dataframes are shown without being truncated
display_limits = {
    'display.max_rows': 150,
    'display.max_columns': 25,
    'display.width': 768,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

In [None]:
# Mount Google Drive so the data files stored there can be read from this notebook.
# `google.colab` only exists inside a Colab runtime, so this cell will not run elsewhere.
# NOTE(review): `files` is not used in the visible cells -- confirm it is needed.
from google.colab import files
from google.colab import drive
# After mounting, the Drive contents are reachable under /content/drive
drive.mount('/content/drive')

In [None]:

# Location of the cleaned loan CSV on the mounted Google Drive
csv_path = "/content/drive/My Drive/p_cat_clean.csv"

In [None]:
# Reload the cleaned dataframe from its CSV backup. A CSV does not fully preserve
# dtypes, so the ones that matter are declared again on import.
column_types = {'Jobs Saved': int, 'NAICS Code': str, 'NAICS Category': str}
df = pd.read_csv(
    csv_path,
    dtype=column_types,
    parse_dates=[10],  # the column at position 10 is parsed as a date
)
df.sample()

In [None]:
# Remove any leading tab characters from the bank names
df['Bank']=df['Bank'].str.lstrip('\t')

In [None]:

# Drop the rows whose 'Bank' field holds the stray value '05/21/2020 '
# (presumably a misparsed record -- a date where a bank name belongs).
# The filter must be applied to the whole frame: assigning the filtered frame
# to the single 'Bank' column does not remove any rows.
# .copy() makes the result an independent frame, so the columns added to `df`
# later on do not trigger chained-assignment warnings.
df = df[df['Bank'] != '05/21/2020 '].copy()

In [None]:
# Column dtypes of the working dataframe
df.dtypes

In [None]:
# (rows, columns) of the working dataframe
df.shape

In [None]:
nat_loan_stats=df['Loan Amount'].describe()
nat_loan_stats

In [None]:

nat_loan_stats.plot(kind='bar', subplots=True, color='red',figsize=(12,10), )
left  = 0.145  # the left side of the subplots of the figure
right = 0.9    # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 1      # the top of the subplots of the figure
wspace = 0.3   # the amount of width reserved for blank space between subplots
hspace = 0.2   # the amount of height reserved for white space between subplots
# These two can be called on 'fig' instead of 'plt' too
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                wspace=wspace, hspace=hspace)
plt.savefig('nat_loans.png')


In [None]:
# Mean loan amount over all loans
national_loan_avg=df['Loan Amount'].mean()
national_loan_avg

In [None]:
# Median loan amount over all loans
national_loan_med=df['Loan Amount'].median()
national_loan_med

In [None]:
# Total of all loan amounts
national_loan_sum=df['Loan Amount'].sum()
national_loan_sum

In [None]:
# Number of non-null loan amounts
national_loan_count=df['Loan Amount'].count()

In [None]:
national_loan_sum=df['Jobs Saved'].sum()
national_loan_sum

In [None]:
nat_pct_dif=national_loan_avg/national_loan_med * 100
nat_pct_dif

In [None]:
nat_summary = pd.DataFrame({
    'Loan Avg': national_loan_avg,
    'Loan Median': national_loan_med,
    'Loan Sum': national_loan_sum,
    'Loan Count': national_loan_count,
    'Jobs Saved': national_loan_sum,
    'Avg over Median': nat_pct_dif,
},index=[0])
nat_summary.head(3)

In [None]:
# Turn the numeric columns into display strings. After this cell the columns
# hold text, so it cannot be re-run on the already formatted table.
summary_formats = {
    'Loan Avg': "${:,.2f}",
    'Loan Median': "${:,.2f}",
    'Loan Sum': "{:,}",
    'Loan Count': "{:,}",
    'Jobs Saved': "{:,}",
    'Avg over Median': "{:,.2f}%",
}
for column, pattern in summary_formats.items():
    nat_summary[column] = nat_summary[column].map(pattern.format)
nat_summary






In [None]:
top_banks=df.loc[:,'Bank',].groupby(df.loc[:,'Loan Amount']).sum()


In [None]:
top_banks

In [None]:
# Total loan amount per bank. The column is selected before aggregating so
# only 'Loan Amount' is summed; summing every column first is wasteful and
# fails on columns that cannot be summed (e.g. the parsed date column).
top_bank_sums=df.groupby('Bank')['Loan Amount'].sum()
top_bank_sums

In [None]:
# Median loan amount per bank. The column is selected before aggregating so
# the median is only taken over 'Loan Amount'; taking it over every column
# fails on the non-numeric columns in recent pandas versions.
top_bank_median=df.groupby('Bank')['Loan Amount'].median()
top_bank_median

In [None]:
# Mean loan amount per bank. The column is selected before aggregating so
# the mean is only taken over 'Loan Amount'; taking it over every column
# fails on the non-numeric columns in recent pandas versions.
top_bank_average=df.groupby('Bank')['Loan Amount'].mean()
top_bank_average

In [None]:
# Per-bank table: one row per bank, one column per statistic
bank_columns = {
    'Loan Sum': top_bank_sums,
    'Average Loan': top_bank_average,
    'Median Loan': top_bank_median,
}
banks_summary_df = pd.DataFrame(bank_columns)
banks_summary_df.head()


In [None]:
# Render every statistic as a dollar string; the columns hold text afterwards
for money_column in ('Loan Sum', 'Average Loan', 'Median Loan'):
    banks_summary_df[money_column] = banks_summary_df[money_column].map("${:,.2f}".format)
banks_summary_df.head()


In [None]:
# Number of loans for every (State, Bank) pair, then the 15 largest pairs overall
bank_counts_by_state = df.groupby('State')['Bank'].value_counts()
top_bks_state = bank_counts_by_state.nlargest(15)
top_bks_state.head(1)

In [None]:
loans_cat=df.groupby('description')['Loan Amount'].sum()
loans_cat.head()

In [None]:
jobs_cat=df.groupby('description')['Jobs Saved'].sum()
jobs_cat.head()

In [None]:
median_cat=df.groupby('description')['Loan Amount'].median()
median_cat.head()

In [None]:
mean_cat=df.groupby('description')['Loan Amount'].mean()
mean_cat.head()

In [None]:
# Per-category table, indexed by 'description'
category_columns = {
    'Jobs Saved': jobs_cat,
    'Median Loan': median_cat,
    'Average Loan': mean_cat,
    'Total Loans': loans_cat,
}
category_summary_df = pd.DataFrame(category_columns)
category_summary_df.head()

In [None]:
# Convert the numeric columns to display strings (the columns hold text afterwards)
category_formats = {
    'Jobs Saved': "{:,}",
    'Median Loan': "${:,.2f}",
    'Average Loan': "${:,.2f}",
    'Total Loans': "${:,.2f}",
}
for column, pattern in category_formats.items():
    category_summary_df[column] = category_summary_df[column].map(pattern.format)
category_summary_df.head()

In [None]:
df.to_csv('category_summary.csv', index=0)
!cp category_summary.csv "drive/My Drive/"

In [None]:
# Total loan amount per state; show the eight largest
sum_by_state = df.groupby('State')['Loan Amount'].agg('sum')
sum_by_state.nlargest(8)

In [None]:
# Label every loan with the quantile bin its amount falls in.
# NOTE(review): there are nine bins, not ten -- the edges jump from .80 to 1,
# so the bin labelled '90%' covers the 80th-100th percentile. Confirm whether
# a .90 edge (and a tenth label) was intended before relying on the labels.
bin_labels_10 = ['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%']
percentile_edges = [0, .10, .20, .30, .40, .50, .60, .70, .80, 1]
df['Loan Percentile'] = pd.qcut(df['Loan Amount'], q=percentile_edges, labels=bin_labels_10)
df.head()

In [None]:
# Label every loan with the quartile its amount falls in. Each label is the
# upper bound of its bin: '25%' is the lowest quarter, '100%' the highest.
bin_labels_4 = ['25%', '50%', '75%', '100%']
df['Loan Quartile'] = pd.qcut(df['Loan Amount'],
                              q=[0, .25, .50, .75, 1],
                              labels=bin_labels_4)
df.head()

In [None]:
# Number of loans in each quartile bin
df['Loan Quartile'].value_counts()

In [None]:
# Spot-check one random row of the dataframe
df.sample()

In [None]:
# Loan amounts at the 25th, 50th, 75th and 100th percentile
quartile_points = [25, 50, 75, 100]
loan_qt = np.percentile(df['Loan Amount'], quartile_points)
loan_qt


In [None]:
high_pct = df[(df["Loan Percentile"] == '90%')]

high_pct.head()


In [None]:
high_pct.shape

In [None]:
df.to_csv('top_percentile_banks.csv', index=0)
!cp top_percentile_banks.csv "drive/My Drive/"

In [None]:
df.groupby('State')['Race'].value_counts()


In [None]:
# df.groupby('Race', 'Loan Amount').quantile()
race_qt=df.groupby('Race')['Loan Amount'].quantile()
race_qt

In [None]:
race_qt.plot(kind='pie', subplots=True, figsize=(12,10), )
left  = 0.125  # the left side of the subplots of the figure
right = 0.9    # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 0.9      # the top of the subplots of the figure
wspace = 0.2   # the amount of width reserved for blank space between subplots
hspace = 0.2   # the amount of height reserved for white space between subplots
# These two can be called on 'fig' instead of 'plt' too
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                wspace=wspace, hspace=hspace)
plt.savefig('race_quantiles_loans.png')

In [None]:
race_qt.head()

In [None]:
vt_gen_qt=df.groupby(['Race', 'Veteran'])['Loan Amount'].quantile()
vt_gen_qt

In [None]:
vt_cat=df.groupby(['Race', 'Veteran'])['description'].value_counts()
vt_cat

In [None]:
df.sample()

In [None]:
rc_jbs=df.groupby(['Race', 'Gender'])['Jobs Saved'].sum()
rc_jbs

In [None]:
race_median=df.groupby(['Race', 'Gender'])['Loan Amount'].median()
race_median

In [None]:
race_mean=df.groupby(['Race', 'Gender'])['Loan Amount'].mean()
race_mean

In [None]:
df.shape

In [None]:
rc_gen_qt=df.groupby(['Race', 'Gender'])['Loan Amount'].quantile()


In [None]:
counts_by_race=df.groupby(['Race', 'Gender'])['Loan Amount'].count()
counts_by_race

In [None]:
# Table with one row per Race/Gender pair and one column per statistic.
# 'Quantile' is the 0.5 quantile, so it repeats the 'Median' column.
race_columns = {
    'Count': counts_by_race,
    'Quantile': rc_gen_qt,
    'Mean': race_mean,
    'Median': race_median,
    'Jobs': rc_jbs,
}
race_summary = pd.DataFrame(race_columns)
race_summary.head()

In [None]:
race_summary['Count'] = race_summary.loc[:,'Count'].map("{:,}".format)
race_summary['Jobs'] = race_summary.loc[:,'Jobs'].map("{:,}".format)
race_summary['Quantile'] = race_summary.loc[:,'Quantile'].map("${:,.2f}".format)
race_summary['Mean'] = race_summary.loc[:,'Mean'].map("${:,.2f}".format)
race_summary['Median'] = race_summary.loc[:,'Median'].map("${:,.2f}".format)


In [None]:
race_summary.head()

In [None]:
df.to_csv('race_summary.csv', index=0)
!cp race_summary.csv "drive/My Drive/"

In [None]:
df.sample()


In [None]:
cat_sum=df.groupby('description')['Jobs Saved'].sum()
cat_sum.head()

In [None]:
# Pie chart of jobs saved per category, written to jobs_saved_cat.png
cat_sum.plot.pie(subplots=True, figsize=(12, 10))
subplot_layout = {
    'left': 0.145,   # left side of the subplots of the figure
    'right': 0.9,    # right side of the subplots of the figure
    'bottom': 0.1,   # bottom of the subplots of the figure
    'top': 1,        # top of the subplots of the figure
    'wspace': 0.3,   # width reserved for blank space between subplots
    'hspace': 0.2,   # height reserved for white space between subplots
}
plt.subplots_adjust(**subplot_layout)
plt.savefig('jobs_saved_cat.png')

In [None]:
cat_loan=df.groupby('description')['Loan Amount'].sum()
cat_loan.head()

In [None]:
cat_loan.plot(kind='pie', subplots=True, figsize=(12,10), )
left  = 0.145  # the left side of the subplots of the figure
right = 0.9    # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 1      # the top of the subplots of the figure
wspace = 0.3   # the amount of width reserved for blank space between subplots
hspace = 0.2   # the amount of height reserved for white space between subplots
# These two can be called on 'fig' instead of 'plt' too
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                wspace=wspace, hspace=hspace)
plt.savefig('loans_cat.png')

In [None]:
df.to_csv('people_ppp_super.csv', index=0)
!cp people_ppp_super.csv "drive/My Drive/"