Begining of analysis

 

In [None]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, datetime
import os, sys 
import io
import ee


In [None]:
## set up pandas dataframe using set_options to warn when we are working on a copy instead of ooriginal dataframe
pd.set_option('mode.chained_assignment','warn')

In [None]:
# dataframe will display without being truncated
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 25)
pd.set_option('display.width', 768)

In [None]:
#mounting the google drive to access the files
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#import csv that was cleaned up in cat_desc_normalize.ipynb
csv_path = "/content/drive/My Drive/p_cat_clean.csv"


In [None]:
#list of columns
cols=list(pd.read_csv(csv_path, nrows=1))
cols


In [None]:
# backup of clean dataframe, types will not be fully preserved so on import reassign, but easier with unconformity removed
df = pd.read_csv(csv_path, dtype={'Jobs Saved':int, 'NAICS Code': str, 'NAICS Category': str, 'Zip': str, }, parse_dates=[10] )
df.sample()

In [None]:
#found some weird characters
df['Bank']=df['Bank'].str.lstrip('\t')

In [None]:
#types are correct
df.dtypes

In [None]:
df.shape

In [None]:
nat_loan_stats=df['Loan Amount'].describe()
nat_loan_stats

In [None]:
national_loan_avg=df['Loan Amount'].mean()
national_loan_avg

In [None]:
national_loan_med=df['Loan Amount'].median()
national_loan_med

In [None]:
national_loan_sum=df['Loan Amount'].sum()
national_loan_sum

In [None]:
national_loan_count=df['Loan Amount'].count()

In [None]:
national_loan_sum=df['Jobs Saved'].sum()
national_loan_sum

In [None]:
nat_pct_dif=national_loan_avg/national_loan_med * 100
nat_pct_dif

In [None]:
#summary table of stats
nat_summary = pd.DataFrame({
    'Loan Avg': national_loan_avg,
    'Loan Median': national_loan_med,
    'Loan Sum': national_loan_sum,
    'Loan Count': national_loan_count,
    'Jobs Saved': national_loan_sum,
    'Avg over Median': nat_pct_dif,
},index=[0])
nat_summary.head(3)

# Summary Table

Loan Avg : $43,003.12 | Median Loan : $30,000 | Loan Sum: $10,491,212,067| Loan Count: 243,964 | Jobs Saved: 1,505,064 | Avg loan amount over Median: 143.34%


In [None]:
nat_summary['Loan Avg'] = nat_summary.loc[:,'Loan Avg'].map("${:,.2f}".format)
nat_summary['Loan Median'] = nat_summary.loc[:,'Loan Median'].map("${:,.2f}".format)
nat_summary['Loan Sum'] = nat_summary.loc[:,'Loan Sum'].map("{:,}".format)
nat_summary['Loan Count'] = nat_summary.loc[:,'Loan Count'].map("{:,}".format)
nat_summary['Jobs Saved'] = nat_summary.loc[:,'Jobs Saved'].map("{:,}".format)
nat_summary['Avg over Median'] = nat_summary.loc[:,'Avg over Median'].map("{:,.2f}%".format)
nat_summary






In [None]:
#who are the top lenders by amount? Creating a summary table that can be exported to csv for analysis in SQL/ Tableau later
top_bank_sums=df.groupby('Bank').sum()['Loan Amount']
top_bank_sums

In [None]:
top_bank_median=df.groupby('Bank').median()['Loan Amount']
top_bank_median

In [None]:
top_bank_average=df.groupby('Bank').mean()['Loan Amount']
top_bank_average

In [None]:
banks_summary_df=pd.DataFrame({
    'Loan Sum' : top_bank_sums,
    'Average Loan' : top_bank_average,
    'Median Loan' : top_bank_median
  
})
banks_summary_df.head()


In [None]:
banks_summary_df['Loan Sum'] = banks_summary_df.loc[:,'Loan Sum'].map("${:,.2f}".format)
banks_summary_df['Average Loan'] = banks_summary_df.loc[:,'Average Loan'].map("${:,.2f}".format)
banks_summary_df['Median Loan'] = banks_summary_df.loc[:,'Median Loan'].map("${:,.2f}".format)
banks_summary_df.sample(30)


In [158]:
banks_summary_df.to_csv('banks_summary_df.csv', index=True)
!cp banks_summary_df.csv "drive/My Drive/"

In [None]:
result=8142/243964
result

In [None]:
#some of the top institutions by state that considering they are regional have a relatively large percentage of the loans
# for example OH - Huntington bank did 3.3% of the loan count for this dataset Mississippi
top_bks_state=df.groupby('State')['Bank'].value_counts().nlargest(15)
top_bks_state

In [None]:
top_bks_state.to_csv('bks_by_state.csv', index=True)
!cp top_bks_state.csv "drive/My Drive/"

In [None]:
loans_cat=df.groupby('description')['Loan Amount'].sum()
loans_cat.head()

In [None]:
jobs_cat=df.groupby('description')['Jobs Saved'].sum()
jobs_cat.head()

In [None]:
median_cat=df.groupby('description')['Loan Amount'].median()
median_cat.head()

In [None]:
mean_cat=df.groupby('description')['Loan Amount'].mean()
mean_cat.head()

In [None]:
category_summary_df=pd.DataFrame({
    'Jobs Saved' : jobs_cat,
    'Median Loan' : median_cat,
    'Average Loan' : mean_cat,
    'Total Loans': loans_cat,
  
})
category_summary_df.head()

No surprise - Hospitality industry seemed to take the biggest hit with Construction and Health Care loans the next highest number of jobs saved. Seemingly unexpected are Professional and Technical Services and Waste Management which may deserve a "look under the hood".

In [None]:
category_summary_df['Jobs Saved'] = category_summary_df.loc[:,'Jobs Saved'].map("{:,}".format)
category_summary_df['Median Loan'] = category_summary_df.loc[:,'Median Loan'].map("${:,.2f}".format)
category_summary_df['Average Loan'] = category_summary_df.loc[:,'Average Loan'].map("${:,.2f}".format)
category_summary_df['Total Loans'] = category_summary_df.loc[:,'Total Loans'].map("${:,.2f}".format)
category_summary_df.head(30)

In [None]:
#save to csv and will convert to a table in SQL / Tableau for further analysis 

category_summary_df.to_csv('category_summary.csv', index=True)
!cp category_summary.csv "drive/My Drive/"

In [None]:
sum_by_state=df.groupby('State',)['Loan Amount'].sum() 
sum_by_state.nlargest(8)

In [None]:
# df['Loan PCL']=pd.qcut(df['Loan Amount'], q=10)

bin_labels_10= ['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%' ]
df['Loan Percentile'] = pd.qcut(df['Loan Amount'],
                              q=[0,  .10, .20, .30, .40, .50, .60, .70, .80, 1],
                              labels=bin_labels_10)
df.sample()

In [None]:
bin_labels_4 = ['25%', '45%', '50%', '75%', ]
df['Loan Quartile'] = pd.qcut(df['Loan Amount'],
                              q=[0, .25, .50, .75, 1],
                              labels=bin_labels_4)
df.sample()

In [228]:
df['DayNUM']= pd.to_datetime(df['Date']).dt.dayofweek
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,description,Title,Loan Percentile,Loan Quartile,DayNUM,Weekday
234671,14000.0,MONROE,NY,10950,45411,Corporation,White,,Non-Veteran,2,2020-04-13,KeyBank National Association,45,Retail Trade,Electronic Shopping and Mail-Order Houses,30%,45%,0,Mon


In [None]:
df['DayNUM'].value_counts()

The 90th percentile of loans account for 20% of total in terms of loan amount


In [None]:
df['Loan Quartile'].value_counts()

In [None]:
day_labels= {'0': 'Mon', '1':'Tues', '2' : 'Wed', '3':'Thurs', '4': 'Fri', '5':'Sat', '6' :'Sun'}
df['Weekday'] = df['DayNUM'].astype(str).replace(day_labels)
df.sample()

In [238]:
df['Business Type'] = df['Business Type'].replace({'Limited  Liability Company(LLC)':'LLC', 
                                                   'Subchapter S Corporation': 'S Corp', 
                                                   'Self-Employed Individuals':'Self Employed', 
                                                   'Independent Contractors' : 'Self Employed',
                                                   'Non-Profit Organization': 'Non-Profit',
                                                   'Limited Liability Partnership': 'Partnership',
                                                   'Joint Venture':'Partnership'
                                                   })
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,description,Title,Loan Percentile,Loan Quartile,DayNUM,Weekday
188214,5200.0,BILLINGS,MT,59105,42312,Corporation,White,,Non-Veteran,2,2020-04-30,Glacier Bank,42,Wholesale Trade,Motor Vehicle Supplies and New Parts Merchant ...,10%,25%,3,Thurs


In [None]:
df['Business Type'].value_counts()

In [213]:
df['Gender'].value_counts()

Male Owned      179453
Female Owned     64511
Name: Gender, dtype: int64

In [219]:
df['Gender'] = df['Gender'].replace({'Male Owned':'Male', 'Female Owned': 'Female', })
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,description,Title,Loan Percentile,Loan Quartile,DayNUM,Weekday
115405,100100.0,BILOXI,MS,39530,54111,Partnership,White,,Non-Veteran,7,2020-04-10,Community Bank of Mississippi,54,Professional Scientific and Technical Services,Offices of LawyersT,90%,75%,4,Fri


In [214]:
df['Race'].value_counts()

White                               184816
Asian                                30802
Hispanic                             19594
Black or African American             7373
American Indian or Alaska Native      1296
Puerto Rican                            76
Multi Group                              4
Eskimo & Aleut                           3
Name: Race, dtype: int64

In [216]:
df['Race'] = df['Race'].replace({'Black or African American':'Black', 'American Indian or Alaska Native': 'Native American', 'Eskimo & Aleut':'Eskimo'})
df.sample()

Unnamed: 0,Loan Amount,City,State,Zip,NAICS Code,Business Type,Race,Gender,Veteran,Jobs Saved,Date,Bank,NAICS Category,description,Title,Loan Percentile,Loan Quartile,DayNUM,Weekday
238789,73500.0,SEYMOUR,CT,6483,51113,Corporation,White,Male Owned,Non-Veteran,8,2020-04-04,Ion Bank,51,Information,Book PublishersT,80%,75%,5,Sat


In [None]:
df.to_csv('full_monty_peeps.csv', index=True)
!cp full_monty_peeps.csv "drive/My Drive/"

In [None]:
loan_qt = np.percentile(df['Loan Amount'], [25, 50, 75, 100])
loan_qt


In [None]:
high_pct = df[(df["Loan Percentile"] == '90%')]

high_pct.head(1)


In [None]:
#save to csv and will convert to a table in SQL / Tableau for further analysis 

high_pct.to_csv('top_percentile_banks.csv', index=True)
!cp top_percentile_banks.csv "drive/My Drive/"

Some lines of inquiry to pursue pertaining to race and gender
From the subset of people who self identified racially, Asians borrowed a disproporianatly larger amount as a percentage of population and Hispanics less. (Asians are less than 6% and Hispanics 16%). Puerto Ricans are not necessarily accounted for in the Hispanic statistics. It is hard to say whether or not this is a true representation of percentages of all loans because this dataset is small (less than 5% of the total) and only reflects the people who were willing to answer the race inquiry. Asians and Whites had roughly the same gender percentages, median and average loan amounts. Hispanics and Blacks seemed to incur lower levels of debt.

In [None]:
state_race_cts=df.groupby('State',)['Race'].value_counts()
state_race_cts.head()

In [None]:
df

In [None]:

race_avg=df.groupby('Race')['Loan Amount'].mean()
race_avg

In [None]:
# Define plot space
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title = "Average Loan Amount by Race",
       xlabel = "", 
       ylabel = "",

       )
labels=race_avg.index
explode = (0.0, 0.0, 0.0, 0.0, 0.075, 0,  0.075, 0)
# Define x and y axes
ax.pie(race_avg, explode=explode, labels=labels)
plt.savefig('avg_loan_race.png', bbox_inches="tight")

In [None]:
# df.groupby('Race', 'Loan Amount').quantile()
race_count=df.groupby('Race').count()
print(race_count)

In [None]:
# Define plot space
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title = "",
       xlabel = "", 
       ylabel = "",

       )
labels=race_count.index
explode = (0.0, 0.0, 0.0, 0.0, 0., 0.0, 0.0, 0.05)
left  = 0.125  # the left side of the subplots of the figure
right = 0.9    # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 0.9      # the top of the subplots of the figure
wspace = 0.5  # the amount of width reserved for blank space between subplots
hspace = 0.9   # the amount of height reserved for white space between subplots
# These two can be called on 'fig' instead of 'plt' too
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                wspace=wspace, hspace=hspace)

# Define x and y axes
ax.pie(race_count['Loan Amount'], explode=explode, labels=labels, rotatelabels=True)
plt.savefig('race_countof_loans.png')

In [None]:
vt_gen_qt=df.groupby(['Race', 'Veteran'])['Loan Amount'].quantile()
vt_gen_qt

In [None]:
vt_cat=df.groupby(['Race', 'Veteran'])['description'].value_counts()
vt_cat

In [None]:
df.sample()

In [None]:
rc_jbs=df.groupby(['Race', 'Gender'])['Jobs Saved'].sum()
rc_jbs

In [None]:
race_median=df.groupby(['Race', 'Gender'])['Loan Amount'].median()
race_median

In [None]:
race_mean=df.groupby(['Race', 'Gender'])['Loan Amount'].mean()
race_mean

In [None]:
df.shape

In [None]:
rc_gen_qt=df.groupby(['Race', 'Gender'])['Loan Amount'].quantile()


In [None]:
counts_by_race=df.groupby(['Race', 'Gender'])['Loan Amount'].count()
counts_by_race

In [None]:
race_summary= pd.DataFrame({
    'Count' : counts_by_race,
    'Quantile' : rc_gen_qt,
    'Mean' : race_mean,
    'Median' : race_median,
    'Jobs' : rc_jbs


})
race_summary.head()

In [None]:
race_summary['Count'] = race_summary.loc[:,'Count'].map("{:,}".format)
race_summary['Jobs'] = race_summary.loc[:,'Jobs'].map("{:,}".format)
race_summary['Quantile'] = race_summary.loc[:,'Quantile'].map("${:,.2f}".format)
race_summary['Mean'] = race_summary.loc[:,'Mean'].map("${:,.2f}".format)
race_summary['Median'] = race_summary.loc[:,'Median'].map("${:,.2f}".format)
race_summary

In [None]:
#save to csv and will convert to a table in SQL / Tableau for further analysis 

race_summary.to_csv('race_summary.csv', index=True)
!cp race_summary.csv "drive/My Drive/"

In [None]:
df.sample()


In [None]:
cat_sum=df.groupby('description')['Jobs Saved'].sum()
cat_sum.head()

In [None]:
cat_sum.sort_values().plot(kind='barh', y='Jobs Saved', subplots=True, figsize=(12,9), color="lightskyblue")

plt.savefig('category_jobs_saved.png', bbox_inches="tight")

In [None]:
cat_loan=df.groupby('description')['Loan Amount'].sum()
cat_loan.head()

In [None]:
cat_loan.sort_values().plot(kind='barh', y='Jobs Saved', subplots=True, figsize=(12,9), color="lightcoral")

plt.savefig('catgory_loans.png', bbox_inches="tight")

In [None]:
count_race = df.groupby('Race').agg('count')
count_desc = df.groupby('description').agg('count')
count_sum = len(df)




In [None]:
# def group_lower_ranking_values(column):
#     pie_counts = df.groupby(column).agg('count')
#     pct_value = df[lambda df: df.columns[0]].quantile(.75)
#     values_below_pct_value = df[lambda df: df.columns[0]].loc[lambda s: s < pct_value].index.values
#     def fix_values(row):
#         if row[column] in values_below_pct_value:
#             row[column] = 'Other'
#         return row 
#     pie_grouped = df.apply(fix_values, axis=1).groupby(column).agg('count')
#     return pie_grouped

# racial_index = group_lower_ranking_values('Race')
# description_index = group_lower_ranking_values('description')

In [None]:
racial_index=df.groupby('Race').agg('count')
print(racial_index)

In [None]:
gender_loan_amount=df.groupby('Gender')['Loan Amount'].agg('sum')
print(gender_loan_amount)

In [None]:
%matplotlib inline

In [None]:
# Define plot space
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title = "Total Percent of Loans by Gender",
       xlabel = "", 
       ylabel = "",

       )
labels=gender_loan_amount.index
explode = (0.0, 0.05)
left  = 0.125  # the left side of the subplots of the figure
right = 0.9    # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 0.9      # the top of the subplots of the figure
wspace = 0.5  # the amount of width reserved for blank space between subplots
hspace = 0.5   # the amount of height reserved for white space between subplots
# These two can be called on 'fig' instead of 'plt' too
plt.subplots_adjust(left=left, bottom=bottom, right=right, top=top,
                wspace=wspace, hspace=hspace)

# Define x and y axes
ax.pie(gender_loan_amount, explode=explode, labels=labels, rotatelabels=False)
plt.savefig('gender_loan_total.png', bbox_inches='tight')

In [None]:
df.to_csv('ppl_ppp_w_quantiles_and_categories.csv', index=True)
!cp people_ppp_super.csv "drive/My Drive/"
