In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


## INTRODUCTION

### Import the libraries used in this analysis.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px

### Read in Kiva loans file

In [None]:
# Load the complete Kiva loans table (all countries) from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv')

### Subset data for Puerto Rico

In [None]:
# Keep only the Puerto Rico rows; the previous row labels are preserved in an
# 'index' column by reset_index().
df = data.loc[data['country'].eq('Puerto Rico')].reset_index()

### Overview of Puerto Rico subset data

In [None]:
# First five rows of the Puerto Rico subset.
df.head(5)

In [None]:
# Last five rows of the Puerto Rico subset.
df.tail(5)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Column labels of the subset.
df.columns

In [None]:
# Summary statistics produced by describe() with its default settings.
df.describe()

In [None]:
# (rows, columns) of the subset.
df.shape

In [None]:
# Repeats the earlier df.info() summary; df has not been modified in between.
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

### Drop columns that are not needed

In [None]:
# Remove the 'index' column left behind by reset_index().
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns='index', errors='ignore')

In [None]:
# Show df as the cell output.
df

Checking for missing values

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Two-column table (Column, Frequency) of missing-value counts, smallest first.
missing = (
    df.isna()
    .sum()
    .rename_axis('Column')
    .reset_index(name='Frequency')
    .sort_values('Frequency')
)

In [None]:
# Bar chart of the missing-value count per column, one colour entry per bar.
colors = [' #34567f '] * len(missing.Column)
fig = go.Figure(
    data=[go.Bar(x=missing.Column, y=missing.Frequency, marker_color=colors)]
)
fig.update_layout(
    title='Distribution of Missing Values in Columns',
    title_x=0.5,
    xaxis_title='Columns',
    yaxis_title='No of missing Values',
)
fig.show()

From the missing-value counts we can tell that the partner_id and region columns have no data at all.
In accordance with the analysis questions, let's drop region since it is inapplicable.

In [None]:
# Drop the 'region' column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns='region', errors='ignore')

### Exploratory data analysis.

1. For the top sector, what activity had the highest amount of loans? 

In [None]:
# Puerto Rico rows of the full table, renumbered from zero.
puerto = data.query("country == 'Puerto Rico'").reset_index(drop=True)
puerto.head(2)

In [None]:
# Total loan amount, lender count and funded amount per sector, largest loan
# volume first (at most ten rows kept).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names is rejected by pandas >= 2.0.
activity_df = (
    puerto.groupby('sector')[['loan_amount', 'lender_count', 'funded_amount']]
    .sum()
    .sort_values(by='loan_amount', ascending=False)
    .reset_index()
    .head(10)
)

activity_df

2. Defining variables to be used

In [None]:
# Pull the per-sector columns out of activity_df as individual Series.
sector, loan, fund, lender = (
    activity_df[column]
    for column in ('sector', 'loan_amount', 'funded_amount', 'lender_count')
)

In [None]:
# Number of rows (loans) recorded for each sector.
df['sector'].value_counts()

In [None]:
# Total loan amount per sector, ten largest, as a two-column frame.
loans = (
    df.groupby('sector')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
loans

Barplot showing Loan Amount by Sector

In [None]:
# Bar chart of the per-sector loan totals held in the `sector` / `loan` Series.
plt.figure(figsize = (10,5))

plt.title('Loan Amount by Sector', fontsize = 15)
plt.xlabel('Sector', fontsize = 15)
plt.ylabel('Loan Amount', fontsize = 15)

# Tick-label rotation is requested before the bars are drawn.
plt.xticks(rotation = 60)

plt.bar(sector, loan, edgecolor = 'k')

plt.show()

3. How is the funded amount distributed across sectors in Puerto Rico?

In [None]:
# Total funded amount per sector, largest first (replaces the earlier `fund` Series).
fund = (
    df.groupby('sector')['funded_amount']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
fund

Histogram showing the distribution of funded amount in Puerto Rico

In [None]:
# Histogram of the funded amount of each Puerto Rico loan.
# plt.figure (lower case) registers the figure with pyplot so figsize takes
# effect, and the data plotted is the column itself rather than the literal
# string 'funded_amount'.
plt.figure(figsize=(10, 5))
plt.hist(df['funded_amount'])
plt.title('Distribution of Funded Amount in Puerto Rico')
plt.xlabel('Funded Amount')
plt.ylabel('Number of Loans')
plt.show()

4. What were the differences between the disbursement time and the posted time of the loans? How does that compare to the repayment interval for the various loans?

In [None]:
# First two rows, as a reminder of the available columns.
df.head(2)

In [None]:
# posted_time and disbursed_time are loaded by read_csv without date parsing,
# so summing them per sector does not measure anything. Parse both columns and
# compare them instead; a negative value means the loan was disbursed before
# it was posted. Unparseable or missing timestamps become NaT and are ignored
# by the aggregations.
days_to_disbursal = (
    pd.to_datetime(df['disbursed_time'], utc=True, errors='coerce')
    - pd.to_datetime(df['posted_time'], utc=True, errors='coerce')
).dt.total_seconds() / 86400

# Per-sector summary of the posted -> disbursed gap in days, longest mean first.
time = (
    df.assign(days_posted_to_disbursed=days_to_disbursal)
    .groupby('sector')['days_posted_to_disbursed']
    .agg(['count', 'mean', 'median', 'min', 'max'])
    .sort_values(by='mean', ascending=False)
    .reset_index()
    .head(10)
)
time

5. What does the repayment interval look like for various loan amounts? In various sectors? In various activities?

In [None]:
# Per-sector totals of loan amount and term in months, smallest loan volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
pay = (
    df.groupby('sector')[['loan_amount', 'term_in_months']]
    .sum()
    .sort_values(by='loan_amount', ascending=True)
    .reset_index()
    .head(10)
)
pay

In [None]:
# Box plot of the loan term in months.
# The figure gets its own name: binding it to `plt` would replace the
# matplotlib.pyplot alias that other cells of the notebook rely on.
term_fig = go.Figure()
term_fig.add_trace(go.Box(name='term in months', y=df.term_in_months))
term_fig.update_layout(
    title='Boxplot Distribution of term in months',
    title_x=0.5,
    yaxis_title='months',
)
term_fig.show()

In [None]:
# Total loan amount for each repayment interval, smallest first.
repayment_interval = (
    df.groupby('repayment_interval')['loan_amount']
    .sum()
    .sort_values()
    .reset_index()
)
repayment_interval

In [None]:
# Total loan amount for each (repayment interval, sector) pair, smallest first.
interval_sector_keys = ['repayment_interval', 'sector']
repayment_interval = (
    df.groupby(interval_sector_keys)['loan_amount']
    .sum()
    .sort_values()
    .reset_index()
)
repayment_interval

In [None]:
# Total loan amount for each (repayment interval, activity) pair, smallest first.
interval_activity_keys = ['repayment_interval', 'activity']
repayment_interval = (
    df.groupby(interval_activity_keys)['loan_amount']
    .sum()
    .sort_values()
    .reset_index()
)
repayment_interval

6. What were the numbers between male and female recipients?

In [None]:
def gender_lead(gender):
    """Label a borrower-gender value by the text it starts with.

    Parameters
    ----------
    gender : object
        One value of the ``borrower_genders`` column (presumably a string such
        as ``'female'`` or ``'male, female'`` -- verify against the data).

    Returns
    -------
    str
        ``'female'`` when the text starts with ``f``, ``'unknown'`` when the
        value is missing or blank, ``'male'`` otherwise.
    """
    # A missing value would otherwise be stringified to 'nan' and silently
    # counted as male.
    if pd.isna(gender):
        return 'unknown'

    label = str(gender).strip().lower()
    if not label:
        return 'unknown'
    return 'female' if label.startswith('f') else 'male'

In [None]:
# Derive the lead-gender label for every loan, then report how many distinct
# labels were produced.
df['gender_lead'] = df['borrower_genders'].map(gender_lead)
df['gender_lead'].nunique()

In [None]:
# Look the counts up by label. Positional [0] / [1] access on value_counts()
# silently swaps the two numbers whenever males outnumber females, and integer
# positions on a label-indexed Series are deprecated in recent pandas.
gender_counts = df['gender_lead'].value_counts()
f = int(gender_counts.get('female', 0))
m = int(gender_counts.get('male', 0))
total = f + m

# Guard against an empty subset so the percentages do not divide by zero.
f_pct = round(f * 100 / total, 2) if total else 0.0
m_pct = round(m * 100 / total, 2) if total else 0.0

# Both percentages are rounded to two decimals (the male one used to be
# rounded to a whole number because of a misplaced parenthesis).
print('{} females ({}%) vs {} males ({}%) got loans'.format(f, f_pct, m, m_pct))

In [None]:
# Small two-row frame holding the female / male loan counts for plotting.
df_gender = pd.DataFrame({'gender': ['female', 'male'], 'counts': [f, m]})
df_gender

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Bar chart of the loan counts per gender label.
plt.bar(df_gender['gender'], df_gender['counts'])

plt.show()

### Finding the relation between two variables

a) Funded amount vs Loan amount

In [None]:
# Per-sector totals of loan amount and funded amount, smallest loan volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
amount = (
    df.groupby('sector')[['loan_amount', 'funded_amount']]
    .sum()
    .sort_values(by='loan_amount', ascending=True)
    .reset_index()
    .head(10)
)
amount

Scatter plot

In [None]:
# One point per row of `amount`; the trailing ';' hides the Axes repr.
sns.scatterplot(x='funded_amount',y='loan_amount',data=amount);

b) Loan amount vs Term in months

In [None]:
# Per-sector totals of loan amount and term in months, smallest loan volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
loan_term = (
    df.groupby('sector')[['loan_amount', 'term_in_months']]
    .sum()
    .sort_values(by='loan_amount', ascending=True)
    .reset_index()
    .head(10)
)
loan_term

Bar plot

In [None]:
# Rotate the x tick labels, then draw one bar per row of `loan_term`
# (loan_amount values on x, term_in_months on y); ';' hides the Axes repr.
plt.xticks (rotation = 60)
sns.barplot(x='loan_amount',y='term_in_months',data=loan_term);


c) Funded amount vs term in months

In [None]:
# Per-sector totals of funded amount and term in months, smallest funded volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
fund_term = (
    df.groupby('sector')[['funded_amount', 'term_in_months']]
    .sum()
    .sort_values(by='funded_amount', ascending=True)
    .reset_index()
    .head(10)
)
fund_term

Bar plot

In [None]:
# One bar per row of `fund_term` (term_in_months values on x, funded_amount on y).
sns.barplot(x='term_in_months',y='funded_amount',data=fund_term);

d) Loan amount vs lender count

In [None]:
# Per-sector totals of loan amount and lender count, smallest loan volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
count = (
    df.groupby('sector')[['loan_amount', 'lender_count']]
    .sum()
    .sort_values(by='loan_amount', ascending=True)
    .reset_index()
    .head(10)
)
count

Scatter plot

In [None]:
# One point per row of `count`: loan_amount against lender_count.
sns.scatterplot(x='loan_amount',y='lender_count',data=count);

e) Funded amount vs lender count

In [None]:
# Per-sector totals of funded amount and lender count, smallest funded volume first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
fund_count = (
    df.groupby('sector')[['funded_amount', 'lender_count']]
    .sum()
    .sort_values(by='funded_amount', ascending=True)
    .reset_index()
    .head(10)
)
fund_count

Scatter plot

In [None]:
# One point per row of `fund_count`: funded_amount against lender_count.
sns.scatterplot(x='funded_amount',y='lender_count',data=fund_count);

f) Term in months vs Lender count

In [None]:
# Per-sector totals of term in months and lender count, fewest lenders first.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
term_count = (
    df.groupby('sector')[['term_in_months', 'lender_count']]
    .sum()
    .sort_values(by='lender_count', ascending=True)
    .reset_index()
    .head(10)
)
term_count

Bar plot

In [None]:
# One bar per row of `term_count` (lender_count values on x, term_in_months on y).
sns.barplot(x='lender_count',y='term_in_months',data=term_count);

g) Loan amount compared to the different sectors and activities in Puerto Rico

In [None]:
# Total loan amount per sector, ten largest.
loans_amnt = (
    df.groupby('sector')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
loans_amnt

In [None]:
# Total loan amount per activity, ten largest (replaces the per-sector table).
loans_amnt = (
    df.groupby('activity')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
loans_amnt

Multiple Plots for loan amount compared to sector and activity

In [None]:
# Total loan amount, lender count and funded amount per activity, largest loan
# volume first (at most ten rows kept). This rebinds activity_df, which earlier
# held the per-sector totals.
# Columns are selected with a list; a bare tuple of names is rejected by
# pandas >= 2.0.
activity_df = (
    puerto.groupby('activity')[['loan_amount', 'lender_count', 'funded_amount']]
    .sum()
    .sort_values(by='loan_amount', ascending=False)
    .reset_index()
    .head(10)
)

activity_df

In [None]:
# Activity names, in the row order of activity_df.
activity = activity_df['activity']

In [None]:
# Side-by-side line plots: loan amount per sector (left) and per activity (right).
plt.figure(figsize = (15,5))

# Left panel: per-sector totals captured earlier in `sector` / `loan`.
plt.subplot(1,2,1)
plt.title('Loan Amount by Sector')

plt.xticks(rotation = 75)
plt.xlabel('Sector')
plt.ylabel('Loan Amount')

plt.plot(sector,loan)

# Right panel: per-activity totals. The y values come from activity_df (now
# grouped by activity); `loan` still holds the per-sector amounts, so pairing
# it with activity names plotted unrelated numbers and could fail outright
# when the two series differ in length.
plt.subplot(1,2,2)
plt.title('Loan Amount by Activity')

plt.xticks(rotation = 75)
plt.xlabel('Activity')
plt.ylabel('Loan Amount')

plt.plot(activity, activity_df['loan_amount'])

plt.show()

### Puerto Rico compared to other countries

In [None]:
# Number of loans per country across the full dataset, as (country, Number).
country_rank = (
    data['country']
    .value_counts()
    .to_frame()
    .reset_index()
    .set_axis(['country', 'Number'], axis=1)
)

In [None]:
# First ten rows of the country ranking.
country_rank.head(10)

In [None]:
# Same count restricted to the Puerto Rico subset (replaces the all-country table).
country_rank = (
    df['country']
    .value_counts()
    .to_frame()
    .reset_index()
    .set_axis(['country', 'Number'], axis=1)
)

In [None]:
# Show the loan count computed from the Puerto Rico subset.
country_rank

In [None]:
# Total loan amount per country across the full dataset, largest first.
# loan_amount is selected before aggregating: calling .sum() on the whole
# grouped frame also tries to add up every text column, which is slow and can
# raise on pandas >= 2.0.
country_loan = (
    data.groupby('country')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
country_loan.columns = ['Country', 'Total_amount']
country_loan.head(10)

In [None]:
# Total loan amount for the Puerto Rico subset.
# loan_amount is selected before aggregating: calling .sum() on the whole
# grouped frame also tries to add up every text column, which is slow and can
# raise on pandas >= 2.0.
country_loan = (
    df.groupby('country')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
country_loan.columns = ['Country', 'Total_amount']
country_loan