In [None]:
import os
import warnings
# Installed before the third-party imports so their import-time warnings are silenced too.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

#### Read in Kiva loans csv file

In [None]:
# Load the full Kiva loans table (all countries). `dfo` is reused later in the
# notebook for the cross-country comparisons.
dfo = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv')

#### Subset data for country Cambodia

In [None]:
# Keep only the Cambodian loans and renumber the rows from 0.
is_cambodia = dfo['country'] == 'Cambodia'
df = dfo.loc[is_cambodia].reset_index(drop=True)

## 1. Overview of the Data

In [None]:
# Preview the first five Cambodian loan records.
df.head(5)

In [None]:
# List the available column names.
df.columns

In [None]:
# (rows, columns) of the Cambodia subset.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

Statistical summary

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary (count, unique, top, freq) for the object-typed columns.
df.describe(include = 'O')

## 2. Checking for missing values

In [None]:
# Missing-value count per column, most affected columns first.
missing_counts = df.isna().sum().sort_values(ascending=False)
null = missing_counts.reset_index()
null.columns = ['Column', 'Frequency']
null

In [None]:
# Bar chart of the per-column missing-value counts held in `null`.
colors = [' #61725f ' for _ in range(len(null.Column))]
fig = go.Figure(data=[go.Bar(x=null.Column, y=null.Frequency, marker_color=colors)])
fig.update_layout(
    title=dict(text='Distribution of Null Values in Columns', x=0.5),
    xaxis_title='Columns',
    yaxis_title='No of missing Values',
)
fig.show()

From the above bar chart we see that the `tags` column has an extremely high number of null values, so it has to be dropped.
#### a. Drop column Tags
https://stackoverflow.com/questions/43311555/how-to-drop-column-according-to-nan-percentage-for-dataframe

In [None]:
# Drop the `tags` column, which the missing-value chart flags as mostly null.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError because the column is gone.
df = df.drop(columns='tags', errors='ignore')

Fill null values

In [None]:
# Most frequent funded_time value(s). Note that mode() returns a Series (ties give several rows).
df['funded_time'].mode()

In [None]:
# Fill missing funded_time values with the most frequent value.
# mode() returns a Series; passing that Series to fillna aligns on index labels, so only
# rows whose label appears in it (label 0, plus ties) could ever be filled. Use the first
# modal value as a scalar, and assign back rather than mutating a column selection in place.
funded_time_mode = df['funded_time'].mode()
if not funded_time_mode.empty:  # an all-null column has no mode
    df['funded_time'] = df['funded_time'].fillna(funded_time_mode.iloc[0])

In [None]:
# Fill missing `use` values with the most frequent value.
# mode() returns a Series; fillna with a Series aligns on index labels and would leave
# nearly every NaN untouched, so pass the first modal value as a scalar and assign back.
use_mode = df['use'].mode()
if not use_mode.empty:  # an all-null column has no mode
    df['use'] = df['use'].fillna(use_mode.iloc[0])

In [None]:
# Fill missing `borrower_genders` values with the most frequent value.
# mode() returns a Series; fillna with a Series aligns on index labels and would leave
# nearly every NaN untouched, so pass the first modal value as a scalar and assign back.
borrower_genders_mode = df['borrower_genders'].mode()
if not borrower_genders_mode.empty:  # an all-null column has no mode
    df['borrower_genders'] = df['borrower_genders'].fillna(borrower_genders_mode.iloc[0])

Drop remaining null values

In [None]:
# Remove every row that still contains at least one null value (modifies `df` in place).
df.dropna(axis=0, how='any', inplace=True)

Check if there still exists null values.

In [None]:
# Remaining null count per column; every entry should now be 0.
df.isnull().sum()

## 3. Univariate Analysis
Explore variables one by one 

### I. Continuous variables
To understand central tendency and spread of each variable

### a. Funded amount

#### i.Boxplot

In [None]:
# Box plot of funded_amount to show its spread and outliers.
funded_box = go.Box(y=df['funded_amount'], name='funded amount')
fig = go.Figure(data=[funded_box])
fig.update_layout(
    title=dict(text='Boxplot Distribution of Funded amount in Cambodia', x=0.5),
    yaxis_title='Amount in dollars',
)
fig.show()

We have 2 outliers who received funding of 20k and 30k

The Rest of the users received funding of less than 3k

#### ii. Histogram

In [None]:
# Histogram of funded_amount with bins configured from 0 to 5000.
funded_hist = go.Histogram(x=df['funded_amount'], xbins={'start': 0, 'end': 5000})
fig = go.Figure(data=[funded_hist])
fig.update_layout(
    title=dict(text='Histogram of Funded Amount', x=0.3),
    xaxis_title='Funded Amount',
    yaxis_title='Frequency',
)
fig.show()

Has a positive skew(to the right)

### b. Loan amount

#### i.Boxplot

In [None]:
# Box plot of loan_amount to show its spread and outliers.
loan_box = go.Box(y=df['loan_amount'], name='loan amount')
fig = go.Figure(data=[loan_box])
fig.update_layout(
    title=dict(text='Boxplot Distribution of Loan amount in Cambodia', x=0.5),
    yaxis_title='Amount in dollars',
)
fig.show()

We have 2 outliers who received loan of 20k and 30k

The Rest of the users received a loan of less than 3k

#### ii. Histogram

In [None]:
# Histogram of loan_amount with bins configured from 0 to 3000.
loan_hist = go.Histogram(x=df['loan_amount'], xbins={'start': 0, 'end': 3000})
fig = go.Figure(data=[loan_hist])
fig.update_layout(
    title=dict(text='Histogram of Loan Amount', x=0.5),
    xaxis_title='Loan Amount',
    yaxis_title='Frequency',
)
fig.show()

Has a positive skew(to the right)

### c. term in months

#### i.Boxplot

In [None]:
# Box plot of the loan term (in months).
term_box = go.Box(y=df['term_in_months'], name='term in months')
fig = go.Figure(data=[term_box])
fig.update_layout(
    title=dict(text='Boxplot Distribution of term in months', x=0.5),
    yaxis_title='months',
)
fig.show()

We have several outliers ranging from 24 to 120 months

#### ii. Histogram

In [None]:
# Histogram of the loan term with bins configured from 0 to 100 months.
term_hist = go.Histogram(x=df['term_in_months'], xbins={'start': 0, 'end': 100})
fig = go.Figure(data=[term_hist])
fig.update_layout(
    title=dict(text='Histogram of Tern In Months', x=0.3),
    xaxis_title='Months',
    yaxis_title='Frequency',
)
fig.show()

Has a positive skew(to the right)

### d. Lender Count
Number of lenders contributing to the loan

#### i.Boxplot

In [None]:
# Box plot of the number of lenders per loan.
lender_box = go.Box(y=df['lender_count'], name='Lender Count')
fig = go.Figure(data=[lender_box])
fig.update_layout(
    title=dict(text='Boxplot Distribution of Lender Count', x=0.5),
    yaxis_title='No of Lenders',
)
fig.show()

#### ii. Histogram

In [None]:
# Histogram of lender_count with bins configured from 0 to 100.
lender_hist = go.Histogram(x=df['lender_count'], xbins={'start': 0, 'end': 100})
fig = go.Figure(data=[lender_hist])
fig.update_layout(
    title=dict(text='Histogram of Lender Count', x=0.3),
    xaxis_title='Lender Count',
    yaxis_title='Frequency',
)
fig.show()

### II. Categorical Variables
To understand the distribution of each category

Variables: activity, sector, region, currency, borrower_genders, repayment_interval

### a. Activity

In [None]:
# Frequency table of the 20 most common loan activities.
top_activity_counts = df['activity'].value_counts().head(20)
activity = top_activity_counts.to_frame().reset_index()
activity.columns = ['Activity', 'Frequency']

#### i. Frequency Table

In [None]:
# Display the activity frequency table.
activity

#### ii. Bar Plot

In [None]:
# Horizontal bar chart of the activity frequencies; the y-axis is reversed so the
# first (most frequent) row is drawn at the top.
colors = [' #61725f ' for _ in range(len(activity.Activity))]
activity_bars = go.Bar(
    x=activity.Frequency,
    y=activity.Activity,
    orientation='h',
    marker_color=colors,
)
fig = go.Figure(data=[activity_bars])
fig.update_yaxes(autorange='reversed')
fig.update_layout(
    title=dict(text='Top 20 Activities Funded By Kiva', x=0.5),
    xaxis_title='Frequency',
    yaxis_title='Activity',
)
fig.show()

### b. Sector

In [None]:
# Frequency table of loan sectors (at most the 20 most common).
top_sector_counts = df['sector'].value_counts().head(20)
sector = top_sector_counts.to_frame().reset_index()
sector.columns = ['Sector', 'Frequency']

#### i. Frequency Table

In [None]:
# Display the sector frequency table.
sector

#### ii. Bar Plot

In [None]:
# Horizontal bar chart of the sector frequencies; the y-axis is reversed so the
# first (most frequent) row is drawn at the top.
colors = [' #61725f ' for _ in range(len(sector.Sector))]
sector_bars = go.Bar(
    x=sector.Frequency,
    y=sector.Sector,
    orientation='h',
    marker_color=colors,
)
fig = go.Figure(data=[sector_bars])
fig.update_yaxes(autorange='reversed')
fig.update_layout(
    title=dict(text='Top 20 Sectors Funded By Kiva', x=0.5),
    xaxis_title='Frequency',
    yaxis_title='sector',
)
fig.show()

#### iii. Pie Chart

In [None]:
# Pie chart of each sector's share of the loan count.
labels, values = sector.Sector, sector.Frequency
sector_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[sector_pie])
fig.update_layout(
    title=dict(text='Represention of Sectors Funded by Kiva Loans In Cambodia ', x=0.2),
)
fig.show()

### c. Region

Split rows that contain multiple comma-separated values so that each region is counted accurately.

In [None]:
# Flatten the comma-separated `region` cells into one list of trimmed names,
# skipping cells whose string form is "nan", then count each name.
region_list = [
    part.strip()
    for cell in df.region.values
    if str(cell) != "nan"
    for part in cell.split(",")
]
temp_data = pd.Series(region_list).value_counts()

`strip()` removes whitespace at the beginning and at the end of a string, e.g. https://www.w3schools.com/python/ref_string_strip.asp

extend() = Add the elements of one list to another list eg https://www.w3schools.com/python/ref_list_extend.asp

#### i. Frequency Table

In [None]:
# `Region` holds the count for every region (it is merged with coordinates for the map);
# `region` is only the first 20 rows, used for display and the bar chart.
Region = temp_data.reset_index()
Region.columns = ['Region', 'Frequency']
region = Region.iloc[:20]
region

#### ii. Bar Plot

In [None]:
# Horizontal bar chart of the region frequencies; the y-axis is reversed so the
# first (most frequent) row is drawn at the top.
colors = [' #61725f ' for _ in range(len(region.Region))]
region_bars = go.Bar(
    x=region.Frequency,
    y=region.Region,
    orientation='h',
    marker_color=colors,
)
fig = go.Figure(data=[region_bars])
fig.update_yaxes(autorange='reversed')
fig.update_layout(
    title=dict(text='Top 20 Regions Funded By Kiva', x=0.5),
    xaxis_title='Frequency',
    yaxis_title='Region',
)
fig.show()

#### iii. mapbox

In [None]:
# Build a lookup of Cambodian region names and their coordinates.
dfo1 = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/kiva_mpi_region_locations.csv')
dfl = (
    dfo1.loc[dfo1['country'] == 'Cambodia', ['region', 'lat', 'lon']]
    .reset_index(drop=True)          # rows are addressed by position 0..n-1 below
    .rename(columns={'region': 'Region'})
)
# Manual corrections for two rows, addressed by their position after the reset above.
# NOTE(review): presumably these patch missing/incorrect entries in the source file --
# confirm the row positions still match if the input data changes.
# `.loc` is required here: `.at` is scalar-only and rejects a list of columns.
dfl.loc[1, ['Region', 'lat', 'lon']] = ['Battambang', 13.0957, 103.2022]
dfl.loc[4, ['lat', 'lon']] = [11.4650, 104.52085]
# Regions that still lack a name or coordinates cannot be placed on the map.
dfl = dfl.dropna()

In [None]:
# Use the region name as the index of the coordinates table.
dfl = dfl.set_index('Region')

In [None]:
# Frames to be merged on 'Region': per-region loan counts and per-region coordinates.
dfs = [Region,dfl]

In [None]:
from functools import reduce


def _merge_on_region(left, right):
    """Join two frames on their shared 'Region' key (pandas' default inner join)."""
    return pd.merge(left, right, on='Region')


# Fold the list of frames into a single table of region, loan count and coordinates.
dfc = reduce(_merge_on_region, dfs)
dfc

In [None]:
# A Mapbox access token is a credential and must not be committed in the notebook.
# Read it from the MAPBOX_ACCESS_TOKEN environment variable instead; any token that
# was previously committed here should be revoked in the Mapbox account.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
if mapbox_token:
    px.set_mapbox_access_token(mapbox_token)
else:
    # Warnings are silenced globally in this notebook, so report the problem via print.
    print('MAPBOX_ACCESS_TOKEN is not set; Mapbox-hosted map styles may fail to render.')

In [None]:
# Bubble map: one marker per region, sized by the number of loans it received.
region_map = px.scatter_mapbox(
    dfc,
    lat='lat',
    lon='lon',
    color='Region',
    size='Frequency',
    size_max=15,
    title='Mapbox Showing Different Regions vs Number of Loan They Recieved',
)
region_map


### d. Borrower_genders

In [None]:
# Flatten the comma-separated `borrower_genders` cells into one list of individual
# entries, then count how often each value occurs.
gender_list = [
    entry.strip()                          # remove surrounding whitespace
    for cell in df.borrower_genders.values  # every row of the column
    if str(cell) != 'nan'                  # skip null cells
    for entry in cell.split(',')           # one element per comma-separated value
]
temp_data = pd.Series(gender_list).value_counts()
gender = temp_data.to_frame().head(20).reset_index()
gender.columns = ['Gender', 'Frequency']


#### i. Frequency Table

In [None]:
# Display the gender frequency table.
gender

#### ii. Pie Chart

In [None]:
# Pie chart of the borrower gender counts.
labels, values = gender.Gender, gender.Frequency
gender_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[gender_pie])
fig.update_layout(
    title=dict(text='Represention of Genders Funded by Kiva Loans In Cambodia ', x=0.2),
)
fig.show()

### e. Repayment intervals 

In [None]:
# Frequency table of repayment intervals.
interval_counts = df['repayment_interval'].value_counts()
repayment_interval = interval_counts.to_frame().reset_index()
repayment_interval.columns = ['Repayment_interval', 'Frequency']

#### i. Frequency Table

In [None]:
# Display the repayment-interval frequency table.
repayment_interval

#### ii. Pie Chart

In [None]:
# Pie chart of the repayment-interval counts.
labels, values = repayment_interval.Repayment_interval, repayment_interval.Frequency
interval_pie = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[interval_pie])
fig.update_layout(
    title=dict(text='Represention of Repayment Intervals In Cambodia ', x=0.2),
)
fig.show()

Reference: https://medium.com/@bramtunggala/a-simple-way-to-finding-the-difference-between-two-dates-in-pandas-179d2714b6c

## 4. Bi-variate analysis
Finding relationship between 2 variables

### I. Continuous vs Continuous Variables

#### a. Funded amount vs Loan amount

##### i. Scatter plot

In [None]:
# Scatter plot of funded amount (x) against loan amount (y); trailing ';' hides the Axes repr.
sns.scatterplot(data=df, x='funded_amount', y='loan_amount');


##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix; the off-diagonal entries are the correlation of the pair.
np.corrcoef(df['funded_amount'], df['loan_amount'])

#### b. Funded amount vs Term in months

##### i. Scatter plot

In [None]:
# Scatter plot of funded amount (x) against loan term in months (y).
sns.scatterplot(data=df, x='funded_amount', y='term_in_months');

##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix for funded amount and loan term.
np.corrcoef(df['funded_amount'], df['term_in_months'])

In [None]:
# sns.scatterplot(x='disbursed_to_funded_time',y='funded_amount',data=df)

#### c. Funded amount vs  Lender count

##### i. Scatter plot

In [None]:
# Scatter plot of funded amount (x) against number of lenders (y).
sns.scatterplot(data=df, x='funded_amount', y='lender_count');

##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix for funded amount and lender count.
np.corrcoef(df['funded_amount'], df['lender_count'])

#### c. Loan amount vs Term in months

##### i. Scatter plot

In [None]:
# Scatter plot of loan term in months (x) against loan amount (y).
sns.scatterplot(data=df, x='term_in_months', y='loan_amount');

##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix for loan amount and loan term.
np.corrcoef(df['loan_amount'], df['term_in_months'])

#### d. Loan amount vs Lender count

##### i. Scatter plot

In [None]:
# Scatter plot of loan amount (x) against number of lenders (y).
sns.scatterplot(data=df, x='loan_amount', y='lender_count');

##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix for loan amount and lender count.
np.corrcoef(df['loan_amount'], df['lender_count'])

#### e. Term in months vs Lender count 

##### i. Scatter plot

In [None]:
# Scatter plot of loan term in months (x) against number of lenders (y).
sns.scatterplot(data=df, x='term_in_months', y='lender_count');

##### ii. Correlation

In [None]:
# 2x2 Pearson correlation matrix for loan term and lender count.
np.corrcoef(df['term_in_months'], df['lender_count'])

### II. Categorical vs Continuous variables

#### a. Sectors vs Loan Amount

##### i. Barchart of Top Sectors By Total Loan Amount Received

In [None]:
# Total loan amount per sector, largest first, rounded to whole dollars.
sector_totals = df.groupby(['sector'])['loan_amount'].sum()
count = sector_totals.sort_values(ascending=False).round()
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Top Sectors By Total Loan Amount Recieved', x=0.5),
    xaxis_title='loan amount in Dollar',
    yaxis_title='Sector',
)
fig.show()

##### ii. Barchart of Top Sectors By Average Loan Amount Individuals Received

In [None]:
# Average loan amount per sector, largest first, rounded to whole dollars.
sector_means = df.groupby(['sector'])['loan_amount'].mean()
count = sector_means.sort_values(ascending=False).round()
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Top Sectors By Average Loan Amount Recieved', x=0.5),
    xaxis_title='loan amount in Dollars',
    yaxis_title='Sector',
)
fig.show()

#### b. Region vs  Loan Amount

##### i. Barchart of Top Region By Total Loan Amount Received

In [None]:
# Total loan amount per raw `region` value (not split on commas here), top 20 only.
region_totals = df.groupby(['region'])['loan_amount'].sum()
count = region_totals.sort_values(ascending=False).round().head(20)
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Top Region By Total Loan Amount Recieved', x=0.5),
    xaxis_title='loan amount in Dollar',
    yaxis_title='Region',
)
fig.show()

##### ii. Barchart of Top Region By Average Loan Amount Individuals Received

In [None]:
# Average loan amount per raw `region` value (not split on commas here), top 20 only.
region_means = df.groupby(['region'])['loan_amount'].mean()
count = region_means.sort_values(ascending=False).round().head(20)
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Top Region By Average Loan Amount Recieved', x=0.5),
    xaxis_title='loan amount in Dollar',
    yaxis_title='Region',
)
fig.show()

#### c. Repayment Interval vs Loan Amount

##### i. Barchart of  Repayment Interval By Total Loan Amount

In [None]:
# Total loan amount per repayment interval, largest first, rounded to whole dollars.
interval_totals = df.groupby(['repayment_interval'])['loan_amount'].sum()
count = interval_totals.sort_values(ascending=False).round().head(20)
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Repayment Interval By Total Loan Amount', x=0.5),
    xaxis_title='loan amount in Dollar',
    yaxis_title='repayment interval',
)
fig.show()

##### ii. Barchart of Repayment Interval By Average Loan Amount Individuals Received

In [None]:
# Average loan amount per repayment interval, largest first, rounded to whole dollars.
interval_means = df.groupby(['repayment_interval'])['loan_amount'].mean()
count = interval_means.sort_values(ascending=False).round().head(20)
fig = go.Figure(data=[go.Bar(x=count.values, y=count.index, orientation='h')])
fig.update_yaxes(autorange='reversed')  # largest bar at the top
fig.update_layout(
    title=dict(text='Repayment Interval By Average Loan Amount', x=0.5),
    xaxis_title='loan amount in Dollar',
    yaxis_title='repayment interval',
)
fig.show()

## 5. Trends over time

### i. Frequency of Loans over time

In [None]:
# Count funded loans per calendar week.
# The datetime index is built on a temporary frame so `df` keeps its original index
# (the original overwrote df.index as a hidden side effect), and the documented
# weekly offset alias 'W' is used for resampling.
funded_at = pd.to_datetime(df['funded_time'])
fund_time = (
    df.set_index(funded_at)['funded_time']
    .resample('W')
    .count()
    .to_frame()
)
fund_time.columns = ['Frequency']

fig = go.Figure()
fig.add_trace(go.Scatter(x=fund_time.index, y=fund_time.Frequency,
                    mode='lines',
                    name='lines'))
fig.update_layout(
    title='Loan Trends of Over Time(weekly)',
    title_x=0.5,
    yaxis_title='No. of loans',
    xaxis_title='Timeline'
)
fig.show()


##### resample()
Makes bins based on days, weeks, or even months 

Reference:https://stackoverflow.com/questions/14530556/resample-time-series-in-pandas-to-a-weekly-interval

## 6.  Cambodia compared to other countries

#### i. Number of loans by country

In [None]:
# Loan counts for the 20 countries with the most loans (across the whole dataset).
top_country_counts = dfo['country'].value_counts().head(20)
country_rank = top_country_counts.to_frame().reset_index()
country_rank.columns = ['country', 'Number']

In [None]:
# Preview the five countries with the most loans.
country_rank.head(5)

In [None]:
# Row position of Cambodia within the top-20 table; used to highlight its bar.
# Indexing [0] raises IndexError if Cambodia is not among the rows of `country_rank`.
cambodia_rows = country_rank.index[country_rank['country'] == 'Cambodia']
rank = cambodia_rows.tolist()[0]

In [None]:
# Horizontal bar chart of loan counts by country, with Cambodia's bar highlighted.
colors = [' #61725f ' for _ in range(len(country_rank.country))]
colors[rank] = 'crimson'
country_bars = go.Bar(
    x=country_rank.Number,
    y=country_rank.country,
    orientation='h',
    marker_color=colors,
)
fig = go.Figure(data=[country_bars])
fig.update_yaxes(autorange='reversed')  # country with the most loans at the top
fig.update_layout(
    title=dict(text='Number of Loans in Cambodia compared to other countries', x=0.5),
)
fig.show()

#### ii. Amount of loans by country

In [None]:
# Total loan amount per country, largest first.
# Select `loan_amount` BEFORE aggregating: summing every column of the grouped frame is
# wasteful and, on recent pandas, fails or concatenates strings for the text columns.
country_fund = (
    dfo.groupby('country')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)
country_fund.columns = ['Country', 'Total_amount']
country_fund.head(10)

In [None]:
# World choropleth coloured by each country's total loan amount.
fig = px.choropleth(
    country_fund,
    locations='Country',
    locationmode='country names',
    color='Total_amount',
    hover_name='Country',
)
fig.update_layout(
    title=dict(text='Top Countries By Total Amount Loaned', x=0.5),
    geo={'showframe': False, 'showcoastlines': False},
)
fig.show()