In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the entries found in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [2]:
# Read the four CSV inputs in one pass; the tuple order below matches the
# order of the names on the left-hand side.
kiva_loans, loan_themes_by_region, kiva_mpi_region_locations, loan_theme_ids = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/kiva_loans.csv',
        '../input/loan_themes_by_region.csv',
        '../input/kiva_mpi_region_locations.csv',
        '../input/loan_theme_ids.csv',
    )
)

In [3]:
# Preview the first five rows of the loans table.
kiva_loans.head(n=5)

In [4]:
# Scatter of all loan amounts sorted ascending, plotted against their rank.
sorted_loan_amounts = np.sort(kiva_loans['loan_amount'].values)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(np.arange(len(sorted_loan_amounts)), sorted_loan_amounts)
ax.set_xlabel("index")
ax.set_ylabel("Loan in USD")
plt.show()

In [5]:
# Scatter of all funded amounts sorted ascending, plotted against their rank.
sorted_funded_amounts = np.sort(kiva_loans['funded_amount'].values)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(np.arange(len(sorted_funded_amounts)), sorted_funded_amounts)
ax.set_xlabel("index")
ax.set_ylabel("Fund amount in USD")
plt.show()

In [6]:
# Keep only rows whose amount is strictly below 20000.0: loans are filtered
# on `loan_amount`, funds on `funded_amount`.
is_small_loan = kiva_loans['loan_amount'].lt(20000.0)
is_small_fund = kiva_loans['funded_amount'].lt(20000.0)
loans_without_outlier = kiva_loans.loc[is_small_loan]
funds_without_outlier = kiva_loans.loc[is_small_fund]

In [7]:
# Distribution (50 bins) of the loan amounts in `loans_without_outlier`.
loan_values = loans_without_outlier['loan_amount'].values
fig, ax = plt.subplots(figsize=(12,8))
sns.distplot(loan_values, bins=50, ax=ax)
ax.set_xlabel("Loan amount")
plt.show()

In [8]:
# Distribution (50 bins) of the funded amounts in `funds_without_outlier`.
# Plot `funded_amount` rather than `loan_amount`: the frame was filtered on
# `funded_amount` and the axis is labelled "Fund amount", so plotting the loan
# column here showed the wrong variable.
plt.figure(figsize=(12,8))
sns.distplot(funds_without_outlier.funded_amount.values,bins=50)
plt.xlabel("Fund amount")
plt.show()

Both distributions are right-skewed.

In [9]:
# Sum of the `loan_amount` column over every row.
total_loan_amount = kiva_loans['loan_amount'].sum()
total_loan_amount

The total loan amount was 565,421,150 dollars.

In [10]:
# Sum of the `funded_amount` column over every row.
total_funded_amount = kiva_loans['funded_amount'].sum()
total_funded_amount

The total funded amount by Kiva.org was 527,563,815 dollars.

## Lender Distribution

In [30]:
# Scatter of all lender counts sorted ascending, plotted against their rank.
sorted_lender_counts = np.sort(kiva_loans['lender_count'].values)
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(np.arange(len(sorted_lender_counts)), sorted_lender_counts)
ax.set_xlabel("index")
ax.set_ylabel("Lender Count")
plt.show()

In [43]:
# The 30 most frequent `lender_count` values and how many loans have each.
top_lender = kiva_loans['lender_count'].value_counts().head(30)
plt.figure(figsize=(12,8))
# Pass x/y by keyword: seaborn 0.12+ no longer accepts them positionally,
# while the keyword form works on older releases as well.
sns.barplot(x=top_lender.index, y=top_lender.values)
plt.xlabel("Lender Count(Number of lenders)")
plt.ylabel("Count")
plt.show()

## Top 10 Sectors

In [11]:
# Vertical bar chart of the ten most frequent `sector` values.
top_sectors = kiva_loans['sector'].value_counts().head(10)
plt.figure(figsize=(12,6))
top_sectors.plot(kind='bar')

## Top 10 Activities

In [12]:
# Horizontal bar chart of the ten most frequent `activity` values.
top_activities = kiva_loans['activity'].value_counts().head(10)
plt.figure(figsize=(12,6))
top_activities.plot(kind='barh')

## Top 10 Countries

In [13]:
# Horizontal bar chart of the ten most frequent `country` values.
top_countries = kiva_loans['country'].value_counts().head(10)
plt.figure(figsize=(12,6))
top_countries.plot(kind='barh')

## Top 10 Regions

In [14]:
# Horizontal bar chart of the ten most frequent `region` values.
top_regions = kiva_loans['region'].value_counts().head(10)
plt.figure(figsize=(12,6))
top_regions.plot(kind='barh')

## Borrowers based on gender

In [15]:
# Tally borrowers by gender. Each `borrower_genders` value is split on commas
# and every resulting token is counted once.
male_count = 0
female_count = 0
# Drop missing values first: str(NaN) is 'nan', which would otherwise be
# tallied as a borrower.
for genders in kiva_loans['borrower_genders'].dropna():
    for gender in str(genders).split(','):
        gender = gender.strip()
        if gender == 'female':
            female_count += 1
        elif gender == 'male':
            # Match 'male' explicitly so blank or unexpected tokens are not
            # silently added to the male total.
            male_count += 1

In [16]:
# Display the tallies in [male, female] order.
gender_totals = [male_count, female_count]
gender_totals

In [17]:
# Bar chart comparing the male and female borrower tallies.
gender_labels = ['male', 'female']
gender_heights = [male_count, female_count]
sns.barplot(x=gender_labels, y=gender_heights, label="Total", color="b")

There are more female borrowers than male borrowers.

## Repayment Interval

In [18]:
# Vertical bar chart of the (up to five) most frequent `repayment_interval` values.
interval_counts = kiva_loans['repayment_interval'].value_counts().head()
plt.figure(figsize=(12,6))
interval_counts.plot(kind='bar')

## Top 5 dates 

In [19]:
# Horizontal bar chart of the five most frequent `date` values.
top_dates = kiva_loans['date'].value_counts().head()
plt.figure(figsize=(12,6))
top_dates.plot(kind='barh')

The highest number of loans was given in 2017.

In [20]:
# Keep only the non-null entries of the `use` column.
use = kiva_loans["use"].dropna()

In [21]:
# Join all entries of `use` into one space-separated string and generate the
# word cloud from it.
use_text = ' '.join(use)
cloud_builder = WordCloud(max_font_size=50, width=600, height=300)
wordcloud = cloud_builder.generate(use_text)

In [22]:
# Render the word cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(12,8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")

In [48]:
#Correlation Matrix
# Restrict to numeric/bool columns before correlating: `kiva_loans` also holds
# text columns, and DataFrame.corr() raises on those in pandas 2.0+ instead of
# silently dropping them as older versions did.
numeric_loans = kiva_loans.select_dtypes(include=['number', 'bool'])
corr = numeric_loans.corr()
plt.figure(figsize=(12,12))
# Annotated heatmap with the column names on both axes.
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, annot=True, cmap='cubehelix', square=True)
plt.title('Correlation between different features')
# Leave the matrix as the cell's last expression so it is displayed as a table too.
corr

In [23]:
# Preview the first five rows of the loan-themes-by-region table.
loan_themes_by_region.head(n=5)

In [24]:
# Preview the first five rows of the loan-theme-ids table.
loan_theme_ids.head(n=5)

In [25]:
# Preview the first five rows of the MPI region locations table.
kiva_mpi_region_locations.head(n=5)

## Null values

In [26]:
# Number of missing values in each column of `kiva_loans`.
kiva_loans.isna().sum()

In [27]:
# Number of missing values in each column of `loan_theme_ids`.
loan_theme_ids.isna().sum()

In [28]:
# Number of missing values in each column of `loan_themes_by_region`.
loan_themes_by_region.isna().sum()

In [29]:
# Number of missing values in each column of `kiva_mpi_region_locations`.
kiva_mpi_region_locations.isna().sum()

In [47]:
# Bar chart of the five most frequently occurring `MPI` values and how often
# each one appears.
mpi_frequencies = kiva_mpi_region_locations['MPI'].value_counts()
top_mpi = mpi_frequencies.head()
plt.figure(figsize=(12,8))
top_mpi.plot(kind='bar')
plt.xlabel("Top MPI")
plt.ylabel("Count")
plt.show()

To be continued...