In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import warnings

warnings.filterwarnings('ignore')

sns.set(style='whitegrid', palette="deep", font_scale=1.2, rc={"figure.figsize": [20,10]})
%matplotlib inline

pd.set_option('float_format', '{:.2f}'.format)
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', None)

In [2]:
import os
currentpath = os.getcwd()
lenders_path = os.path.join(currentpath,'Data/lenders.csv')
loans_path = os.path.join(currentpath,'Data/loans.csv')
loans_lenders_path = os.path.join(currentpath,'Data/loans_lenders.csv')

loans_lenders = pd.read_csv(loans_lenders_path)
loans = pd.read_csv(loans_path)
lenders = pd.read_csv(lenders_path)

In [3]:
import requests
import bs4
import json
import re
import time

## Scraped full list of field partners

In [4]:
url = 'https://www.kiva.org/about/where-kiva-works'
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text,'lxml')


In [82]:
scraped_df = []


for link in soup.findAll('a', attrs={'href': re.compile("^https://www.kiva.org/about/where-kiva-works/partners/")}):
    try:
        field_partner_info = {}
        
#         print(link.get('href'))
#         print(link)
#         print(link.string)
        field_partner_info['link'] = link.get('href')
        field_partner_info['field_partner_name'] = link.string
        field_partner_info['id'] = re.findall(r'\b\d+\b', link.get('href'))[0]
        scraped_df.append(field_partner_info)
    except AttributeError:
        pass

https://www.kiva.org/about/where-kiva-works/partners/199
<a class="img img-w50 thumb" href="https://www.kiva.org/about/where-kiva-works/partners/199"><img alt="CrediCampo" src="https://www-kiva-org-0.freetls.fastly.net/img/w50/9fe114dc87586de9cf42d1a2d33227ec.jpg" title="CrediCampo"/></a>
None
https://www.kiva.org/about/where-kiva-works/partners/199
<a href="https://www.kiva.org/about/where-kiva-works/partners/199">CrediCampo</a>
CrediCampo
https://www.kiva.org/about/where-kiva-works/partners/181
<a class="img img-w50 thumb" href="https://www.kiva.org/about/where-kiva-works/partners/181"><img alt="Credo" src="https://www-kiva-org-0.freetls.fastly.net/img/w50/12f9ab1ecff95dd5f28c88d9f0fd0a87.jpg" title="Credo"/></a>
None
https://www.kiva.org/about/where-kiva-works/partners/181
<a href="https://www.kiva.org/about/where-kiva-works/partners/181">Credo</a>
Credo
https://www.kiva.org/about/where-kiva-works/partners/145
<a class="img img-w50 thumb" href="https://www.kiva.org/about/where-kiva-

In [84]:
field_partner_df = pd.DataFrame.from_dict(scraped_df)
field_partner_clean = field_partner_df[field_partner_df['field_partner_name'].notnull()]
field_partner_clean.to_csv('field_partner_list.csv',index=False)

## Scraped detailed field partner info

In [192]:
page_url = 'https://www.kiva.org/about/where-kiva-works/partners/200'
    
# Obtain Request
res = requests.get(page_url)

# Turn into Soup
soup = bs4.BeautifulSoup(res.text,'lxml')
soup

<!DOCTYPE html>
<html class="no-js" lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<meta charset="utf-8"/>
<script>
		var __start = (new Date()).getTime();
	</script>
<script type="text/javascript">
			function tombstone(filename) {
				var xhttp = new XMLHttpRequest();
				xhttp.open("POST", window.location.protocol + "//" + window.location.host + '/ajax/jsTombstoneLogger', true);
				xhttp.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
				xhttp.send("jsFilename=" + filename);
			}
		</script>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, user-scalable=yes" name="viewport"/>
<meta content="Make a loan to an entrepreneur across the globe for as little as $25. Kiva is the world's first online lending platform connecting online lenders to entrepreneurs across the globe." name="description"/>
<meta content="Kiva - Where Kiva works" 

In [212]:
scraped_df = []
printcounter = 1
start_time = time.time()
accum=0

for id in list(field_partner_clean['id']):
# for id in list(set(full_list['id'].unique())-set(field_partner_info['id'].unique())):
    field_partner_details = {}
#     print(id)
    # Concatenate to get new page URL
    page_url = 'https://www.kiva.org/about/where-kiva-works/partners/'+str(id)
    
    # Obtain Request
    res = requests.get(page_url)
    
    # Turn into Soup
    soup = bs4.BeautifulSoup(res.text,'lxml')
    
    # Return dictionary
    field_partner_details['id'] = id
    try:
        if len(re.findall(r'\d[.]?\d{0,1}?', str(soup.findAll('div', attrs={'class': 'partnerRating'})))[0])==1:
            partner_rating = re.findall(r'\d[.]?\d{0,1}?', str(soup.findAll('div', attrs={'class': 'partnerRating'})))[0]
        elif len(re.findall(r'\d[.]?\d{0,1}?', str(soup.findAll('div', attrs={'class': 'partnerRating'})))[0])==2:
            partner_rating = (re.findall(r'\d[.]?\d{0,1}?', str(soup.findAll('div', attrs={'class': 'partnerRating'})))[0])+(re.findall(r'\d[.]?\d{0,1}?', str(soup.findAll('div', attrs={'class': 'partnerRating'})))[1])
    except IndexError:
        partner_rating = 'Inactive'
    
    field_partner_details['partner_rating'] = partner_rating
    field_partner_details['time_on_kiva'] = soup.findAll('dd')[2].text
    field_partner_details['kiva_borrowers'] = soup.findAll('dd')[3].text
    field_partner_details['total_loans'] = soup.findAll('dd')[4].text
    field_partner_details['are_interest_fees_charged'] = soup.findAll('dd')[5].text
    field_partner_details['avg_cost_to_borrower'] = soup.findAll('dd')[6].text
    field_partner_details['profitability_ROA'] = soup.findAll('dd')[7].text
    field_partner_details['avg_loan_size'] = soup.findAll('dd')[8].text
    field_partner_details['delinquency_rate'] = soup.findAll('dd')[9].text
    field_partner_details['loans_at_risk_rate'] = soup.findAll('dd')[10].text
    field_partner_details['default_rate'] = soup.findAll('dd')[11].text
    field_partner_details['currency_exchange_loss_rate'] = soup.findAll('dd')[12].text
    field_partner_details['fundraising_status'] = soup.findAll('dd')[13].text
    field_partner_details['country'] = soup.findAll('dd')[18].text
    field_partner_details['capital'] = soup.findAll('dd')[19].text
    field_partner_details['official_language'] = soup.findAll('dd')[20].text
    field_partner_details['population'] = soup.findAll('dd')[21].text
    field_partner_details['avg_annual_income'] = soup.findAll('dd')[22].text
    field_partner_details['labour_force'] = soup.findAll('dd')[23].text
    field_partner_details['population_below_poverty_line'] = soup.findAll('dd')[24].text
    field_partner_details['literacy_rate'] = soup.findAll('dd')[25].text
    field_partner_details['infant_mortality_rate_per_1000'] = soup.findAll('dd')[26].text
    field_partner_details['life_expectancy'] = soup.findAll('dd')[27].text

    scraped_df.append(field_partner_details)
#     print(loans_info)
#     print(scraped_df)
    accum += 1
#     print(printcounter)

    if (printcounter ==1):
        print(f'Iterations: {accum}, Duration: {time.time()-start_time}')

        printcounter = 1
        time.sleep(1)
        start_time=time.time()
    else:
        printcounter += 1
            

Iterations: 1, Duration: 2.190384864807129
Iterations: 2, Duration: 2.0952670574188232
Iterations: 3, Duration: 2.0616021156311035
Iterations: 4, Duration: 2.3672518730163574
Iterations: 5, Duration: 2.162287950515747
Iterations: 6, Duration: 1.9914298057556152
Iterations: 7, Duration: 2.301414966583252
Iterations: 8, Duration: 2.159877061843872
Iterations: 9, Duration: 2.5698628425598145
Iterations: 10, Duration: 2.765516757965088
Iterations: 11, Duration: 2.390052080154419
Iterations: 12, Duration: 2.5739898681640625
Iterations: 13, Duration: 2.107067108154297
Iterations: 14, Duration: 2.064099073410034
Iterations: 15, Duration: 2.2317159175872803
Iterations: 16, Duration: 2.1164770126342773
Iterations: 17, Duration: 1.974482774734497
Iterations: 18, Duration: 2.1114509105682373
Iterations: 19, Duration: 2.370236873626709
Iterations: 20, Duration: 2.3766250610351562
Iterations: 21, Duration: 3.19868803024292
Iterations: 22, Duration: 2.419203996658325
Iterations: 23, Duration: 2.4529

Iterations: 184, Duration: 2.304811716079712
Iterations: 185, Duration: 2.141728162765503
Iterations: 186, Duration: 2.2911500930786133
Iterations: 187, Duration: 2.470839023590088
Iterations: 188, Duration: 2.5239596366882324
Iterations: 189, Duration: 2.6693499088287354
Iterations: 190, Duration: 2.454814910888672
Iterations: 191, Duration: 2.278944969177246
Iterations: 192, Duration: 5.991783857345581
Iterations: 193, Duration: 2.353039026260376
Iterations: 194, Duration: 2.2055439949035645
Iterations: 195, Duration: 2.249323606491089
Iterations: 196, Duration: 2.448667287826538
Iterations: 197, Duration: 2.3833680152893066
Iterations: 198, Duration: 2.3570401668548584
Iterations: 199, Duration: 2.591486930847168
Iterations: 200, Duration: 2.3967931270599365
Iterations: 201, Duration: 2.3150901794433594
Iterations: 202, Duration: 2.303860902786255
Iterations: 203, Duration: 2.375602960586548
Iterations: 204, Duration: 2.5135419368743896
Iterations: 205, Duration: 2.5381038188934326


In [213]:
field_partner_info = pd.DataFrame.from_dict(scraped_df)
field_partner_info.to_csv('field_partner_detailed_info_v2.csv',index=False)

In [215]:
v1 = pd.read_csv('field_partner_detailed_info.csv')
v2 = pd.read_csv('field_partner_detailed_info_v2.csv')
v3 = v1.append(v2,ignore_index=True).drop_duplicates(inplace=True)
v3.to_csv('field_partner_detailed_info_no_dup.csv',index=False)