In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, fname) for fname in filenames]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import warnings
warnings.filterwarnings('ignore')

### Importing the libraries to use and reading in the data set.

In [None]:
# Load the three Kiva CSV files from the Kaggle input directory (paths are
# relative to the notebook's working directory).
#   kiva             <- kiva_loans.csv
#   kiva_loan_theme  <- loan_theme_ids.csv
#   kiva_loan_region <- loan_themes_by_region.csv
kiva = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv')
kiva_loan_theme = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/loan_theme_ids.csv')
kiva_loan_region = pd.read_csv('../input/data-science-for-good-kiva-crowdfunding/loan_themes_by_region.csv')

## What country got the most loans? Does the number of times a country is referenced relate to the quantity of loans it got?

In [None]:
# Ten countries with the most loan records, as a tidy two-column frame.
top_country_counts = kiva['country'].value_counts().head(10)
country_loan_count = pd.DataFrame({
    'country': top_country_counts.index,
    'loan count': top_country_counts.values,
})
country_loan_count

### The countries above received the highest number of loans, with the Philippines in the lead, followed by Kenya.

In [None]:
# Ten countries with the largest summed loan_amount, largest first.
loan_total_by_country = kiva.groupby('country')['loan_amount'].sum()
country_loan_amount = (
    loan_total_by_country
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
country_loan_amount

In [None]:
# Left panel: loan counts for the top ten countries.
# Right panel: summed loan amounts for the top ten countries.
fig, (ax_count, ax_amount) = plt.subplots(1, 2, figsize=(15, 5))

sns.barplot(data=country_loan_count, x='country', y='loan count', ax=ax_count)
ax_count.set_title("Countries With The Highest Number Of Loans")
ax_count.set_xlabel('Country')
ax_count.set_ylabel('Loan Count')
plt.setp(ax_count.get_xticklabels(), rotation=75)

sns.barplot(data=country_loan_amount, x='country', y='loan_amount', ax=ax_amount)
ax_amount.set_title("Countries With The Highest Loan Amounts")
ax_amount.set_xlabel('Country')
ax_amount.set_ylabel('Loan Amount')
plt.setp(ax_amount.get_xticklabels(), rotation=75)

plt.show()

### For the two leading countries, the number of loans received is proportional to the total loan amount. This is not the case for the other countries. In both rankings, however, the Philippines leads by far. For the remaining nine countries, the loan amounts span a narrower range.

## Subsetting the data set to country of choice which is Nigeria.

In [None]:
# Keep only the loans whose country is Nigeria and renumber the rows from 0.
is_nigeria_loan = kiva['country'].eq('Nigeria')
nigeria = kiva.loc[is_nigeria_loan].reset_index(drop=True)

In [None]:
# Nigerian rows of the loan-themes-by-region table, renumbered from 0.
is_nigeria_theme = kiva_loan_region['country'].eq('Nigeria')
nigeria_2 = kiva_loan_region.loc[is_nigeria_theme].reset_index(drop=True)
nigeria_2.head(2)

Reviewing the data set to understand it.

In [None]:
nigeria.head(2)

Above is a glimpse of the data set

In [None]:
# Count how many loans were funded in full (True) versus not (False).
# A raw row-by-row boolean Series is truncated on display, so it cannot show
# whether mismatches exist; the counts answer the question directly.
(nigeria['funded_amount'] == nigeria['loan_amount']).value_counts()

The loan amount is not equal to the funded amount in all instances.

In [None]:
# Count the rows where the activity label equals the sector label (True)
# versus rows where they differ (False), instead of dumping the full
# row-by-row boolean Series.
(nigeria['activity'] == nigeria['sector']).value_counts()

Activity does not match the sector in all instances.

In [None]:
nigeria.info()

The data set has 10136 entries

Checking for missing values

In [None]:
nigeria.isnull().sum()

Some string values are missing. These are in columns that do not hold key values which is a bit of relief to this analysis. We can indicate not applicable on the missing values.

In [None]:
# Put the 'n_a' placeholder only in non-numeric columns. A blanket
# fillna('n_a') would also write the string into numeric columns that contain
# NaN, turning them into object dtype and dropping them from describe() and
# any numeric aggregation.
text_columns = nigeria.select_dtypes(exclude='number').columns
nigeria[text_columns] = nigeria[text_columns].fillna('n_a')

In [None]:
nigeria.describe()

The maximum loan given was 35000 and the maximum term was 22 months.
The minimum loan amount was 25 and minimum term 2 months.

In [None]:
nigeria.duplicated().sum()

In [None]:
nigeria.columns

## Which region had the highest loan amounts?

In [None]:
nigeria['region'].unique()

In [None]:
nigeria['region'].nunique()

### There were 22 regions in total

In [None]:
# Which regions got the most loans: top ten by number of loan records.
top_region_counts = nigeria['region'].value_counts().head(10)
loan_count_region = pd.DataFrame({
    'region': top_region_counts.index,
    'frequency': top_region_counts.values,
})
loan_count_region

### Kaduna region got loans 10000 times followed by Uruagu and NNewi as shown above. Let's look at the total loan amount per region.

In [None]:
# Ten regions with the largest summed loan_amount, largest first.
nigeria_loan_region = (
    nigeria.groupby('region')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
nigeria_loan_region

In [None]:
# Side-by-side view of the top ten regions by summed loan amount (left) and
# by number of loans (right).
plt.figure(figsize = [20,10])

plt.subplot(1,2,1)
# x/y are given by keyword: seaborn >= 0.12 rejects positional x/y vectors
# with a TypeError (older versions only emitted a warning).
sns.barplot(x = 'region', y = 'loan_amount', data = nigeria_loan_region)
plt.title("Nigeria's total loan amount for top ten regions")
plt.ylabel('Total loan amount')
plt.xlabel('Top ten regions')
plt.xticks(rotation = 75)

plt.subplot(1,2,2)
sns.barplot(x = 'region', y = 'frequency', data = loan_count_region)
plt.title("Nigeria's total loan count for top ten regions")
plt.ylabel('Total loan count')
plt.xlabel('Top ten regions')
plt.xticks(rotation = 75)

plt.show()

### Above are the top ten regions in loan amounts. For the top three regions, the number of times the regions got loans is proportional to the loan amount. This does not apply to the other regions.

## What sector got the most loans? Does the number of times a sector is referenced relate to the quantity of loans it got?


In [None]:
nigeria['sector'].unique()

### 7 sectors acquired loans in Nigeria. The number of times the sectors got the loans is as below:

In [None]:
#sector with the most loans
nigeria['sector'].value_counts()

### Let's look at the total loan amount for each sector and compare with the number of times the sectors acquired loans

In [None]:
# Summed loan_amount per sector, largest first, with the total column named
# loan_amount_total.
nigeria_sector_loans = (
    nigeria.groupby('sector')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .rename('loan_amount_total')
    .reset_index()
)
nigeria_sector_loans

In [None]:
# Left: summed loan amount per sector. Right: number of loans per sector.
plt.figure(figsize = [20,10])
plt.subplot(1,2,1)
# Keyword x/y: positional vectors raise TypeError on seaborn >= 0.12.
sns.barplot(x = 'sector', y = 'loan_amount_total', data = nigeria_sector_loans)
plt.title("Nigeria's loan amount per sector")
plt.xlabel('Sector')
plt.ylabel('Loan_amount')
plt.xticks(rotation = 75)

plt.subplot(1,2,2)
sns.countplot(x = nigeria['sector'])
plt.title("Nigeria's Loan Count Per Sector")
plt.xlabel('Sector')
plt.ylabel('Loan Count')
plt.xticks(rotation = 75)
plt.show()

### The agricultural sector leads in both loan amount and loan count. The Education and Personal Use sectors are popular as well. For the top three sectors, the loan amount is evidently proportional to the number of loans.

## For the top sector, what activity had the highest amount of loans? What does that say about that activity?

In [None]:
nigeria['activity'].unique()

In [None]:
nigeria.groupby('sector')['activity'].value_counts()


### For the top sector (Agriculture), farming had the highest number of loans, followed by personal expenses in the Personal Use sector. In conclusion, farming is a popular activity in Nigeria that requires frequent funding. Let's look at the total loan amount for each activity and compare it with the number of loans for the same activity.

In [None]:
# Ten activities with the largest summed loan_amount, largest first.
nigeria_loan_activity = (
    nigeria.groupby('activity')['loan_amount']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
nigeria_loan_activity

In [None]:
# NOTE(review): plotly.express is imported here but `ps` is not used in this
# cell; the import is kept so anything relying on the name keeps working.
# Consider moving it to the import cell or removing it.
import plotly.express as ps
plt.figure(figsize = [10,10])
# Shares are relative to the ten activities in nigeria_loan_activity, not to
# every loan; autopct prints each wedge's percentage so the share can be read
# off the chart instead of estimated by eye.
plt.pie(nigeria_loan_activity['loan_amount'],
        labels = nigeria_loan_activity['activity'],
        autopct = '%1.1f%%')
plt.title("Share of loan amount among Nigeria's top ten activities")

plt.show()

### Farming accounts for roughly half of the loan amount among the top ten activities in Nigeria. We can conclude that farming is a major economic activity in Nigeria and requires a lot of funding. For the top five activities, the loan amount is proportional to the number of loans; this does not hold for the rest of the activities.

### The top ten loan activities were as follows:

In [None]:
# Ten most frequent activities with their loan counts.
top_activity = (
    nigeria[['activity']]
    .value_counts()
    .head(10)
    .reset_index(name='frequency')
)
top_activity

In [None]:
# Pie of the ten most frequent activities. Previously the wedges covered every
# activity and were labelled with raw counts, while the legend listed only the
# top ten names, so labels, legend and title did not describe the same data.
plt.figure(figsize = [15,10])
plt.pie(top_activity['frequency'],
        labels = top_activity['activity'],
        autopct = '%1.1f%%')
plt.title("Nigeria's top loan activities graphical representation")

plt.show()

### Besides having the largest loan amount, farming also has the most borrowers. Farming seems to be a popular activity in Nigeria. It could be that Nigeria is self-sufficient in food production and does not import a lot, or that high unemployment in the formal sector leads people to self-employment in farming.

## What were the numbers between male and female recipients? Does that communicate anything?

In [None]:
def gender_lead(gender):
    """Return the leading gender of a borrower_genders value.

    The value is converted to text and only its first listed gender is
    considered.

    Parameters
    ----------
    gender : any
        Raw cell value, e.g. 'female, female, male', 'male', 'n_a' or NaN.

    Returns
    -------
    str
        'female' if the text starts with 'f', 'male' if it starts with 'm',
        otherwise 'unknown'. Missing values and placeholders ('n_a', 'nan')
        are 'unknown' rather than being counted as male.
    """
    # Normalise whitespace and case so ' Female' and 'female' agree.
    text = str(gender).strip().lower()
    if text.startswith('f'):
        return 'female'
    if text.startswith('m'):
        return 'male'
    return 'unknown'
    

In [None]:
# Label each loan with the leading gender of its borrower_genders value.
nigeria['main_gender'] = nigeria['borrower_genders'].map(gender_lead)

### I created a new column named main_gender on the assumption that the first gender that appears under borrower_genders represents the majority.

In [None]:
nigeria.head(2)

In [None]:
nigeria.value_counts('main_gender')

In [None]:
nigeria.groupby('sector')['main_gender'].value_counts()

In [None]:
# Grouped bar chart: summed loan_amount (estimator = np.sum) per sector, with
# one bar per main_gender value; ci = None turns off the error bars.
plt.figure(figsize = (10,5))

plt.title('Loan Amount by Sector Gender Representation')

# NOTE(review): the tick rotation is requested before the bars are drawn,
# unlike the other plotting cells, which rotate after plotting. Confirm the
# rotated labels survive the barplot call.
plt.xticks(rotation = 75)

sns.barplot(x = 'sector', y = 'loan_amount', data = nigeria, ci = None,
           estimator = np.sum, hue = 'main_gender')

plt.show()

### There were more male borrowers compared to the female ones. The above information breaks down the total gender count for each of the sectors. The male gender seems to be very aggressive in borrowing in Nigeria especially in the personal use, education and agricultural sectors.

## What does the loan amount look like for various repayment intervals? In various sectors? In various activities?

In [None]:
nigeria['repayment_interval'].unique()

In [None]:
nigeria.groupby('sector')['repayment_interval'].value_counts()

In [None]:
# Narrow view of the loan table used by the repayment-interval charts.
interval_columns = ['sector', 'activity', 'region', 'loan_amount', 'repayment_interval']
nigeria_repayment_interval = nigeria[interval_columns]
nigeria_repayment_interval

In [None]:
# Summed loan amount split by repayment interval:
# left panel grouped by sector, right panel grouped by activity.
fig, (ax_sector, ax_activity) = plt.subplots(1, 2, figsize=(10, 5))

ax_sector.set_title("Repayment Intervals for Various Sectors")
sns.barplot(data=nigeria_repayment_interval, x='sector', y='loan_amount',
            hue='repayment_interval', estimator=np.sum, ci=None, ax=ax_sector)
plt.setp(ax_sector.get_xticklabels(), rotation=75)

ax_activity.set_title("Repayment Intervals for Various Activities")
sns.barplot(data=nigeria_repayment_interval, x='activity', y='loan_amount',
            hue='repayment_interval', estimator=np.sum, ci=None, ax=ax_activity)
plt.setp(ax_activity.get_xticklabels(), rotation=90)

plt.show()

## The most dominant field partners.

In [None]:
nigeria['partner_id'].value_counts()

In [None]:
# Attach the field partner name to each loan by looking it up through the
# partner id. Assigning nigeria_2['Field Partner Name'] directly aligns the two
# unrelated tables on their row index, which pairs loan i with whatever theme
# row happens to sit at position i and leaves every other loan as NaN.
# NOTE(review): assumes loan_themes_by_region.csv has a 'Partner ID' column
# matching kiva_loans' partner_id - confirm against the dataset schema.
partner_names = (
    kiva_loan_region
    .dropna(subset = ['Partner ID', 'Field Partner Name'])
    .drop_duplicates(subset = 'Partner ID')
    .set_index('Partner ID')['Field Partner Name']
)
# Use float keys on both sides so 123 and 123.0 match; non-numeric ids (for
# example an 'n_a' placeholder) become NaN and map to no partner.
partner_names.index = partner_names.index.astype(float)
loan_partner_ids = pd.to_numeric(nigeria['partner_id'], errors = 'coerce')
nigeria['partners'] = loan_partner_ids.map(partner_names)
nigeria.head(2)

In [None]:
nigeria['partners'].value_counts()

In [None]:
# Number of loans attributed to each field partner.
ax = sns.countplot(data=nigeria, x='partners')
ax.set_title("Nigeria's Dominant Partners")
ax.set_xlabel('Partner')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()

## The partner names and their total loan amount.

In [None]:
# Summed loan_amount per field partner, one row per partner.
nigeria_partners = nigeria.groupby('partners', as_index=False)['loan_amount'].sum()
nigeria_partners

In [None]:
# Summed loan amount per field partner.
# Keyword x/y: positional vectors raise TypeError on seaborn >= 0.12.
sns.barplot(x = 'partners', y = 'loan_amount', data = nigeria_partners)
plt.title("Nigeria's Partners Loan Amount")
plt.xlabel('Partner')
plt.ylabel('Total Loan Amounts')
plt.xticks(rotation = 75)
plt.show()

## The partner names and their total funded amount.

In [None]:
# Summed funded_amount per field partner, one row per partner.
nigeria_partners_funded = nigeria.groupby('partners', as_index=False)['funded_amount'].sum()
nigeria_partners_funded

In [None]:
# Summed funded amount per field partner.
# Keyword x/y: positional vectors raise TypeError on seaborn >= 0.12.
sns.barplot(x = 'partners', y = 'funded_amount', data = nigeria_partners_funded)
plt.title("Nigeria's Partners Funded Amount")
plt.xlabel('Partner')
plt.ylabel('Total Funded Amounts')
plt.xticks(rotation = 75)
plt.show()

## The partner names and their total number of loans.

In [None]:
nigeria['partners'].value_counts()

In [None]:
# Number of loans per field partner.
ax = sns.countplot(data=nigeria, x='partners')
ax.set_title("Nigeria's Dominant Partners")
ax.set_xlabel('Partner')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()

## Top reasons for giving a loan.

In [None]:
nigeria.head(2)

In [None]:
nigeria['use'].unique()

In [None]:
# Number of loans recorded for each activity.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=nigeria, x='activity', ax=ax)
ax.set_title("Nigeria's Top Loan Reasons")
ax.set_xlabel('Activity')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()

## The distribution of the loan amount.

In [None]:
# Distribution of loan_amount: density histogram with a KDE curve and a rug.
# sns.distplot is deprecated and scheduled for removal; histplot + rugplot
# (seaborn >= 0.11) draw the same three layers.
plt.figure(figsize = (10,5))
sns.histplot(x = nigeria['loan_amount'], bins = 10, stat = 'density', kde = True)
sns.rugplot(x = nigeria['loan_amount'])

plt.show()

## The distribution of the funded amount.

In [None]:
# Distribution of funded_amount: density histogram with a KDE curve and a rug.
# sns.distplot is deprecated and scheduled for removal; histplot + rugplot
# (seaborn >= 0.11) draw the same three layers.
plt.figure(figsize = (10,5))
sns.histplot(x = nigeria['funded_amount'], bins = 10, stat = 'density', kde = True)
sns.rugplot(x = nigeria['funded_amount'])

plt.show()

## The distribution of the repayment term.

In [None]:
# Distribution of term_in_months: density histogram with a KDE curve and a
# rug. sns.distplot is deprecated and scheduled for removal; histplot +
# rugplot (seaborn >= 0.11) draw the same three layers. No bin count was
# given originally, so the automatic binning is kept.
plt.figure(figsize = (10,5))
sns.histplot(x = nigeria['term_in_months'], stat = 'density', kde = True)
sns.rugplot(x = nigeria['term_in_months'])

plt.show()

## The lender count against the funded amount. Explain the obtained results.

In [None]:
# One point per loan: funded amount against number of lenders.
# plt.plot joins consecutive rows with line segments, and the rows are not
# ordered by funded amount, so it draws a meaningless zig-zag; a scatter shows
# the relationship between the two variables.
plt.figure(figsize = (10,5))
plt.scatter(nigeria['funded_amount'], nigeria['lender_count'])
plt.title("Nigeria's Funded Amount vs Lender Count")
plt.xlabel('Funded Amount')
plt.ylabel('Lender Count')
plt.show()

## The distribution of the funded amount by region.

In [None]:
# Summed funded_amount per region, one row per region.
nigeria_region_funded = nigeria.groupby('region', as_index=False)['funded_amount'].sum()
nigeria_region_funded

In [None]:
# Summed funded amount for each region.
plt.figure(figsize = (10,5))
# Keyword x/y: positional vectors raise TypeError on seaborn >= 0.12.
sns.lineplot(x = 'region', y = 'funded_amount', data = nigeria_region_funded)
plt.title("Nigeria's Funded Amount Per Region")
plt.xlabel('Region')
plt.ylabel('Total Funded Amounts')
plt.xticks(rotation = 75)
plt.show()

## The funded amount with the sector as the hue.

In [None]:
# One point per loan (funded amount vs. lender count), coloured by sector.
fig, ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=nigeria, x='funded_amount', y='lender_count', hue='sector', ax=ax)

plt.show()

## A boxplot of regions against funded amount.

In [None]:
# Boxplot of per-loan funded amounts within each region.
# The plot previously used nigeria_region_funded, which holds a single summed
# value per region, so every "box" collapsed to one line and showed no
# distribution. Plotting the loan-level table gives real boxes. x/y are also
# passed by keyword because seaborn >= 0.12 rejects positional vectors.
plt.figure(figsize = (10,5))
sns.boxplot(x = 'region', y = 'funded_amount', data = nigeria)
plt.title("Nigeria's Funded Amount Per Region")
plt.xlabel('Region')
plt.ylabel('Funded Amount')
plt.xticks(rotation = 75)
plt.show()

## The repayment interval for different regions.

In [None]:
nigeria[['region','repayment_interval']]

In [None]:
# Summed loan amount by repayment interval for every region except Kaduna.
outside_kaduna = nigeria_repayment_interval['region'].ne('Kaduna')
nig_repayment_interval = nigeria_repayment_interval[outside_kaduna]
plt.figure(figsize=(10, 5))
plt.title("Repayment Intervals for Regions other Than Kaduna")
sns.barplot(data=nig_repayment_interval, x='region', y='loan_amount',
            hue='repayment_interval', estimator=np.sum, ci=None)
plt.xticks(rotation=75)
plt.show()

In [None]:
# Summed loan amount by repayment interval for Kaduna only.
in_kaduna = nigeria_repayment_interval['region'].eq('Kaduna')
nig_repayment_interval_kaduna = nigeria_repayment_interval[in_kaduna]
plt.figure(figsize=(5, 5))
plt.title("Repayment Intervals for Kaduna")
sns.barplot(data=nig_repayment_interval_kaduna, x='region', y='loan_amount',
            hue='repayment_interval', estimator=np.sum, ci=None)
plt.xticks(rotation=75)
plt.show()

In [None]:
# Summed loan amount by repayment interval across all regions.
plt.figure(figsize=(10, 5))
plt.title("Repayment Intervals for Different Regions")
sns.barplot(data=nigeria_repayment_interval, x='region', y='loan_amount',
            hue='repayment_interval', estimator=np.sum, ci=None)
plt.xticks(rotation=75)
plt.show()

## Boxplots of the loan amount in sectors

In [None]:
# Boxplot of per-loan amounts within each sector.
# The plot previously used nigeria_sector_loans, which holds one summed total
# per sector, so each "box" was a single flat line. Plotting the loan-level
# table shows the actual spread. x/y are also passed by keyword because
# seaborn >= 0.12 rejects positional vectors.
plt.figure(figsize = (10,5))
sns.boxplot(x = 'sector', y = 'loan_amount', data = nigeria)
plt.title("Nigeria's Loan Amount Per Sector")
plt.xlabel('Sector')
plt.ylabel('Loan Amount')
plt.xticks(rotation = 75)
plt.show()

## The number of field partners per region. (MPI dataset)

In [None]:
nigeria_2.columns

In [None]:
nigeria_2.groupby('region')['Field Partner Name'].value_counts()

In [None]:
# Row counts per region in the loan-themes table, split by field partner.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=nigeria_2, x='region', hue='Field Partner Name', ax=ax)
ax.set_title("Nigeria's Field Partners per Region")
ax.set_xlabel('Region')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()

## The number of regions.

In [None]:
nigeria['region'].nunique()

In [None]:
# Number of loans recorded in each region.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=nigeria, x='region', ax=ax)
ax.set_title("Nigeria Regions")
ax.set_xlabel('Region')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()

In [None]:
# NOTE(review): Basemap is not referenced anywhere in the visible notebook -
# presumably a leftover from an abandoned map plot; confirm before removing.
# If mpl_toolkits.basemap is not installed, this cell raises ImportError and
# stops a "Run All".
from mpl_toolkits.basemap import Basemap 

In [None]:
nigeria_2.head(2)

## The number of loans per region.

In [None]:
# Loan count for every region (not only the top ten), most frequent first.
all_region_counts = nigeria['region'].value_counts()
loan_count_region_2 = pd.DataFrame({
    'region': all_region_counts.index,
    'frequency': all_region_counts.values,
})
loan_count_region_2

In [None]:
# Number of loans for each region.
plt.figure(figsize = (10,5))
# Keyword x/y: positional vectors raise TypeError on seaborn >= 0.12.
sns.lineplot(x = 'region', y = 'frequency', data = loan_count_region_2)
plt.title("Nigeria's Loan Count Per Region")
plt.xlabel('Region')
plt.ylabel('Total Loan Count')
plt.xticks(rotation = 75)
plt.show()

## The number of loans per sector.

In [None]:
nigeria['sector'].value_counts()

In [None]:
# Number of loans recorded in each sector.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=nigeria, x='sector', ax=ax)
ax.set_title("Nigeria's Loan Count Per Sector")
ax.set_xlabel('Sector')
ax.set_ylabel('Loan Count')
plt.setp(ax.get_xticklabels(), rotation=75)
plt.show()