In [1]:
# IMPORTING PACKAGES 

# shows in jupyter notebook
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil.parser

# Display the value of every top-level expression in a cell (not only the
# last one), so a single cell can show several outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Pandas display settings: render at most 80 columns of a frame, and allow the
# repr of a wide frame to expand across multiple lines.
pd.options.display.max_columns = 80
pd.options.display.expand_frame_repr = True

In [2]:
# LOAD THE DATASET
# Source: http://build.kiva.org (about 900MB compressed).
# Only the first 10000 rows of the CSV are read into memory.

path = "kiva_data/"
filename = "loans.csv"

df = pd.read_csv(f"{path}{filename}", nrows=10000)

In [3]:
# Preview three randomly chosen rows of the data. The fixed random_state makes
# the same rows come back on every run, so the displayed output is reproducible
# after a kernel restart.
df.sample(n=3, random_state=42)

Unnamed: 0,LOAN_ID,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,FUNDED_AMOUNT,LOAN_AMOUNT,STATUS,IMAGE_ID,VIDEO_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,PARTNER_ID,POSTED_TIME,PLANNED_EXPIRATION_TIME,DISBURSE_TIME,RAISED_TIME,LENDER_TERM,NUM_LENDERS_TOTAL,NUM_JOURNAL_ENTRIES,NUM_BULK_ENTRIES,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
7972,894916,Cecelia,English,Cecelia works hard to support her family. Cece...,Cecelia works hard to support her family. Cece...,450,450,funded,1894354,,Food Production/Sales,Food,"to buy ingredients like meat, fish, chicken me...",PH,Philippines,"Roxas City, Capiz",shared,0.1,PHP,145,2015-06-01 05:01:37.000 +0000,2015-07-03 17:50:03.000 +0000,2015-05-05 07:00:00.000 +0000,2015-06-04 11:30:53.000 +0000,7,2,1,1,,Cecelia,female,True,irregular,field_partner
5692,1119372,Hamidah,English,"Good day, lenders! Meet one of KSPPS BMI’s cl...","Good day, lenders! Meet one of KSPPS BMI’s cl...",175,175,funded,2257487,,Personal Housing Expenses,Housing,To build a clean water source at her home.,ID,Indonesia,Tangerang,shared,0.1,IDR,406,2016-07-20 02:42:01.000 +0000,2016-08-28 01:10:03.000 +0000,2016-06-30 07:00:00.000 +0000,2016-07-31 04:06:20.000 +0000,37,6,1,1,"#Elderly, #Eco-friendly, #Health and Sanitatio...",Hamidah,female,True,irregular,field_partner
8848,991150,Zahid,English,"Zahid is 57 years old. He is refugee, living i...","Zahid is 57 years old. He is refugee, living i...",1150,1150,funded,2045985,,Livestock,Agriculture,to purchase sheep,AZ,Azerbaijan,Bilasuvar,shared,0.1,AZN,56,2015-12-07 17:53:14.000 +0000,2016-01-08 18:30:02.000 +0000,2015-11-26 08:00:00.000 +0000,2016-01-08 18:17:23.000 +0000,20,39,4,2,"#Animals, #Elderly, #Refugee, user_favorite",Zahid,male,True,monthly,field_partner


In [4]:
# Names of all columns in the frame, shown as a plain Python list
df.columns.to_list()

['LOAN_ID',
 'LOAN_NAME',
 'ORIGINAL_LANGUAGE',
 'DESCRIPTION',
 'DESCRIPTION_TRANSLATED',
 'FUNDED_AMOUNT',
 'LOAN_AMOUNT',
 'STATUS',
 'IMAGE_ID',
 'VIDEO_ID',
 'ACTIVITY_NAME',
 'SECTOR_NAME',
 'LOAN_USE',
 'COUNTRY_CODE',
 'COUNTRY_NAME',
 'TOWN_NAME',
 'CURRENCY_POLICY',
 'CURRENCY_EXCHANGE_COVERAGE_RATE',
 'CURRENCY',
 'PARTNER_ID',
 'POSTED_TIME',
 'PLANNED_EXPIRATION_TIME',
 'DISBURSE_TIME',
 'RAISED_TIME',
 'LENDER_TERM',
 'NUM_LENDERS_TOTAL',
 'NUM_JOURNAL_ENTRIES',
 'NUM_BULK_ENTRIES',
 'TAGS',
 'BORROWER_NAMES',
 'BORROWER_GENDERS',
 'BORROWER_PICTURED',
 'REPAYMENT_INTERVAL',
 'DISTRIBUTION_MODEL']

In [5]:
# Data type of the "LOAN_ID" column, looked up in the per-column dtype table
df.dtypes["LOAN_ID"]

dtype('int64')

In [6]:
# Dimensions of the frame via the .shape attribute: (rows, columns),
# i.e. (observations, features), followed by the same numbers as a sentence.
df.shape
print(f"There are {df.shape[0]} observations and {df.shape[1]} features")

(10000, 34)

There are 10000 observations and 34 features


In [7]:
# DESCRIBING CATEGORICAL (STRING) FEATURES

# Names of all columns whose dtype is "object" (strings in this case).
categorical = df.dtypes[df.dtypes == "object"].index

# One random row restricted to those columns. The fixed random_state keeps the
# displayed row the same on every run, so the output is reproducible.
df[categorical].sample(random_state=42)

# Summary statistics for the object columns:
#   count:  number of non-missing values in the column
#   unique: number of distinct values in the column
#   top:    the most frequent value in the column
#   freq:   how many times the most frequent value occurs
df[categorical].describe()

Unnamed: 0,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,STATUS,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY,POSTED_TIME,PLANNED_EXPIRATION_TIME,DISBURSE_TIME,RAISED_TIME,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
2042,Pacita,English,Pacita is 60 years old and married with six ch...,Pacita is 60 years old and married with six ch...,funded,Farming,Agriculture,to buy fertilizers and other farm supplies.,PH,Philippines,"Himamaylan, Negros Occidental",shared,PHP,2017-03-20 04:19:54.000 +0000,2017-04-22 04:40:04.000 +0000,2017-02-27 08:00:00.000 +0000,2017-03-23 15:22:58.000 +0000,"#Woman Owned Biz, #Elderly",Pacita,female,True,irregular,field_partner


Unnamed: 0,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,STATUS,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY,POSTED_TIME,PLANNED_EXPIRATION_TIME,DISBURSE_TIME,RAISED_TIME,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
count,9681,9708,9708,6407,10000,10000,10000,9708,10000,10000,8928,10000,10000,10000,9628,10000,9567,4274,9681,9708,9708,10000,10000
unique,6891,7,9708,6407,4,144,15,8570,75,75,2124,2,64,10000,9348,1261,9487,1898,7049,327,196,3,2
top,Anonymous Group,English,"Agée de 37 ans, Louise est mariée .Le nombre ...",Gloria is a married woman with four children. ...,funded,General Store,Agriculture,to build a sanitary toilet for her family.,PH,Philippines,Lahore,shared,PHP,2016-09-22 05:53:50.000 +0000,2013-03-03 09:18:09.000 +0000,2014-01-22 08:00:00.000 +0000,2017-12-13 03:41:04.000 +0000,user_favorite,Evelyn,female,true,monthly,field_partner
freq,62,6666,1,1,9559,997,2387,51,2487,2487,92,8688,2487,1,3,41,3,401,29,6399,8402,5083,9991
