# Experimental simple performance testing notebook for Pandas
- testing and comparing simple dataframe / sql operations of commong data (pre-)processing tasks 
- various available single-machine Python solutions are to be tested: Pandas, PySpark, Turi Create and Dask.
- execution times, CPU load and maximal memory use should be tracked

## Kiva dataset 
- [Kiva](https://www.kaggle.com/gaborfodor/additional-kiva-snapshot): crowdfunding data with lenders and loans, with additional geographic data
- Pandas Kiva examples: https://www.kaggle.com/gaborfodor/additional-kiva-snapshot/kernels?sortBy=hotness&group=everyone&pageSize=20&datasetId=14345&language=Python


## imports, setup

In [1]:
import timeit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('fivethirtyeight')
%matplotlib inline

## read files to dataframes: loans and lenders

In [2]:
start = timeit.default_timer()
full_start = timeit.default_timer()

loans_df = pd.read_csv("../../kiva/loans.csv", parse_dates=['raised_time']) # 2.1 GB, 1.419.607 lines
print('loans: ', loans_df.shape)

lenders_df = pd.read_csv("../../kiva/lenders.csv") #, parse_dates=['raised_time'])  # 130 MB file
print('lenders: ', lenders_df.shape)

print('ellapsed time: ', timeit.default_timer() - start)

loans:  (1419607, 31)
lenders:  (2349174, 11)
ellapsed time:  36.91002261500398


## read, transform and count loan_lenders 
string enumeration to rows: split tuple strings to array, then explode the array to rows


In [3]:
# version 1: this one is the best option
# https://stackoverflow.com/questions/32468402/how-to-explode-a-list-inside-a-dataframe-cell-into-separate-rows
start = timeit.default_timer()

loans_lenders_raw_df = pd.read_csv("../../kiva/loans_lenders.csv", #nrows=200000, 
                                   dtype={'loan_id': np.int32, 'lenders': object})
# full: 339 MB file, 1.387.433 lines --> 6.3GB, 28.293.931 lines, 388 sec
# 100.000 heading line --> 2.060.259 output lines
# 200.000 heading line --> 4.110.948 output lines, 1.1 GB mem

loans_lenders_raw_df.reset_index(inplace=True)
rows = []
_ = loans_lenders_raw_df.apply(lambda row: [rows.append([row['loan_id'], nn]) 
                         for nn in str.split(str.replace(row.lenders, ' ', ''), ',')], axis=1)

loans_lenders_df = pd.DataFrame(rows, columns=['loan_id', 'lender']) #.set_index(['loan_id', 'lender'])

print('ellapsed time: ', timeit.default_timer() - start)
loans_lenders_df.head(5)

ellapsed time:  373.1914171449898


Unnamed: 0,loan_id,lender
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499


## join, filter and sort loan and lender data
get distinct joined lines with renamed columns, then write to an output file (for fully materialized results)
- filtering on lenders.country_code: 
  - 'US': 25% of lenders
  - 'CA': 3% of lenders --> 3.5 GB joined

In [4]:
start = timeit.default_timer()

# filter unique lenders: CA: 67.970
lenders_df = lenders_df[lenders_df['country_code']=='CA']

print('ellapsed time: ', timeit.default_timer() - start)

ellapsed time:  0.3694360749941552


In [5]:
start = timeit.default_timer()

# join: 
joined_df_1 = pd.merge(loans_lenders_df, lenders_df, left_on='lender', right_on='permanent_name')
joined_df = pd.merge(joined_df_1, loans_df, on='loan_id')

#joined_df.to_csv('../../kiva/pandas-result-joined.csv') # 3.7 GB

print('ellapsed time: ', timeit.default_timer() - start)

ellapsed time:  22.023902241999167


In [6]:
joined_df.head(2)

Unnamed: 0,loan_id,lender,permanent_name,display_name,city,state,country_code_x,member_since,occupation,loan_because,...,raised_time,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model
0,483693,alan5513,alan5513,Alan,Edmonton,Alberta,CA,1237674751,,A small loan is an investment in enabling some...,...,2012-11-16 18:51:23,8.0,44,2,1,,female,True,irregular,field_partner
1,483693,nick9464,nick9464,nick and anna,victoria,BC,CA,1277253851,self employed,it helps those who need help,...,2012-11-16 18:51:23,8.0,44,2,1,,female,True,irregular,field_partner


## group and sort joined data

* group by on the exploded loans_lenders table (6 GB): count distinct loan_id by lender

In [7]:
start = timeit.default_timer()

lender_loan_count_df = loans_lenders_df.groupby(['lender'], as_index=True)['loan_id'].nunique() \
#    .sort_values(by='sector_name', ascending=False)

print('ellapsed time: ', timeit.default_timer() - start)
print(lender_loan_count_df.shape)

lender_loan_count_df.head(5)

print('full ellapsed time: ', timeit.default_timer() - full_start)

ellapsed time:  17.453769783998723
(1383799,)
full ellapsed time:  450.0331337139942
