<h1 style="text-align:center">Analyzing Loans from Lending Club</h1>

In [129]:
import pandas as pd
pd.options.display.max_columns = 99

## Let's look at the first five rows

In [130]:
# Read only the first five rows to preview the columns without loading the whole file
first_five_rows = pd.read_csv('loans_2007.csv', nrows=5)
first_five_rows

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,,10+ years,RENT,24000.0,Verified,Dec-2011,Fully Paid,n,credit_card,Computer,860xx,AZ,27.65,0.0,Jan-1985,1.0,3.0,0.0,13648.0,83.7%,9.0,f,0.0,0.0,5863.155187,5833.84,5000.0,863.16,0.0,0.0,0.0,Jan-2015,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,Ryder,< 1 year,RENT,30000.0,Source Verified,Dec-2011,Charged Off,n,car,bike,309xx,GA,1.0,0.0,Apr-1999,5.0,3.0,0.0,1687.0,9.4%,4.0,f,0.0,0.0,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,Apr-2013,119.66,Sep-2013,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,,10+ years,RENT,12252.0,Not Verified,Dec-2011,Fully Paid,n,small_business,real estate business,606xx,IL,8.72,0.0,Nov-2001,2.0,2.0,0.0,2956.0,98.5%,10.0,f,0.0,0.0,3005.666844,3005.67,2400.0,605.67,0.0,0.0,0.0,Jun-2014,649.91,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,AIR RESOURCES BOARD,10+ years,RENT,49200.0,Source Verified,Dec-2011,Fully Paid,n,other,personel,917xx,CA,20.0,0.0,Feb-1996,1.0,10.0,0.0,5598.0,21%,37.0,f,0.0,0.0,12231.89,12231.89,10000.0,2214.92,16.97,0.0,0.0,Jan-2015,357.48,Apr-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,University Medical Group,1 year,RENT,80000.0,Source Verified,Dec-2011,Current,n,other,Personal,972xx,OR,17.94,0.0,Jan-1996,0.0,15.0,0.0,27783.0,53.9%,38.0,f,461.73,461.73,3581.12,3581.12,2538.27,1042.85,0.0,0.0,0.0,Jun-2016,67.79,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


## Let's read the first 1000 rows and calculate the amount of memory they need

### Lets define a function so that we can test it out

In [131]:
def find_memeory_usage(filename, numrows):
    """Return the memory, in MB, used by the first `numrows` rows of a CSV file.

    Parameters
    ----------
    filename : path of the CSV file, passed straight to ``pd.read_csv``.
    numrows : number of data rows to read (the ``nrows`` argument).

    Returns
    -------
    The deep memory usage of the loaded frame (index and all columns,
    including the contents of object columns) divided by 1024 ** 2.
    """
    bytes_per_mb = 1024 ** 2
    sample = pd.read_csv(filename, nrows=numrows)
    # deep=True also counts the strings held by object columns
    total_bytes = sample.memory_usage(deep=True).sum()
    return total_bytes / bytes_per_mb

In [132]:
# Count the lines in the CSV file with the shell's wc
!wc -l 'loans_2007.csv'

42539 loans_2007.csv


In [133]:
# Memory (in MB) taken by the first 1000 rows
first_1000_rows_mem_usage = find_memeory_usage('loans_2007.csv', 1000)
first_1000_rows_mem_usage

1.5502090454101562

In [134]:
# Scale the 1000-row measurement linearly to estimate how many rows fit in 5 MB
num_rows = round(1000 * 5/ first_1000_rows_mem_usage)
num_rows

3225.0

> ## We will use 3225 rows per chunk to keep each chunk's memory usage under 5MB

In [135]:
# Check the estimate by measuring the memory (in MB) of the first 3225 rows
find_memeory_usage('loans_2007.csv', 3225)

4.997628211975098

## Lets start Analyzing the dataframe 

In [136]:
# Load the first 1000 rows and list the columns pandas read as object dtype
first_1000_rows = pd.read_csv('loans_2007.csv', nrows=1000)
first_1000_rows.select_dtypes(include=['object']).columns

Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d',
       'last_credit_pull_d', 'application_type'],
      dtype='object')

## Lets calculate the total memory

> ## Handling trailers when iterating

In [137]:
class NextIterator:
    """Iterator wrapper that can tell whether another item is available.

    Wraps any iterable and adds a ``has_next`` property that looks one item
    ahead. The peeked item is kept in an internal one-element buffer and is
    handed out by the following ``next()`` call, so no item is ever lost.

    Parameters
    ----------
    iterator : the iterator (or iterable) to wrap.
    """

    def __init__(self, iterator):
        # iter() returns an iterator unchanged and lets plain iterables work too
        self._iterator = iter(iterator)
        # holds at most one item that has been peeked but not yet returned
        self._buffer = []

    def __iter__(self):
        return self

    @property
    def has_next(self):
        """True if a further item can be obtained with ``next()``.

        Safe to read repeatedly: once an item has been peeked it stays in the
        buffer, so later reads do not consume more of the underlying iterator.
        """
        if self._buffer:
            # already peeked; pulling again would discard the buffered item
            return True
        try:
            self._buffer.append(next(self._iterator))
        except StopIteration:
            return False
        return True

    def __next__(self):
        """Return the peeked item if there is one, else the next underlying item.

        Raises ``StopIteration`` when the underlying iterator is exhausted.
        """
        if self._buffer:
            return self._buffer.pop()
        return next(self._iterator)

In [138]:
# running totals across every chunk
total_memory, num_rows = 0, 0
has_next = True
# number of trailer records at the end of the file
trailers = 2

# stream the file in 3225-row chunks (each chunk stays under 5MB)
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3225)
chunk_next = NextIterator(chunk_iter)

while has_next:
    temp_chunk = next(chunk_next)
    # peek ahead: when nothing follows, this is the last chunk and it carries the trailers
    has_next = chunk_next.has_next
    chunk = temp_chunk if has_next else temp_chunk[:-trailers]

    # accumulate the bytes used and the rows seen
    total_memory += chunk.memory_usage(deep=True).sum()
    num_rows += len(chunk)

# convert bytes to MB
total_memory /= 1024 ** 2

print('Number of rows: {} & Total Memory in MB: {}'. format(num_rows, round(total_memory,2)))

Number of rows: 42536 & Total Memory in MB: 66.23


## Let's parse the dates and clean up the data to work out which columns are numeric and which are strings

In [139]:
# find difference between two lists
def diff_bn_lists(l1, l2):
    """Return the items that appear in exactly one of `l1` and `l2`.

    Duplicates are collapsed and the order of the returned list is not defined,
    because the comparison is done on sets.
    """
    in_exactly_one = set(l1) ^ set(l2)
    return list(in_exactly_one)

In [140]:
# remove non-numeric record from the chunk
def remove_non_num_record(chunk, col):
    """Drop the rows whose `col` value is not numeric and cast `col` to float.

    Parameters
    ----------
    chunk : DataFrame to filter; it is not modified.
    col : name of the column that must hold numeric values.

    Returns
    -------
    A new DataFrame containing only the rows where `col` converts to a number
    (rows with a missing `col` value are dropped as well), with `col` as float.
    """
    # values that cannot be parsed become NaN under errors='coerce'
    is_numeric = pd.to_numeric(chunk[col], errors='coerce').notna()
    cleaned = chunk[is_numeric].copy()
    cleaned[col] = cleaned[col].astype('float')
    return cleaned.copy()

In [141]:
# Accumulators that persist across chunks
numeric_cols, string_cols = [],[]
# column name -> value counts of the non-numeric entries found in that column
nums_in_strings = {}
total_memory = 0
has_next= True
num_rows = 0
# columns whose values carry a '%' sign (stripped and cast to float below)
percent_cols = ['int_rate','revol_util']
# columns parsed below with the '%b-%Y' format (e.g. Dec-2011)
date_cols = ['last_credit_pull_d','issue_d','last_pymnt_d','earliest_cr_line']

# Read the file in chunks of 3225 rows (sized earlier to stay under 5MB each)
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3225)
chunk_next = NextIterator(chunk_iter)

# Walk every chunk, clean it, and record which columns are numeric and which are strings
while has_next:
    temp_chunk = next(chunk_next)
    
    # Only the last chunk carries the trailer records; `trailers` is set in an earlier cell
    if chunk_next.has_next:
        chunk = temp_chunk
    else:
        chunk = temp_chunk.iloc[:-trailers,:].copy()
        has_next = False
        
    # Drop the rows (not columns) whose 'id' is not numeric and cast 'id' to float
    chunk = remove_non_num_record(chunk,'id')
        
    # Strip the '%' sign from the percent columns and convert them to float
    # (str(x) turns a missing value into 'nan', which the float cast maps back to NaN)
    for col in percent_cols:
        chunk[col] = chunk[col].apply(lambda x: str(x).replace('%',''))
        chunk[col] = chunk[col].astype('float')
    
    # Parse the month-year strings into datetimes
    for col in date_cols:
        chunk[col] = pd.to_datetime(chunk[col], format='%b-%Y')
        
    # Split this chunk's columns by dtype
    temp_numeric_cols = list(chunk.select_dtypes(include=['int','float']).columns)
    temp_string_cols = list(chunk.select_dtypes(include=['object']).columns)
    
    # Grow the running list of string columns when this chunk disagrees with it
    if len(diff_bn_lists(string_cols, temp_string_cols)) > 0:
        string_cols = list(set(string_cols).union(set(temp_string_cols)))
        
    # Columns seen as numeric (in this chunk or an earlier one) that are also
    # listed as string columns, i.e. their dtype differs between chunks
    temp_nums_in_string = (set(temp_numeric_cols).union(set(numeric_cols)))\
    .intersection(set(string_cols))
    
    # Treat such columns as strings and record the values that are not numeric
    if len(temp_nums_in_string) > 0:
        
        # drop them from both the running and the per-chunk numeric lists
        numeric_cols = list(set(numeric_cols) - temp_nums_in_string)
        temp_numeric_cols = list(set(temp_numeric_cols) - temp_nums_in_string)
        
       
        # collect this chunk's non-numeric values for each of these columns
        for col in temp_nums_in_string:
            # True where the value cannot be converted to a number (nulls included)
            mask = pd.to_numeric(chunk[col], errors='coerce').isna()
            temp_chunk_col = chunk[col][mask]
            # keep only the real non-numeric values, not the nulls
            temp_chunk_col = temp_chunk_col[~temp_chunk_col.isna()]
            
            # store this chunk's counts; they are summed per value after the loop
            if not col in nums_in_strings:
                nums_in_strings[col] = [temp_chunk_col.value_counts()]
            else:
                nums_in_strings[col].append(temp_chunk_col.value_counts())
     
    # Grow the running list of numeric columns when this chunk disagrees with it
    if len(diff_bn_lists(numeric_cols, temp_numeric_cols)) > 0:
        numeric_cols = list(set(numeric_cols).union(set(temp_numeric_cols)))
        
    # Add this chunk's memory footprint (bytes)
    total_memory += chunk.memory_usage(deep=True).sum()
    
    # Add this chunk's row count
    num_rows += len(chunk)
        
# Combine the per-chunk value counts into one total count per distinct value
for col in nums_in_strings:
    temp_val_col = pd.concat(nums_in_strings[col])
    nums_in_strings[col] = temp_val_col.groupby(temp_val_col.index).sum()
    
# Convert bytes to MB
total_memory = total_memory/(1024 ** 2)

print('Number of String Columns: {0} & Number of Numeric Columns: {1}'.\
      format(len(string_cols), len(numeric_cols)))           

Number of String Columns: 15 & Number of Numeric Columns: 33


## Outliers in numeric columns and the counts

In [142]:
# Non-numeric values (with counts) found in columns whose dtype differed between chunks
nums_in_strings

{}

## Total Memory and Number rows

In [143]:
# Row count and memory (MB) accumulated by the dtype-discovery loop above
print('Number of rows: {} & Total Memory in MB: {}'. format(num_rows, round(total_memory,2)))

Number of rows: 42535 & Total Memory in MB: 52.66


## Lets figure out all the columns which are good candidates for categories

In [144]:
# per-column accumulators, filled chunk by chunk
unique_string_cols, float_no_missing_values = {}, {}
# results derived from the accumulators once every chunk has been read
category_cand_cols, integer_cand_cols = [], []
has_next = True

# stream the file in 3225-row chunks (each chunk stays under 5MB)
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3225)
chunk_next = NextIterator(chunk_iter)

while has_next:
    temp_chunk = next(chunk_next)
    # peek ahead: when nothing follows, this is the last chunk and it carries the trailers
    has_next = chunk_next.has_next
    chunk = temp_chunk if has_next else temp_chunk.iloc[:-trailers,:].copy()

    # drop the rows whose id is not numeric
    chunk = remove_non_num_record(chunk,'id')

    # remember the distinct values of every string column in this chunk
    for col in string_cols:
        unique_string_cols.setdefault(col, []).append(pd.Series(chunk[col].unique()))

    # remember how many nulls every numeric column has in this chunk
    for col in numeric_cols:
        float_no_missing_values.setdefault(col, []).append(pd.Series(chunk[col].isna().sum()))

# a string column is a category candidate when its distinct values
# number fewer than half of the rows
for col, per_chunk_uniques in unique_string_cols.items():
    all_uniques = pd.concat(per_chunk_uniques).unique()
    unique_string_cols[col] = all_uniques
    if len(all_uniques) < num_rows/2:
        category_cand_cols.append(col)

# a numeric column is an integer candidate when it has no nulls at all
for col, per_chunk_nulls in float_no_missing_values.items():
    null_total = pd.concat(per_chunk_nulls).sum()
    float_no_missing_values[col] = null_total
    if null_total == 0:
        integer_cand_cols.append(col)

print(category_cand_cols)

['verification_status', 'title', 'addr_state', 'zip_code', 'application_type', 'initial_list_status', 'term', 'purpose', 'loan_status', 'sub_grade', 'grade', 'home_ownership', 'pymnt_plan', 'emp_length']


In [145]:
# columns that should not become categories even if they passed the uniqueness test
remove_cat_cols = ['last_credit_pull_d','issue_d','zip_code','last_pymnt_d','earliest_cr_line','int_rate','revol_util']
for col in remove_cat_cols:
    try:
        category_cand_cols.remove(col)
    except ValueError:
        # not a candidate in the first place; nothing to remove
        pass

In [146]:
# Category candidates that remain after the removals above
print(category_cand_cols)

['verification_status', 'title', 'addr_state', 'application_type', 'initial_list_status', 'term', 'purpose', 'loan_status', 'sub_grade', 'grade', 'home_ownership', 'pymnt_plan', 'emp_length']


In [147]:
# Replace the integer candidates computed from the null counts with a hand-picked list
integer_cand_cols = ['id','member_id']
print(integer_cand_cols)

['id', 'member_id']


In [148]:
# Threshold used above: a string column qualifies as a category below this many unique values
print(num_rows/2)

21267.5


In [149]:
# report how many distinct values each string column holds
for col, uniques in unique_string_cols.items():
    print("Number of Unique Values for '{}': {}".format(col, len(uniques)))

Number of Unique Values for 'verification_status': 3
Number of Unique Values for 'title': 21265
Number of Unique Values for 'addr_state': 50
Number of Unique Values for 'zip_code': 837
Number of Unique Values for 'emp_title': 30659
Number of Unique Values for 'application_type': 1
Number of Unique Values for 'initial_list_status': 1
Number of Unique Values for 'term': 2
Number of Unique Values for 'purpose': 14
Number of Unique Values for 'loan_status': 9
Number of Unique Values for 'sub_grade': 35
Number of Unique Values for 'grade': 7
Number of Unique Values for 'home_ownership': 5
Number of Unique Values for 'pymnt_plan': 2
Number of Unique Values for 'emp_length': 12


In [150]:
# report how many nulls each numeric column holds
for col, null_total in float_no_missing_values.items():
    print("Number of Null Values for '{}': {}".format(col, null_total))

Number of Null Values for 'chargeoff_within_12_mths': 145
Number of Null Values for 'member_id': 0
Number of Null Values for 'inq_last_6mths': 29
Number of Null Values for 'installment': 0
Number of Null Values for 'acc_now_delinq': 29
Number of Null Values for 'tax_liens': 105
Number of Null Values for 'annual_inc': 4
Number of Null Values for 'total_rec_late_fee': 0
Number of Null Values for 'collection_recovery_fee': 0
Number of Null Values for 'total_pymnt_inv': 0
Number of Null Values for 'open_acc': 29
Number of Null Values for 'out_prncp_inv': 0
Number of Null Values for 'total_pymnt': 0
Number of Null Values for 'revol_bal': 0
Number of Null Values for 'funded_amnt': 0
Number of Null Values for 'pub_rec': 29
Number of Null Values for 'total_rec_int': 0
Number of Null Values for 'funded_amnt_inv': 0
Number of Null Values for 'recoveries': 0
Number of Null Values for 'revol_util': 90
Number of Null Values for 'policy_code': 0
Number of Null Values for 'last_pymnt_amnt': 0
Number 

## Lets convert the string columns to categories

In [151]:
# reset the running totals
total_memory, num_rows = 0, 0
has_next = True

# stream the file in 3225-row chunks (each chunk stays under 5MB)
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3225)
chunk_next = NextIterator(chunk_iter)

while has_next:
    temp_chunk = next(chunk_next)
    # peek ahead: when nothing follows, this is the last chunk and it carries the trailers
    has_next = chunk_next.has_next
    chunk = temp_chunk if has_next else temp_chunk.iloc[:-trailers,:].copy()

    # drop the rows whose id is not numeric
    chunk = remove_non_num_record(chunk,'id')

    # convert every category candidate in this chunk
    for col in category_cand_cols:
        chunk[col] = chunk[col].astype('category')

    # accumulate the bytes used and the rows seen
    total_memory += chunk.memory_usage(deep=True).sum()
    num_rows += len(chunk)

# convert bytes to MB
total_memory /= 1024 ** 2

print('Number of rows: {} & Total Memory in MB: {}'. format(num_rows, round(total_memory,2)))

Number of rows: 42535 & Total Memory in MB: 35.07


## Let's convert the numeric columns too and see the memory used

In [152]:
# reset the running totals
total_memory, num_rows = 0, 0
has_next = True

# stream the file in 3225-row chunks (each chunk stays under 5MB)
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3225)
chunk_next = NextIterator(chunk_iter)

while has_next:
    temp_chunk = next(chunk_next)
    # peek ahead: when nothing follows, this is the last chunk and it carries the trailers
    has_next = chunk_next.has_next
    chunk = temp_chunk if has_next else temp_chunk.iloc[:-trailers,:].copy()

    # drop the rows whose id is not numeric
    chunk = remove_non_num_record(chunk,'id')

    # percent strings -> float
    for col in percent_cols:
        chunk[col] = chunk[col].apply(lambda x: str(x).replace('%','')).astype('float')

    # month-year strings -> datetime
    for col in date_cols:
        chunk[col] = pd.to_datetime(chunk[col], format='%b-%Y')

    # low-cardinality strings -> category
    for col in category_cand_cols:
        chunk[col] = chunk[col].astype('category')

    # id-like columns -> smallest integer type that fits
    for col in integer_cand_cols:
        chunk[col] = pd.to_numeric(chunk[col].astype('int'), downcast='integer')

    # remaining numeric columns -> smallest float type that fits
    for col in set(numeric_cols)- set(integer_cand_cols):
        chunk[col] = pd.to_numeric(chunk[col].astype('float'), downcast='float')

    # accumulate the bytes used and the rows seen
    total_memory += chunk.memory_usage(deep=True).sum()
    num_rows += len(chunk)

# convert bytes to MB
total_memory /= 1024 ** 2

print('Number of rows: {} & Total Memory in MB: {}'. format(num_rows, round(total_memory,2)))

Number of rows: 42535 & Total Memory in MB: 16.02


> # Reduced the memory footprint roughly fourfold, from 66MB to 16MB

In [153]:
# Inspect the dtypes of the last chunk processed by the loop above
chunk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 611 entries, 41925 to 42535
Data columns (total 52 columns):
id                            611 non-null int32
member_id                     611 non-null int32
loan_amnt                     611 non-null float32
funded_amnt                   611 non-null float32
funded_amnt_inv               611 non-null float32
term                          611 non-null category
int_rate                      611 non-null float32
installment                   611 non-null float32
grade                         611 non-null category
sub_grade                     611 non-null category
emp_title                     573 non-null object
emp_length                    611 non-null category
home_ownership                611 non-null category
annual_inc                    607 non-null float32
verification_status           611 non-null category
issue_d                       611 non-null datetime64[ns]
loan_status                   611 non-null category
pymnt_plan   

In [154]:
# Show the final rows of the last chunk after all conversions
chunk.tail()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
42531,73582,73096,3500.0,3500.0,225.0,36 months,10.28,113.389999,C,C1,,< 1 year,RENT,180000.0,Not Verified,2007-06-01,Does not meet the credit policy. Status:Fully ...,n,other,Wedding coming up,100xx,NY,10.0,,NaT,,,,0.0,,,f,0.0,0.0,3719.431152,239.110001,3500.0,219.429993,0.0,0.0,0.0,2008-03-01,0.0,2013-02-01,,1.0,INDIVIDUAL,,,,,
42532,72998,72992,1000.0,1000.0,0.0,36 months,9.64,32.110001,B,B4,Halping hands company inc.,< 1 year,RENT,12000.0,Not Verified,2007-06-01,Does not meet the credit policy. Status:Fully ...,n,other,delight,021xx,MA,10.0,,NaT,,,,0.0,,,f,0.0,0.0,1155.600952,0.0,1000.0,155.600006,0.0,0.0,0.0,2010-06-01,32.41,2014-09-01,,1.0,INDIVIDUAL,,,,,
42533,72176,70868,2525.0,2525.0,225.0,36 months,9.33,80.690002,B,B3,,< 1 year,RENT,110000.0,Not Verified,2007-06-01,Does not meet the credit policy. Status:Fully ...,n,other,Car repair bill,100xx,NY,10.0,,NaT,,,,0.0,,,f,0.0,0.0,2904.498779,258.820007,2525.0,379.5,0.0,0.0,0.0,2010-06-01,82.029999,2007-05-01,,1.0,INDIVIDUAL,,,,,
42534,71623,70735,6500.0,6500.0,0.0,36 months,8.38,204.839996,A,A5,,< 1 year,NONE,,Not Verified,2007-06-01,Does not meet the credit policy. Status:Fully ...,n,other,Buying a car,100xx,NY,4.0,,NaT,,,,0.0,,,f,0.0,0.0,7373.904785,0.0,6500.0,873.900024,0.0,0.0,0.0,2010-06-01,205.320007,2007-08-01,,1.0,INDIVIDUAL,,,,,
42535,70686,70681,5000.0,5000.0,0.0,36 months,7.75,156.110001,A,A3,Homemaker,10+ years,MORTGAGE,70000.0,Not Verified,2007-06-01,Does not meet the credit policy. Status:Fully ...,n,other,Aroundthehouse,068xx,CT,8.81,,NaT,,,,0.0,,,f,0.0,0.0,5619.762207,0.0,5000.0,619.76001,0.0,0.0,0.0,2010-06-01,156.389999,2015-02-01,,1.0,INDIVIDUAL,,,,,
