In [103]:
import pandas as pd
import numpy as np
import os
import xlrd
from functools import reduce

To read in these large Excel files, we first extract the names of their sheets.

In [10]:
# Directory that holds the rent-roll workbooks. Set the EDEN_RENT_ROLL_DIR
# environment variable to point somewhere else; when it is unset the original
# local folder is used, so existing runs resolve to the same paths as before.
DATA_DIR = os.environ.get(
    "EDEN_RENT_ROLL_DIR",
    "/Users/quinnunderriner/Desktop/Work/eden_rent_roll",
)

# file1: May-Sept 2020 workbook; file2: May-Sept 2019 workbook.
file1 = os.path.join(DATA_DIR, "TenantLeasePayments_Trending_(Active)_May_2020_Sept_2020.xlsx")
file2 = os.path.join(DATA_DIR, "Tenant_Lease_Payments_May 2019_Sept_2019.xlsx")

In [263]:
# Open the 2020 workbook just to list its tabs.
# NOTE(review): xlrd 2.0+ only reads legacy .xls files - confirm the installed
# xlrd version can still open this .xlsx workbook.
xls = xlrd.open_workbook(file1, on_demand=True)

# Tabs whose name ends in 'Summary' are left out; every other tab is kept.
may2020_sept2020 = [
    tab_name for tab_name in xls.sheet_names()
    if not tab_name.endswith('Summary')
]

In [271]:
# Some months have more than one tab. Drop every tab whose name contains
# "15" or "10" (the earlier-in-month data) so the doubled months are not
# counted twice.
may2020_sept2020 = [
    tab_name for tab_name in may2020_sept2020
    if "15" not in tab_name and "10" not in tab_name
]


In [17]:
# List the tabs of the 2019 workbook; no tabs are filtered out here.
# NOTE(review): same xlrd/.xlsx version caveat as for the 2020 workbook - confirm.
xls2 = xlrd.open_workbook(filename=file2, on_demand=True)
may19_sept19 = xls2.sheet_names()

['May 31', 'June 30', 'July 31', 'August 31', 'Sept. 30']


In [192]:
def read_in(file,sheet):
    """
    Load one monthly sheet and return its payment columns labelled with the period.

    inputs:
        file (str): filepath of excel file
        sheet (str): name of sheet in excel file
    returns:
        df (dataframe): the sheet's data rows with the columns
            "Tenant Lease Charge <period>", "Is Subsidized? <period>",
            "Fixed Income? <period>", "Tenant Rent Charge <period>",
            "Tenant Percent Collected <period>" and "Tenant".
    raises:
        ValueError: if the sheet's "Period" column has no non-null value.

    The real column names are taken from the fifth parsed row; the rows above
    it and the last two rows (summary info) are discarded. <period> is the
    first non-null value of the sheet's "Period" column, and the "Period"
    column itself is dropped from the result.
    """
    raw = pd.read_excel(file, sheet_name=sheet)
    header = raw.iloc[4]  # fifth parsed row holds the real column names

    # Keep only the data rows: everything below the header row, minus the two
    # summary rows at the bottom, renumbered from 0.
    df = raw.iloc[5:-2].reset_index(drop=True)
    df.columns = header
    df = df[['Tenant Lease Charge', 'Is Subsidized?', 'Fixed Income?', 'Tenant Rent Collected',
             'Percent Collected', "Period", "Tenant"]]

    # Label with the first non-null period instead of the value at a fixed row
    # position, so short sheets (fewer than six data rows) no longer fail.
    periods = df["Period"].dropna()
    if periods.empty:
        raise ValueError(
            "Sheet {!r} of {!r} has no 'Period' value to label its columns with".format(sheet, file)
        )
    period = periods.iloc[0]

    # NOTE(review): 'Tenant Rent Collected' is relabelled 'Tenant Rent Charge <period>'.
    # That looks like it was meant to read 'Collected'; the name is kept as-is so
    # existing consumers of these columns keep working - confirm before renaming.
    df = df.rename(columns={
        "Tenant Lease Charge": f"Tenant Lease Charge {period}",
        "Tenant Rent Collected": f"Tenant Rent Charge {period}",
        "Percent Collected": f"Tenant Percent Collected {period}",
        'Is Subsidized?': f"Is Subsidized? {period}",
        'Fixed Income?': f"Fixed Income? {period}",
    })
    return df.drop(columns=["Period"])

In [154]:
def read_in_check_status(file,sheet):
    """
    Load one sheet and return only each tenant's subsidy / fixed-income flags.

    inputs:
        file (str): filepath of excel file
        sheet (str): name of sheet in excel file
    returns:
        df (dataframe): columns 'Tenant', 'Is Subsidized?' and 'Fixed Income?',
            one row per data row of the sheet, indexed from 0.

    The column names are taken from the fifth parsed row; the rows above it
    and the last two rows (summary info) are discarded.
    """
    raw = pd.read_excel(file, sheet_name=sheet)
    column_names = raw.iloc[4]  # fifth parsed row holds the real column names

    # Data rows only: below the header row, without the two trailing summary rows.
    status = raw.iloc[5:-2].reset_index(drop=True)
    status.columns = column_names

    wanted = ['Tenant', 'Is Subsidized?', 'Fixed Income?']
    return status[wanted]

In [310]:
def make_header(df):
    """
    Promote the fifth row of a raw sheet to its column names.

    inputs:
        df (dataframe): sheet as parsed, with the real header on its fifth row
    returns:
        df (dataframe): the rows below the header row, with that row's values
            as column names; the original row index is kept and no trailing
            rows are removed.
    """
    column_names = df.iloc[4]  # the row at position 4 carries the real column names
    body = df[5:]              # everything below the header row
    body.columns = column_names
    return body

In [276]:
# The first 2019 sheet is loaded with every column intact: it is the only frame
# that supplies the descriptor columns (and the un-suffixed 'Is Subsidized?' /
# 'Fixed Income?' flags) that the later cells treat as fixed per tenant.
# NOTE(review): make_header does not trim the two trailing summary rows that
# read_in drops - presumably the merge on 'Tenant' removes them; confirm.
df = make_header(pd.read_excel(file2, sheet_name=may19_sept19[0]))

# Give this month's payment columns the same "<name> <period>" labels that
# read_in produces for the other months. Without the suffix, the first month's
# "Percent Collected" column carries no period, and the later year extraction
# (last two characters of the column name) would yield "ed" for it.
first_period = df["Period"].dropna().iloc[0]
df = df.rename(columns={
    "Tenant Lease Charge": f"Tenant Lease Charge {first_period}",
    "Tenant Rent Collected": f"Tenant Rent Charge {first_period}",
    "Percent Collected": f"Tenant Percent Collected {first_period}",
})

# Remaining months: 2019 sheets 1-4 from file2, then 2020 sheets 0-4 from file1,
# each reduced to its period-suffixed payment columns by read_in.
remaining_sheets = (
    [(file2, may19_sept19[i]) for i in range(1, 5)]
    + [(file1, may2020_sept2020[i]) for i in range(5)]
)
df_list = [df] + [read_in(path, sheet_name) for path, sheet_name in remaining_sheets]

In [278]:
# Fold every frame in df_list into one wide table, pairwise from left to right,
# joining each pair on the shared 'Tenant' column.
def _merge_on_tenant(left, right):
    """Join two frames on their common 'Tenant' column."""
    return pd.merge(left, right, on='Tenant')

df_final = reduce(_merge_on_tenant, df_list)

# 'Name' and 'Period' are not carried into the merged table.
df_final = df_final.drop(columns=["Name", "Period"])

In [281]:
# Save the merged wide table to disk before any reshaping.
initial_csv_name = "Inital_eden_rent_roll_data_19-20_v2.csv"
df_final.to_csv(initial_csv_name)

We need to melt the dataset so that each row holds the percent of rent collected from one tenant in a given month; this long format is what lets us run groupbys.

In [212]:
# Columns treated as fixed per tenant; their values come from the first month
# in the dataset.
fixed_cols = [
    'Property',
    'Property Name',
    'City',
    'County',
    'Unit',
    'Tenant',
    'Is Subsidized?',
    'Fixed Income?',
]

In [286]:
# Names of the monthly percent-collected columns: every df_final column whose
# name contains "Percent".
percent_collected = [column for column in df_final.columns if "Percent" in column]

In [291]:
# Keep only the fixed columns plus the monthly percent columns so the frame can
# be melted. For this initial analysis, fixed-income status and the other fixed
# columns are assumed not to change from month to month.
collect = [*fixed_cols, *percent_collected]
df_rent_collected = df_final.loc[:, collect]

In [292]:
# Reshape wide -> long so each row is one payment: the fixed columns are kept as
# identifiers, the source column name lands in 'Percent Collected', and the
# percentage itself lands in the default 'value' column.
monthly_percent_collected = df_rent_collected.melt(
    id_vars=fixed_cols,
    var_name='Percent Collected',
)

In [293]:
# Total number of payment rows in the melted dataset (90,640 when this was run).
monthly_percent_collected.shape[0]

90640

In [295]:
# Grouping by year needs a year column: take the last two characters of the
# source column name stored in 'Percent Collected'.
monthly_percent_collected["year"] = monthly_percent_collected["Percent Collected"].str.slice(start=-2)

In [301]:
# Number of payments whose percent collected is above 100.
over_full_mask = monthly_percent_collected["value"] > 100
len(monthly_percent_collected[over_full_mask])

14577

In [308]:
# Share of all payments above 100 percent (about 16 percent when this was run),
# which implies tenants paying back rent.
n_over_full = len(monthly_percent_collected[monthly_percent_collected["value"] > 100])
n_over_full / len(monthly_percent_collected)

0.16082303618711385

In [302]:
# Number of payments with a negative percent collected.
negative_mask = monthly_percent_collected["value"] < 0
len(monthly_percent_collected[negative_mask])

63

In [309]:
# Share of all payments that are negative.
n_negative = len(monthly_percent_collected[monthly_percent_collected["value"] < 0])
n_negative / len(monthly_percent_collected)

0.0006950573698146514

In [323]:
# Negative percentages are believed to be errors, so keep only the rows whose
# value is zero or greater.
non_negative_mask = monthly_percent_collected["value"] >= 0
monthly_percent_collected = monthly_percent_collected[non_negative_mask]

In [327]:
# Cast 'value' to float so it works with groupby/describe. assign() builds a new
# frame rather than writing into the filtered slice, which avoids the
# SettingWithCopyWarning that attribute assignment raised here.
monthly_percent_collected = monthly_percent_collected.assign(
    value=monthly_percent_collected["value"].astype(float)
)

# Per-year summary statistics, restricted to payments of at most 100 percent.
at_most_full = monthly_percent_collected["value"] <= 100
monthly_percent_collected.loc[at_most_full, ["year", "value"]].groupby("year").describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0_level_0,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
19,37847.0,78.487889,40.651772,0.0,98.735,100.0,100.0,100.0
20,38153.0,91.361198,26.732037,0.0,100.0,100.0,100.0,100.0


In [329]:
# Count the non-payments (value equal to 0) in each year.
def _count_zero_payments(values):
    """Number of entries in `values` that equal 0."""
    return values.eq(0).sum()

monthly_percent_collected[["year", "value"]].groupby('year').agg(_count_zero_payments)

Unnamed: 0_level_0,value
year,Unnamed: 1_level_1
19,7898.0
20,2727.0


In [330]:
# Difference between the two yearly non-payment counts shown in the table above
# (year 19 minus year 20); the figures are copied by hand from that output.
zero_payments_2019 = 7898.0
zero_payments_2020 = 2727.0
zero_payments_2019 - zero_payments_2020

5171.0

In [None]:
Count of zero-payment months by year.

TODO: if I can get the dates ordered correctly, this could make a nice line chart.