In [1]:
import numpy as np
import pandas as pd

# Directory (relative to the notebook) that holds the dummy CSV files.
DUMMY_DATA_PATH = 'dataset/dummy/'
# CSV read into `bank_df` and CSV read into `main_df`, both under DUMMY_DATA_PATH.
DUMMY_BANK_DATA = '{}BSA.csv'.format(DUMMY_DATA_PATH)
DUMMY_MAIN_DATA = '{}data.csv'.format(DUMMY_DATA_PATH)

In [20]:
# Read the main application table first, then the bank table.
main_df, bank_df = pd.read_csv(DUMMY_MAIN_DATA), pd.read_csv(DUMMY_BANK_DATA)

In [21]:
# Preview the first five rows of the freshly loaded main table.
main_df.head(n=5)

Unnamed: 0,address,ads_cmpid,ads_creative,ads_matchtype,ads_network,ads_targetid,amount,application_id,birthdate,browser,...,registered_office_city,registered_office_state,role_in_firm,role_on_application,seo_city,state,utm_medium,utm_source,utm_term,year_of_incorporation
0,,,,,,,,1024.0,,Opera,...,,,,0,,,,,,
1,"At. Pandharbodi, Gondiya",,,,,,300000.0,716.0,23/10/1982,Chrome,...,Gondia,MAHARASHTRA,1.0,4,,MAHARASHTRA,,,,2014.0
2,"Near Heena Manjeel, Serve No. 53,, kale padal ...",,,,,,200000.0,1031.0,08/09/1987,Chrome,...,PUNE,MAHARASHTRA,1.0,4,,MAHARASHTRA,,,,2016.0
3,Mangasule Gali,977169039.0,231222500000.0,e,{google_search},kwd-11424241,300000.0,2056.0,02/04/1982,Chrome,...,Pune,MAHARASHTRA,1.0,4,,KARNATAKA,ppc,adwords,business loans,2014.0
4,"Near Pratiksha Building,, 1, Natraj Niwas, Ata...",,,,,,500000.0,9047.0,13/04/1979,Chrome,...,THANE,MAHARASHTRA,1.0,4,,MAHARASHTRA,Banner,Facebook,Carousel-Ad,2014.0


In [22]:
# Columns removed from the main table, each with the reason it is dropped.
to_remove_cols = [
    'address',       # textual, and hence not helpful
    'ads_cmpid',     # unique key
    'ads_creative',  # unique key
    'ads_targetid',  # unique key
    'country',       # all values are India (per the original note), so no point keeping it
    'created_date',  # noted as having no correlation with the model
    'firm_name',     # potentially unique per row, and a string, so not much can be done
    'firm_pan',      # unique per company. TODO: its pattern could be used to extract features
    'pan',           # unique per individual. TODO: its pattern could be used to extract features
]

# Columns deliberately kept (inventory, for reference):
#   ads_matchtype, ads_network, amount, application_id, birthdate, browser,
#   campaign_city, city, company_size, email, firm_type, gender, industry, ip,
#   last_fy_profit, latitude, loan_created, longitude, name, network, pincode,
#   platform, registered_office_city, registered_office_state, role_in_firm,
#   role_on_application, seo_city, state, utm_medium, utm_source, utm_term,
#   year_of_incorporation

# NOTE: this overwrites `main_df`, so re-running the cell without reloading the
# data raises KeyError (the columns are already gone).
main_df = main_df.drop(to_remove_cols, axis=1)
main_df.head()

Unnamed: 0,ads_matchtype,ads_network,amount,application_id,birthdate,browser,campaign_city,city,company_size,country,...,registered_office_city,registered_office_state,role_in_firm,role_on_application,seo_city,state,utm_medium,utm_source,utm_term,year_of_incorporation
0,,,,1024.0,,Opera,,,,,...,,,,0,,,,,,
1,,,300000.0,716.0,23/10/1982,Chrome,,,,India,...,Gondia,MAHARASHTRA,1.0,4,,MAHARASHTRA,,,,2014.0
2,,,200000.0,1031.0,08/09/1987,Chrome,,,,India,...,PUNE,MAHARASHTRA,1.0,4,,MAHARASHTRA,,,,2016.0
3,e,{google_search},300000.0,2056.0,02/04/1982,Chrome,Pune,Mumbai,5.0,India,...,Pune,MAHARASHTRA,1.0,4,,KARNATAKA,ppc,adwords,business loans,2014.0
4,,,500000.0,9047.0,13/04/1979,Chrome,,Mumbai,,India,...,THANE,MAHARASHTRA,1.0,4,,MAHARASHTRA,Banner,Facebook,Carousel-Ad,2014.0


In [24]:
'''
- make 0 amounts as NaNs
- birthdate -> age
- browser: some really low counts
- campaign_city: some really low counts
- city: some really low counts
- email: publicly hosted email domain or personal email domain
- firm_type: is skewed (need to figure things out..)

'''
# Number of rows in the main table after dropping the unused columns.
print(main_df.shape[0])

432


In [35]:
# Frequency of each gender value; value_counts() leaves NaN entries out by default.
main_df.loc[:, 'gender'].value_counts()

Male      339
Female     36
Name: gender, dtype: int64

In [None]:
# Restrict both tables to the application ids that appear in each of them.
appln_id = pd.Series(
    list(set(main_df['application_id']) & set(bank_df['appl_id']))
)
main_df = main_df[main_df['application_id'].isin(appln_id)]
bank_df = bank_df[bank_df['appl_id'].isin(appln_id)]

In [None]:
def _aggregate_columns(df, application_id_col):
    # group by application id and merge all rows into lists
    new_df = pd.DataFrame()
    g = bank_df.groupby(application_id_col)
    for k in list(df):
        if k == application_id_col:
            continue
        new_df = pd.concat([new_df, g[k].apply(list)], axis=1)
    return new_df.reset_index()


def setup_aggregations(df, application_id_col):
    '''
    Reduce the bank data to a single row per application.

    Steps:
    - drop the four total_* columns (the original notes prefer the averages,
      which are described as better and normalized)
    - gather every remaining column into per-application lists via
      `_aggregate_columns`
    - reduce each list according to the column name:
        * name contains 'average' -> np.average (average the averages)
        * name contains 'max'     -> np.max (max of all the maxs)
        * name contains 'min'     -> np.min (min of all the mins)
        * anything else           -> np.sum (per the original notes this covers
          high_credit_cp, inward returns and outward_returns)
      The checks run in that order, so the first matching keyword wins.
    - the application id column is passed through unchanged

    Raises KeyError if any of the total_* columns is missing from `df`.
    '''
    total_cols = ['total_bi_inflow', 'total_bi_outflow', 'total_inflow', 'total_outflow']
    listed = _aggregate_columns(df.drop(columns=total_cols), application_id_col)

    # Ordered (keyword, reducer) pairs; the first keyword found in a name is used.
    keyword_reducers = (
        ('average', np.average),
        ('max', np.max),
        ('min', np.min),
    )

    def _pick_reducer(column_name):
        for keyword, reducer in keyword_reducers:
            if keyword in column_name:
                return reducer
        return np.sum

    reduced = pd.DataFrame()
    for column_name in list(listed):
        if column_name == application_id_col:
            piece = listed[column_name]
        else:
            piece = listed[column_name].apply(_pick_reducer(column_name))
        reduced = pd.concat([reduced, piece], axis=1)
    return reduced

# Collapse the bank table to one row per application. This overwrites `bank_df`;
# running it again on the aggregated frame raises KeyError because the total_*
# columns have already been dropped.
bank_df = setup_aggregations(df=bank_df, application_id_col='appl_id')