In [1]:
import numpy as np
import pandas as pd

# Location of the dummy dataset and of the two CSV extracts inside it.
DUMMY_DATA_PATH = 'dataset/dummy/'
DUMMY_BANK_DATA = f'{DUMMY_DATA_PATH}BSA.csv'    # bank data (loaded into bank_df)
DUMMY_MAIN_DATA = f'{DUMMY_DATA_PATH}data.csv'   # main application data (loaded into main_df)

In [2]:
# Read both raw CSV extracts; every later cell works on these two frames.
main_df = pd.read_csv(filepath_or_buffer=DUMMY_MAIN_DATA)
bank_df = pd.read_csv(filepath_or_buffer=DUMMY_BANK_DATA)

In [3]:
# Columns removed from `main_df`, each with the reason recorded by the author.
# Entries that are commented out are the columns that stay in the frame; they are
# kept in the list so the whole column inventory can be read in one place.
# NOTE: DataFrame.drop raises KeyError for labels that are not present, so
# re-running this cell on the already-trimmed `main_df` fails -- reload the CSV first.
to_remove_cols = [
     'address', # free text, not considered helpful
     'ads_cmpid', # unique key
     'ads_creative', # unique key
#      'ads_matchtype',
#      'ads_network',
     'ads_targetid', # unique key
#      'amount',
#      'application_id',
     'birthdate', # firm age is used instead (noted as a better fit for the model)
#      'browser',
#      'campaign_city',
#      'city',
#      'company_size',
     'country', # every row is India at the moment, so it carries no information
     'created_date', # noted as having no correlation with the target
#      'email',
     'firm_name', # potentially unique per row and a plain string, so little to extract
     'firm_pan', # unique per company. TODO: the PAN follows a pattern that could be mined for features
#      'firm_type',
#      'gender',
#      'industry',
     'ip', # TODO: check whether anything useful can be derived from IPs
#      'last_fy_profit',
     'latitude', # usefulness of raw coordinates is unclear; city and state are kept instead
#      'loan_created', # the TARGET column; it is skewed, so even a naive model reaches ~96% accuracy
     'longitude', # see the note on 'latitude'
     'name', # see the note on 'firm_name'
#      'network',
     'pan', # unique per individual. TODO: the PAN follows a pattern that could be mined for features
     'pincode', # no reason recorded -- presumably redundant with city/state; TODO confirm
#      'platform',
#      'registered_office_city',
#      'registered_office_state',
#      'role_in_firm',
#      'role_on_application',
     'seo_city', # only one distinct value
#      'state',
#      'utm_medium',
#      'utm_source',
     'utm_term', # no clear use for this field yet
#      'year_of_incorporation',
]

# Rebind `main_df` to the trimmed frame and preview the first rows.
main_df = main_df.drop(columns=to_remove_cols)
main_df.head()

Unnamed: 0,ads_matchtype,ads_network,amount,application_id,birthdate,browser,campaign_city,city,company_size,email,...,network,platform,registered_office_city,registered_office_state,role_in_firm,role_on_application,state,utm_medium,utm_source,year_of_incorporation
0,,,,1024.0,,Opera,,,,Kaif1779@gmail.com,...,Opera Software Americas LLC,mobile:Pike v8.0 release 461,,,,0,,,,
1,,,300000.0,716.0,23/10/1982,Chrome,,,,vihanmarketing36@gmail.com,...,Idea Cellular Limited,Win32,Gondia,MAHARASHTRA,1.0,4,MAHARASHTRA,,,2014.0
2,,,200000.0,1031.0,08/09/1987,Chrome,,,,faijiyatoursandtravels@gmail.com,...,Idea Cellular Limited,mobile:Linux armv8l,PUNE,MAHARASHTRA,1.0,4,MAHARASHTRA,,,2016.0
3,e,{google_search},300000.0,2056.0,02/04/1982,Chrome,Pune,Mumbai,5.0,sagarnk2008@gmail.com,...,Reliance Jio Infocomm Limited,mobile:Linux aarch64,Pune,MAHARASHTRA,1.0,4,KARNATAKA,ppc,adwords,2014.0
4,,,500000.0,9047.0,13/04/1979,Chrome,,Mumbai,,natrajmoily@gmail.com,...,Syscon Infoway Pvt. Ltd.,mobile:Linux armv8l,THANE,MAHARASHTRA,1.0,4,MAHARASHTRA,Banner,Facebook,2014.0


In [11]:
'''
Things to clean in main dataframe:
- [x] amount: make 0 amounts as NaNs
- [x] birthdate: calculate age of the person (do we need this if we have the age of firm?) --> removed col: using firm age
- [ ] browser: some really low counts
- [ ] campaign_city: some really low counts
- [ ] city: some really low counts
- [ ] email: publicly hosted email domain or personal email domain
- [ ] firm_type: is skewed (need to figure things out..)
- [ ] last_fy_profit: convert 0 to NaNs
- [ ] platform: combine all the 'mobile:Linux'?
- [ ] registered_office_city: combine same values (cases are different hence are treated as separate values)
- [ ] role_in_firm: categorical; so don't use the numbers as is..
- [ ] role_on_application: categorical; so don't use the numbers as is..
- [ ] year_of_incorporation: -> compute age of firm
'''
# Row count and column count of the trimmed frame.
print(len(main_df), len(list(main_df)))

# A requested amount of 0 is treated as "not provided": mark it as missing.
# `np.nan` is the canonical NumPy constant; the upper-case `np.NAN` alias is
# not available on NumPy 2.x, so it is not used here.
main_df.loc[main_df['amount'] == 0, 'amount'] = np.nan

432 25


Unnamed: 0,ads_matchtype,ads_network,amount,application_id,birthdate,browser,campaign_city,city,company_size,email,...,network,platform,registered_office_city,registered_office_state,role_in_firm,role_on_application,state,utm_medium,utm_source,year_of_incorporation


In [None]:
# Number of applications per incorporation year, listed chronologically.
# value_counts() orders its result by frequency on its own, so sorting the
# values beforehand does not control the display order; sorting the resulting
# index (the years) does.
main_df['year_of_incorporation'].value_counts().sort_index()

In [None]:
# Restrict both frames to the application ids that occur in each of them.
ids_in_main = set(main_df['application_id'])
ids_in_bank = set(bank_df['application_id'])
appln_id = pd.Series(list(ids_in_main & ids_in_bank))

keep_main_rows = main_df['application_id'].isin(appln_id)
keep_bank_rows = bank_df['application_id'].isin(appln_id)
main_df = main_df.loc[keep_main_rows]
bank_df = bank_df.loc[keep_bank_rows]

In [None]:
# Show the column names of the bank frame.
bank_df.columns.tolist()

In [None]:
def _aggregate_columns(df, application_id_col):
    # group by application id and merge all rows into lists
    new_df = pd.DataFrame()
    g = bank_df.groupby(application_id_col)
    for k in list(df):
        if k == application_id_col:
            continue
        new_df = pd.concat([new_df, g[k].apply(list)], axis=1)
    return new_df.reset_index()


def setup_aggregations(df, application_id_col):
    '''
    Reduce the bank data to a single row per application.

    Rows sharing an application id are first gathered into per-column lists
    (via `_aggregate_columns`); each list is then reduced by a function chosen
    from the column name, testing the substrings in this order:
    - 'average' -> average of the values
    - 'max'     -> largest value
    - 'min'     -> smallest value
    - otherwise -> sum (per the original notes this covers high_credit_cp,
                   inward returns and outward returns)
    The four `total_*` columns are dropped first (the original notes prefer
    the averages because they are normalized).
    The application id column is passed through unchanged.
    '''
    total_cols = ['total_business_inflow', 'total_business_outflow', 'total_inflow', 'total_outflow']
    listed = _aggregate_columns(df.drop(columns=total_cols), application_id_col)

    def _pick_reducer(col_name):
        # The first matching substring decides; anything unmatched is summed.
        for marker, reducer in (('average', np.average), ('max', np.max), ('min', np.min)):
            if marker in col_name:
                return reducer
        return np.sum

    # Start from an empty frame, exactly like an accumulator would, and join
    # all reduced columns side by side in one go.
    pieces = [pd.DataFrame()]
    for col_name in list(listed):
        values = listed[col_name]
        if col_name != application_id_col:
            values = values.apply(_pick_reducer(col_name))
        pieces.append(values)
    return pd.concat(pieces, axis=1)

# Collapse the bank data to one row per application. This cell is one-shot:
# the aggregated frame no longer has the `total_*` columns that
# setup_aggregations drops, so running it a second time raises KeyError.
bank_df = setup_aggregations(df=bank_df, application_id_col='application_id')

In [None]:
bank_df.head()