In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [2]:
# Load the two source tables from the Kaggle input directory.
# Dataset pages:
#   https://www.kaggle.com/yagnasrip2/dataset
#   https://www.kaggle.com/mahbubrob/usa-cities
training_csv_path = '/kaggle/input/dataset/Training Data.csv'
usa_cities_csv_path = '/kaggle/input/usa-cities/usa_cities.csv'

# `ds` holds the records to be transformed; `usa_ds` supplies US place names.
ds = pd.read_csv(training_csv_path)
usa_ds = pd.read_csv(usa_cities_csv_path)

In [3]:
# Pick the replacement names: the 28 most frequent provinces and the 316 most
# frequent city names in the US table (value_counts orders by frequency).
province_counts = usa_ds['province'].value_counts()
city_name_counts = usa_ds['city_ascii'].value_counts()

us_state_names = province_counts.index[:28].tolist()
us_city_names = city_name_counts.index[:316].tolist()

In [4]:
# Create map of states: every spelling of a state found in `ds` -> a US state name.
# Spellings differ by spaces vs. underscores and by a bracketed numeric suffix
# such as "[5]"; all of them are reduced to one canonical form first.
_state_suffix = re.compile(r'\[[0-9]+\]')


def _canonical_state(name):
    """Return `name` with words joined by '_' and any '[<digits>]' removed."""
    return _state_suffix.sub('', '_'.join(name.split()))


states = ds.state.value_counts().index.to_list()

# One entry per distinct state, regardless of how it was spelled in the data.
states_set = {_canonical_state(state) for state in states}

# zip() would silently drop the surplus states, leaving them un-replaced later.
if len(states_set) > len(us_state_names):
    raise ValueError(
        f"{len(states_set)} distinct states but only "
        f"{len(us_state_names)} replacement names"
    )

# sorted() pins the pairing: iterating a set of strings directly can yield a
# different order (and therefore a different mapping) on every kernel restart.
state_map = dict(zip(sorted(states_set), us_state_names))

# Keep the space-separated spelling of every canonical name available as a key.
for canonical in list(state_map):
    state_map.setdefault(canonical.replace('_', ' '), state_map[canonical])

# Finally make sure each raw spelling present in the data resolves to the same
# replacement as its canonical form (covers suffixed variants like "...[5]").
for state in states:
    state_map.setdefault(state, state_map[_canonical_state(state)])

In [5]:
# Create map of cities: every spelling of a city found in `ds` -> a US city name.
cities = ds.city.value_counts().index.to_list()

# Canonical form: words joined by '_' and any '[<digits>]' suffix removed.
_city_suffix = re.compile(r'\[[0-9]+\]')
cities_set = {_city_suffix.sub("", '_'.join(city.split())) for city in cities}

# zip() would silently drop the surplus cities and the alias loop below would
# then fail with a bare KeyError; fail early with a readable message instead.
if len(cities_set) > len(us_city_names):
    raise ValueError(
        f"{len(cities_set)} distinct cities but only "
        f"{len(us_city_names)} replacement names"
    )

# sorted() pins the pairing: iterating a set of strings directly can yield a
# different order (and therefore a different mapping) on every kernel restart.
city_map = dict(zip(sorted(cities_set), us_city_names))

# Raw spellings (with spaces and/or a bracket suffix) share the replacement of
# their canonical form.
for city in cities:
    if city not in city_map:
        city_map[city] = city_map[_city_suffix.sub("", '_'.join(city.split()))]

In [6]:
# Work on an independent copy so the loaded frame `ds` stays untouched
# (DataFrame.copy() is a deep copy by default).
ds_copy = ds.copy()

In [7]:
# Replace state and city names in the copied dataset.
#
# Each cell is looked up in the mapping exactly once, so a replacement name
# that happens to equal another key is never translated a second time (a
# sequence of one-key replace() calls can chain like that). The result is
# assigned back to the column explicitly: calling replace(..., inplace=True)
# on `ds_copy.state` acts on an intermediate Series, which pandas'
# Copy-on-Write mode does not propagate back to the frame.
# Values without an entry in the mapping are left unchanged.
ds_copy['state'] = ds_copy['state'].map(lambda name: state_map.get(name, name))
ds_copy['city'] = ds_copy['city'].map(lambda name: city_map.get(name, name))

In [8]:
# Rename columns of the working copy (old name -> new name).
column_renames = dict(
    married="has_child",
    income="monthly_income",
    car_ownership="owns_bike",
    house_ownership="residential_status",
    current_job_years="years_employed",
    current_house_years="age_house",
    city="home_city",
    state="home_state",
    risk_flag="ineligible_for_funds",
)
ds_copy.rename(columns=column_renames, inplace=True)

In [9]:
# Recode values so the data no longer matches the source table directly.

# "married" -> "yes"; every other value -> "no".
ds_copy['has_child'] = np.where(ds_copy['has_child'] == "married", "yes", "no")

# Scale the income figure down by a factor of 12.
ds_copy['monthly_income'] = ds_copy['monthly_income'] / 12

# "rented" -> "on_rent", "owned" -> "own_house", anything else -> "no_house".
ds_copy['residential_status'] = np.select(
    [ds_copy['residential_status'] == "rented", ds_copy['residential_status'] == "owned"],
    ["on_rent", "own_house"],
    default="no_house",
)

In [10]:
# Make some multi-word names harder to work with by joining their words with
# underscores: states on rows whose Id is a multiple of 5, cities on rows whose
# Id is a multiple of 37. All other rows keep their value as is.
ds_copy['home_state'] = [
    '_'.join(state.split()) if row_id % 5 == 0 else state
    for row_id, state in zip(ds_copy['Id'], ds_copy['home_state'])
]

ds_copy['home_city'] = [
    '_'.join(city.split()) if row_id % 37 == 0 else city
    for row_id, city in zip(ds_copy['Id'], ds_copy['home_city'])
]

In [11]:
# Append a random "[<number>]" suffix to a share of the values (similar noise
# was present in the source data), so participants have to clean the columns.
def _append_bracket_suffixes(frame, column_position):
    """Append '[k]' (k random in 0..99) to randomly chosen rows of one column.

    Row positions are drawn with replacement, 7% of the row count in total, so
    a row picked more than once receives more than one suffix.
    """
    row_count = frame.shape[0]
    picked_rows = np.random.randint(low=0, high=row_count, size=int(0.07 * row_count))
    for row in picked_rows:
        number = np.random.randint(low=0, high=100)
        frame.iat[row, column_position] += f"[{number}]"


# Positional columns 8 and 9 -- presumably home_city and home_state; confirm
# against the frame's column order.
_append_bracket_suffixes(ds_copy, 8)
_append_bracket_suffixes(ds_copy, 9)

In [12]:
# Perturb the numeric columns.
row_count = ds_copy.shape[0]

# Income: uniform noise in [-50, 200) added to every row.
income_noise = np.random.uniform(low=-50, high=200, size=(row_count,))
ds_copy['monthly_income'] = ds_copy['monthly_income'] + income_noise

# Experience: Gaussian noise whose mean and spread are themselves drawn at
# random (mean from a standard normal, spread from [0, 1)); the absolute value
# keeps the result non-negative.
noise_mean = np.random.randn()
noise_spread = np.random.rand()
experience_noise = np.random.normal(loc=noise_mean, scale=noise_spread, size=(row_count,))
ds_copy['experience'] = (ds_copy['experience'] + experience_noise).abs()

In [13]:
# Separate the features (X) from the target label (y).
target_column = 'ineligible_for_funds'
y = ds_copy[target_column]
X = ds_copy.drop(columns=target_column)

In [14]:
# Use Stratified K-Fold to split the data into train and test sets: the first
# of 4 stratified folds becomes the test set, the remaining rows the train set.
from sklearn.model_selection import StratifiedKFold

train_idx, test_idx = next(StratifiedKFold(n_splits=4).split(X, y))

# split() yields positional row indices, so select with .iloc; .loc would look
# them up as index labels and only agrees while the index is 0..n-1.
# .copy() gives each part its own data before the label column is attached.
train = X.iloc[train_idx].copy()
train['ineligible_for_funds'] = y.iloc[train_idx]

test = X.iloc[test_idx].copy()
test['ineligible_for_funds'] = y.iloc[test_idx]

In [15]:
# Create null values in the train set.
# Each entry is (fraction of the row count to draw, positional columns to blank).
# Row positions are drawn with replacement, so the number of distinct rows
# affected can be smaller than the number of draws.
train_rows = train.shape[0]
null_plan = (
    (0.09856, (1,)),
    (0.09601, (5, 11)),  # both columns are blanked on the same rows
    (0.1001, (6,)),
)

for fraction, column_positions in null_plan:
    picked_rows = np.random.randint(low=0, high=train_rows, size=int(fraction * train_rows))
    for row in picked_rows:
        for column in column_positions:
            train.iat[row, column] = None

In [16]:
# Renumber the first column (position 0) so that train rows are numbered
# 1..len(train) and test rows continue from len(train) + 1.
# One vectorised assignment per frame replaces a Python-level write per row.
train_rows = train.shape[0]
test_rows = test.shape[0]

train.iloc[:, 0] = np.arange(1, train_rows + 1)
test.iloc[:, 0] = np.arange(train_rows + 1, train_rows + test_rows + 1)

In [17]:
# Split the test set into the public features and the private labels
# (the labels keep the Id column so they can be matched to the features).
label_column = 'ineligible_for_funds'
test_X = test.drop(columns=label_column)
test_y = test[['Id', label_column]].copy()

In [18]:
# Write the three output files to the working directory, without the index.
exports = (
    (train, 'train_dataset.csv'),
    (test_X, 'test_dataset.csv'),
    (test_y, 'test_labels_PRIVATE.csv'),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)