In this approach, we try to combine our first two approaches with past kernels made publicly available at https://zindi.africa/hackathons/south-african-covid-19-vulnerability-map/discussions.



# Xgboost

### Libraries versions

Please use these exact versions when rerunning the code for score reproducibility. A different version might yield a slightly different score.

pandas : 1.0.3
seaborn : 0.10.0
matplotlib : 3.2.1
numpy : 1.18.2
scikit-learn : 0.22.2.post1
lightgbm : 2.2.3
xgboost : 0.7.post3

In [None]:
## Reading libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
import lightgbm as lgb
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import warnings

In [None]:
## defining the metric
def metric(x,y):
  """Return the root mean squared error between ``x`` and ``y``.

  Both arguments are forwarded, in the same order, to
  ``sklearn.metrics.mean_squared_error``; the square root of that value
  is returned.
  """
  mse = mean_squared_error(x,y)
  return np.sqrt(mse)

In [None]:
# Switch to the project folder; the relative 'Data/...' and 'Submissions/...'
# paths used by the following cells are resolved against it.
import os
os.chdir("/content/drive/MyDrive/Carte de vulnérabilité du COVID-19 en Afrique du Sud by Nimba Hub 3,000,000 GNF/") #Change the path to the working directory

In [None]:
!ls

In [None]:
# Load the four competition files (paths are relative to the current
# working directory).
train = pd.read_csv('Data/Train_maskedv2.csv')
test = pd.read_csv('Data/Test_maskedv2.csv')
vdefinition = pd.read_csv('Data/variable_descriptions_v2.csv')
submission = pd.read_csv('Data/samplesubmissionv2.csv')

###### KMeans and some data cleaning.

In [None]:
### Applying Kmeans to almost all features and generating a 'cluster' feature.
# Columns excluded from the clustering input, in addition to the identifier
# ("ward") and the target.
# NOTE(review): 'dw_11' is excluded here while the later drop lists use
# 'dw_13' -- confirm which column was intended.
to_drop=['dw_11', 'dw_12','lan_13']
# Use the explicit ``columns=`` keyword: passing ``axis`` positionally to
# DataFrame.drop is deprecated and rejected by pandas >= 2.0. ``drop``
# already returns a new frame, so no separate copy is needed.
train_copy=train.drop(columns=["ward","target_pct_vunerable"]+to_drop)
columns=train_copy.columns
# Fit on the training wards only, then label both sets with the same model.
km=KMeans(7,random_state=42).fit(train_copy)
train["cluster"]=km.predict(train[columns])
test["cluster"]=km.predict(test[columns])

In [None]:
## Dropping wards in the training data that have more than 17500 households + 1 outlier.
# One boolean mask: keep rows with at most 17500 households whose index
# label is not 1094 (the outlier row).
keep_rows = (train['total_households'] <= 17500) & (train.index != 1094)
train = train[keep_rows]

In [None]:
## Binned feature on total_households
# 1 when 2500 < total_households <= 5000, else 0 (despite the column name,
# wards with 2500 households or fewer get 0). The comparison is vectorised
# instead of calling a Python lambda once per row.
train['total_householdslessthan5000'] = ((train['total_households'] > 2500) & (train['total_households'] <= 5000)).astype('int64')
test['total_householdslessthan5000'] = ((test['total_households'] > 2500) & (test['total_households'] <= 5000)).astype('int64')

###### A bunch of feature interactions

In [None]:
# Average number of individuals per household, computed on the raw counts.
train['Individualsperhouse'] = train['total_individuals'].div(train['total_households'])
test['Individualsperhouse'] = test['total_individuals'].div(test['total_households'])

In [None]:
# Two complementary sums over the car_*, stv_* and psa_* indicator columns
# (training set).
train['Luxury_01'] = train['car_01'].add(train['stv_00']).add(train['psa_01'])
train['Luxury_00'] = train['car_00'].add(train['stv_01']).add(train['psa_00'])

In [None]:
# Same two sums as for the training set, applied to the test set.
test['Luxury_01'] = test['car_01'].add(test['stv_00']).add(test['psa_01'])
test['Luxury_00'] = test['car_00'].add(test['stv_01']).add(test['psa_00'])

In [None]:
# Sum of the psa_01, psa_02 and psa_03 columns.
train['NoSchoolAttendace'] = train['psa_01'].add(train['psa_02']).add(train['psa_03'])
test['NoSchoolAttendace'] = test['psa_01'].add(test['psa_02']).add(test['psa_03'])

In [None]:
# Sum of the dw_02, dw_07 and dw_06 dwelling columns.
train['InformalDwellings'] = train['dw_02'].add(train['dw_07']).add(train['dw_06'])
test['InformalDwellings'] = test['dw_02'].add(test['dw_07']).add(test['dw_06'])

In [None]:
# Absolute gap between the dw_01 and dw_08 columns.
train['TraditionalVSInformalDwellings'] = (train['dw_01'] - train['dw_08']).abs()
test['TraditionalVSInformalDwellings'] = (test['dw_01'] - test['dw_08']).abs()

In [None]:
# Divide the two size columns by their column maximum.
# NOTE(review): train and test are each divided by their OWN maximum, so the
# same raw count maps to different scaled values in the two sets -- confirm
# this is intended.
train['total_households'] = train['total_households'].div(train['total_households'].max())
train['total_individuals'] = train['total_individuals'].div(train['total_individuals'].max())

test['total_households'] = test['total_households'].div(test['total_households'].max())
test['total_individuals'] = test['total_individuals'].div(test['total_individuals'].max())

In [None]:
# Absolute gap between the lan_06 and lan_07 language columns.
train['SAOldPeopleSesothoVSSetswana'] = train['lan_06'].sub(train['lan_07']).abs()

test['SAOldPeopleSesothoVSSetswana'] = test['lan_06'].sub(test['lan_07']).abs()

###### Target encoding + PCA 

In [None]:
# Mean target per cluster, computed on the training set and attached to both
# sets as 'target_pct_vunerable_mean'.
# The target column is selected BEFORE aggregating: averaging every column
# first is wasted work and fails on non-numeric columns such as 'ward' in
# recent pandas versions.
target_mean = train.groupby('cluster')[['target_pct_vunerable']].mean()
target_mean = target_mean.rename(columns={'target_pct_vunerable': 'target_pct_vunerable_mean'})
train = train.merge(target_mean,how="left",on='cluster')
test = test.merge(target_mean,how="left",on='cluster')


In [None]:
# One-component PCA fitted on the training columns matched by 'lan_.*'.
pca = PCA(n_components=1, random_state=42)
pg_features = train.filter(regex='lan_.*')
train_pca = pca.fit_transform(pg_features)
# Only one component was requested, so flattening yields that single column.
train['pca_lan_0'] = train_pca.ravel()

In [None]:
# Project the test columns matched by 'lan_.*' with the PCA fitted on train.
pg_features = test.filter(regex='lan_.*')
test_pca = pca.transform(pg_features)
test['pca_lan_0'] = test_pca.ravel()

###### Training.

In [None]:
# Keep the target aside; it is removed from the feature frame in the next cell.
target = train['target_pct_vunerable']

In [None]:
# Columns removed from both sets before training; the target is additionally
# removed from the training frame.
unused_columns = ['psa_00','psa_02','psa_03','psa_04','psa_01','lgt_00','stv_01','car_01','lln_01','ward','dw_12','dw_13','lan_13']
train = train.drop(columns=unused_columns + ['target_pct_vunerable'])
test = test.drop(columns=unused_columns)

###### A 4-fold averaging solution using XGBoost.

In [None]:
# 4-fold cross-validation without shuffling.
kf = KFold(n_splits=4,shuffle=False)
# NOTE: despite its name, ``lgbm`` is an XGBRegressor; the same estimator
# object is refitted in every fold.
lgbm = XGBRegressor(n_estimators=50000,random_state=42,max_depth=5,learning_rate=0.03888)
scores = []  # validation RMSE of each fold
pred_test = np.zeros(len(test))  # running SUM of the per-fold test predictions
for (train_index,test_index) in kf.split(train,target):
  X_train,X_test = train.iloc[train_index],train.iloc[test_index]
  y_train,y_test = target.iloc[train_index],target.iloc[test_index]
  # The held-out fold doubles as the early-stopping evaluation set.
  lgbm.fit(X_train,y_train,early_stopping_rounds=500,eval_set=[(X_test,y_test)],eval_metric='rmse')
  scores.append(metric(lgbm.predict(X_test),y_test))
  # Accumulated here; divided by the number of folds when the submission is built.
  pred_test+=lgbm.predict(test)

In [None]:
# Mean validation RMSE over the folds.
np.mean(scores)

###### Plotting feature importances.

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

# ``lgbm`` is refitted in every fold of the cross-validation loop, so
# ``feature_importances_`` reflects the model of the LAST fold only -- it is
# not an average over folds. The plot title says so explicitly.
feature_imp = pd.DataFrame({'Value': lgbm.feature_importances_, 'Feature': train.columns})
# Same row order as sorting (value, feature) pairs in ascending order.
feature_imp = feature_imp.sort_values(['Value', 'Feature']).reset_index(drop=True)

plt.figure(figsize=(30, 20))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('XGB Features (last fold model)')
plt.tight_layout()
plt.show()

###### And finally the submission.

In [None]:
# ``pred_test`` holds the SUM of the per-fold test predictions; ``scores``
# has exactly one entry per fold, so its length is the right divisor even
# if the number of splits is changed (previously a hard-coded 4).
submission['target_pct_vunerable'] = np.absolute(pred_test/len(scores))
submission.to_csv('Submissions/first.csv',index=False)
# Second file: the same predictions limited to the [0, 90] range.
submission['target_pct_vunerable'] = np.clip(submission['target_pct_vunerable'], a_min=0, a_max=90)
submission.to_csv('Submissions/first_cliped.csv',index=False)

# Model Combination Approach

## Problem Statement


Can we infer important COVID-19 public health risk factors from outdated data? In many countries census and other survey data may be incomplete or out of date. This challenge is to develop a proof-of-concept for how machine learning can help governments more accurately map COVID-19 risk in 2020 using old data, without requiring a new costly, risky, and time-consuming on-the-ground survey.

The 2011 census gives us valuable information for determining who might be most vulnerable to COVID-19 in South Africa. However, the data is nearly 10 years old, and we expect that some key indicators will have changed in that time. Building an up-to-date map showing where the most vulnerable are located will be a key step in responding to the disease. A mapping effort like this requires bringing together many different inputs and tools. For this competition, we’re starting small. Can we infer important risk factors from more readily available data?

The task is to predict the percentage of households that fall into a particularly vulnerable bracket - large households who must leave their homes to fetch water - using 2011 South African census data. Solving this challenge will show that with machine learning it is possible to use easy-to-measure stats to identify areas most at risk even in years when census data is not collected.


## Installing libraries

In [None]:
# Installing the necessary libraries
!pip install catboost
!pip install rgf-python

## Importing libraries

In [7]:
!pip install -q xgboost
!pip install -q lightgbm



In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import requests
from io import StringIO 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, NuSVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, StackingRegressor,HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.cluster import KMeans
import warnings
from rgf.sklearn import RGFRegressor
warnings.filterwarnings('ignore')

## Loading data

In [9]:
# Links to the train/test CSV files shared via Google Drive.
# NOTE: ``train`` and ``test`` hold URL strings here; they are rebound to
# DataFrames once the files are downloaded below.
train = 'https://drive.google.com/file/d/1_wLi9i-pUk6Kaizjb5i6-Pd0Vi7L1d-E/view?usp=sharing'
test = 'https://drive.google.com/file/d/1OeT53v7tZLnB71n1j4r6KFbtrouz38ej/view?usp=sharing'

# Read a csv file shared via Google Drive and return it as a dataframe.
#
def read_csv(url, timeout=60):
    """Download a CSV shared through a Google Drive link and parse it.

    Parameters
    ----------
    url : str
        Sharing link of the form ``.../file/d/<file id>/view?...``; the file
        id is taken as the second-to-last ``/``-separated component.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status; without this
        check the error page would be handed to the CSV parser.
    """
    file_id = url.split('/')[-2]
    download_url = 'https://drive.google.com/uc?export=download&id=' + file_id
    response = requests.get(download_url, timeout=timeout)
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))

# Creating the training and testing dataframes (this replaces the URL
# strings previously stored under the same names).
train = read_csv(train)
test = read_csv(test)

## Combining the training and test data


In [10]:
# Combining test and train for easy feature engineering.
# The target is saved first because the inner column alignment below keeps
# only the columns present in both frames.
target = train.target_pct_vunerable

# Marker column used later to split the combined frame again
# (0 = train rows, 1 = test rows).
train['separator'] = 0
test['separator'] = 1

train, test = train.align(test, join = 'inner', axis = 1)

# NOTE: the original row labels are kept, so ``comb`` may contain repeated
# index values.
comb = pd.concat([train, test])


## Feature engineering

In [11]:
# New interaction features built from the columns that ranked highest in the
# model's feature-importance graph. There is no theory behind them -- they
# were found by trial and error -- but they improved the score noticeably;
# more interactions could be explored with more computational power.
# The creation order is kept because it defines the final column order.

comb['household_size'] = comb['total_individuals'].div(comb['total_households'])
comb['gf_1'] = comb['dw_01'].mul(comb['psa_01'])
comb['gf_2'] = comb['gf_1'].mul(comb['psa_00'])
comb['gf_3'] = comb['gf_1'].mul(comb['psa_02'])
comb['gf_4'] = comb['gf_1'].mul(comb['psa_03'])
comb['gf_5'] = comb['gf_1'].mul(comb['gf_2'])
comb['gf_6'] = comb['gf_5'].mul(comb['gf_2'])
comb['dw_01_2'] = comb['dw_01'].pow(2)
comb['psa_00_2'] = comb['psa_00'].pow(2)

# Row-wise sum and mean over two complementary groups of indicator columns.
luxury_stuff = ['psa_01','car_01','stv_00']
not_luxury_stuff = ['psa_00','car_00','stv_01']
luxury_block = comb[luxury_stuff]
not_luxury_block = comb[not_luxury_stuff]
comb['luxury_stuff'] = luxury_block.sum(axis=1)
comb['not_luxury_stuff'] = not_luxury_block.sum(axis=1)
comb['a_luxury_stuff'] = luxury_block.mean(axis=1)
comb['a_not_luxury_stuff'] = not_luxury_block.mean(axis=1)

## Separating train and test datasets

In [12]:
# Separating the train and test datasets.
# The helper column is removed with a non-inplace ``drop`` so that new
# frames are returned instead of mutating slices of ``comb`` in place (the
# in-place form triggers pandas' SettingWithCopyWarning).
train = comb[comb.separator == 0].drop(columns='separator')
test = comb[comb.separator == 1].drop(columns='separator')

## Splitting training and validation sets

In [13]:
# The dropped columns are those the baseline model's feature importance
# ranked lowest; they only added noise to the model.
# The list is defined once and passed through ``columns=`` for both frames:
# the positional ``axis`` argument of DataFrame.drop is deprecated and
# rejected by pandas >= 2.0.
low_importance_columns = ['ward', 'dw_13', 'dw_12', 'lan_13', 'psa_03']

X = train.drop(columns=low_importance_columns)
y = target.copy()
tes = test.drop(columns=low_importance_columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2020)

## Training different models

In [14]:

# In stacking, the most important thing is model diversification: linear models, SVM, KNN, decision trees and many variations of them.
# The variations are different values of key parameters of each model.
# While we did not have the time to tune parameters of each model, except the meta learner Catboost, educated guesses on
# the parameters were made to have as much variability as possible.
#
# All models in this cell are fitted on X_train/y_train only; X_test/y_test
# are not used here.
# NOTE(review): no kernel is passed to SVR, and scikit-learn documents
# ``degree`` as applying to the 'poly' kernel only, so degree=2 vs degree=5
# may not actually differentiate the two stacks -- confirm.

# First stack: base estimators and their hyper-parameters.
estimators_1 = [
    ('xgb', XGBRegressor(random_state=2020, objective ='reg:squarederror', learning_rate=0.05)),
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(random_state=2020)),
    ('lgb', LGBMRegressor(learning_rate=0.2, random_state=2020)),
    ('svr', SVR(degree=2)),
    ('lasso', Lasso(random_state=2020)),
    ('RGF', RGFRegressor()),
    ('kneiba', KNeighborsRegressor(n_neighbors=4)),
    ('cat', CatBoostRegressor(logging_level='Silent', random_state=2020))
]

# CatBoost is the meta learner; predictions are made for the test features.
predictions_1 = StackingRegressor(estimators=estimators_1, final_estimator=CatBoostRegressor(logging_level='Silent', depth=6, bagging_temperature=5, random_state=2020)).fit(X_train, y_train).predict(tes)

# Second stack: same families with different learning rates / neighbours.
estimators_2 = [
    ('xgb', XGBRegressor(objective ='reg:squarederror', learning_rate=0.2, random_state=2020)),
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(random_state=2020)),
    ('lgb', LGBMRegressor(learning_rate=0.05, random_state=2020)),
    ('svr', SVR(degree=5)),
    ('RGF', RGFRegressor()),
    ('lasso', Lasso(random_state=2020)),
    ('kneiba', KNeighborsRegressor(n_neighbors=6)),
    ('cat', CatBoostRegressor(logging_level='Silent', random_state=2020))
]

predictions_2 = StackingRegressor(estimators=estimators_2, final_estimator=CatBoostRegressor(logging_level='Silent', depth=6, bagging_temperature=5, random_state=2020)).fit(X_train, y_train).predict(tes)

# A single CatBoost model with the meta learner's settings.
predictions_cat_1 = CatBoostRegressor(logging_level='Silent', depth=6, bagging_temperature=5, random_state=2020).fit(X_train, y_train).predict(tes)


# Further averaging, blending and retraining to generalise well.
# The blend weights add up to more than one on purpose; it still works well,
# and these ratios are among the parameters worth tuning.
stack = [0.56*first + 0.51*second for first, second in zip(predictions_1, predictions_2)]
stack_2 = [0.56*stacked + 0.51*single for stacked, single in zip(stack, predictions_cat_1)]

# The blended predictions become the target of models fitted on the test
# features themselves.
X = tes.copy()
y = stack_2
preds_ridge = Ridge(random_state=2020).fit(X, y).predict(X)

# New feature on the test set: wards grouped into 150 clusters, passed to
# CatBoost as a categorical feature so that CatBoost encodes it itself.
X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
preds_cat = CatBoostRegressor(random_state=2020, verbose = False, depth=6, bagging_temperature=5, cat_features=['cluster']).fit(X, y).predict(X)

# Blend of the Ridge and CatBoost predictions, limited to [0, 90]; the
# target is a percentage, so values outside 0-100 are impossible.
final_blend_2 = np.clip([0.2*ridge_pred + 0.8*cat_pred for ridge_pred, cat_pred in zip(preds_ridge, preds_cat)], a_min=0, a_max=90)

# Subtract a constant from the blend as a form of regularization, then clip
# again.
exp = np.clip(final_blend_2 - 0.48, a_min=0, a_max=90)

## Retraining predictions

In [15]:
# Retraining on the test data, using the prediction of the stacked
# regressors as the target.
# The clusters are added as well, but mean-encoded by hand against the
# target because LinearRegression cannot encode categorical variables.
X = tes.copy()

X['cluster'] = KMeans(150, random_state=2020).fit(X).predict(X)
X['target'] = exp
X['encoded'] = X['cluster'].map(X.groupby('cluster')['target'].mean())
y=X.target
# ``columns=`` keyword: positional ``axis`` is rejected by pandas >= 2.0.
X=X.drop(columns=['cluster', 'target'])

# The three blends below use the SAME two models (same data, same seed), so
# each model is fitted once and its predictions are reused instead of being
# refitted for every blend.
cat_fit_preds = CatBoostRegressor(verbose = False, random_state=2020).fit(X,y).predict(X)
linear_fit_preds = LinearRegression().fit(X, y).predict(X)
preds_1 = cat_fit_preds*0.7 + linear_fit_preds*0.3
preds_2 = cat_fit_preds*0.5 + linear_fit_preds*0.5
preds_3 = cat_fit_preds*0.6 + linear_fit_preds*0.4

final = [x*0.3 + y*0.3 + z*0.4 for x, y, z in zip(preds_1, preds_2, preds_3)]

## Further retraining of predictions

In [16]:
# Retraining again, this time using Regularized Greedy Forests and Catboost.
X['final'] = final
y = X.final
# ``columns=`` keyword: positional ``axis`` is rejected by pandas >= 2.0.
X = X.drop(columns='final')

# As in the previous step, every blend uses the same two fitted models, so
# each one is trained once and its predictions are reused.
cat_fit_preds = CatBoostRegressor(verbose = False, random_state=2020).fit(X,y).predict(X)
rgf_fit_preds = RGFRegressor().fit(X, y).predict(X)
preds_1 = cat_fit_preds*0.7 + rgf_fit_preds*0.3
preds_2 = cat_fit_preds*0.5 + rgf_fit_preds*0.5
preds_3 = cat_fit_preds*0.6 + rgf_fit_preds*0.4

final2 = [x*0.3 + y*0.3 + z*0.4 for x, y, z in zip(preds_1, preds_2, preds_3)]


## Creating a submission file

In [17]:
# Limit the predictions to [0, 90] and write the submission file.
# NOTE(review): 0.2 is subtracted AFTER the clip, so predictions clipped to 0
# are written as -0.2 -- confirm this is intended.
final2 = np.clip(final2, 0, 90)
sub_df = pd.DataFrame({'ward': test['ward'], 'target_pct_vunerable': final2 - 0.2})
sub_df.to_csv('../Submissions/second.csv', index=False)

## Challenges faced

We faced a problem of reproducibility as the score was changing with each submission with no change in code. However, that was solved by setting the *random_state* parameter of all models that have it to the same value. Now, the solution provides a consistently similar score each time it's rerun.


Moreover, the reproducible solution achieves a private leaderboard score of *3.50354986128398*, which is better than the *3.52760301028188* scored by the submission we uploaded in time.

# CatBoost

In [2]:
!pip install -q catboost



In [3]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')
%matplotlib inline
# Use the fully-qualified option key: the abbreviation 'max_column' is
# resolved by pattern matching and raises an error once it matches more
# than one option (e.g. 'styler.render.max_columns' in newer pandas).
pd.set_option('display.max_columns', 1000)

In [5]:
# Load the four competition files.
# NOTE: these paths are relative to the parent of the working directory
# ('../Data/...'), unlike the 'Data/...' paths used in the XGBoost section.
train = pd.read_csv('../Data/Train_maskedv2.csv')
test = pd.read_csv('../Data/Test_maskedv2.csv')
vdefinition = pd.read_csv('../Data/variable_descriptions_v2.csv')
submission = pd.read_csv('../Data/samplesubmissionv2.csv')

In [6]:
# Fully-qualified option keys: abbreviated names are resolved by pattern
# matching and can break when pandas adds options with similar names.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_info_rows', 1000)

In [7]:
# Remove the same three columns from both sets.
dropped_columns = ['dw_12', 'dw_13', 'lan_13']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [8]:
# Remember how many rows belong to train so the stacked frame can be split
# again after feature engineering.
train_len = train.shape[0]
data = pd.concat(objs=[train, test])

In [9]:
# Two complementary sums over car_*, stv_*, psa_*, dw_* and lln_* columns.
# NOTE(review): 'rich' uses stv_01 here, whereas the "luxury" features of the
# other two approaches pair car_01 with stv_00 -- confirm which is intended.
data['rich'] = data['car_01'].add(data['stv_01']).add(data['psa_01']).add(data['dw_02']).add(data['lln_00'])
data['poor'] = data['car_00'].add(data['stv_00']).add(data['psa_00']).add(data['dw_01']).add(data['lln_01'])

In [10]:
# Average number of individuals per household.
data['household_size'] = data['total_individuals'].div(data['total_households'])

In [11]:
from sklearn.cluster import KMeans
# Every column except the target and the identifier feeds the clustering.
# ``columns=`` keyword: the positional ``axis`` argument of DataFrame.drop
# is deprecated and rejected by pandas >= 2.0.
columns=data.drop(columns=["target_pct_vunerable","ward"]).columns

# Work on a copy so the scaling below does not alter ``data`` itself.
data_km=data[columns].copy()

# Only the two count columns are divided by their maximum before clustering.
data_km["total_households"]/=data_km["total_households"].max()
data_km["total_individuals"]/=data_km["total_individuals"].max()

# Clusters are fitted on train and test rows together.
km=KMeans(15,random_state=2019)
data["cluster"]=km.fit_predict(data_km[columns])

In [12]:
# Split the stacked frame back into train and test. Explicit copies are
# taken because both frames are modified afterwards (columns dropped and
# reassigned); modifying plain slices of ``data`` triggers pandas'
# SettingWithCopyWarning.
train = data[:train_len].copy()
test = data[train_len:].copy()

In [13]:
# Keep the ward identifiers for the submission file, then remove the
# non-feature columns. ``drop`` is used without ``inplace`` so new frames
# are returned instead of mutating slices of ``data`` in place.
_id = test['ward']
test = test.drop(columns=['target_pct_vunerable','ward'])
train = train.drop(columns=['ward'])

In [14]:
# Display the (rows, columns) shape of both frames.
train.shape, test.shape

((3174, 50), (1102, 49))

In [15]:
# Replace the two count columns by their base-10 logarithm in both sets.
size_columns = ['total_households', 'total_individuals']
train[size_columns] = np.log10(train[size_columns])
test[size_columns] = np.log10(test[size_columns])

In [16]:
# Target vector and feature matrix for the CatBoost model.
y = train['target_pct_vunerable']
X = train.drop(columns='target_pct_vunerable')

In [17]:
# Display the feature names.
X.columns

Index(['total_households', 'total_individuals', 'dw_00', 'dw_01', 'dw_02',
       'dw_03', 'dw_04', 'dw_05', 'dw_06', 'dw_07', 'dw_08', 'dw_09', 'dw_10',
       'dw_11', 'psa_00', 'psa_01', 'psa_02', 'psa_03', 'psa_04', 'stv_00',
       'stv_01', 'car_00', 'car_01', 'lln_00', 'lln_01', 'lan_00', 'lan_01',
       'lan_02', 'lan_03', 'lan_04', 'lan_05', 'lan_06', 'lan_07', 'lan_08',
       'lan_09', 'lan_10', 'lan_11', 'lan_12', 'lan_14', 'pg_00', 'pg_01',
       'pg_02', 'pg_03', 'pg_04', 'lgt_00', 'rich', 'poor', 'household_size',
       'cluster'],
      dtype='object')

In [18]:
# True if at least one feature column contains a missing value.
X.isna().sum().any()

False

In [3]:
from rgf.sklearn import RGFRegressor

In [19]:
# Columns that are rounded to two decimals in the next cell.
col = ['car_00', 'car_01', 'dw_00', 'dw_01', 'dw_02', 'dw_03', 'dw_04',
       'lan_08', 'lan_09', 'lan_10', 'lan_11', 'lan_12', 'lan_14', 'lgt_00',
       'dw_05', 'dw_06', 'dw_07', 'dw_08', 'dw_09', 'dw_10', 'dw_11', 'lan_00',
       'lan_01', 'lan_02', 'lan_03', 'lan_04', 'lan_05', 'lan_06', 'lan_07',
       'lln_00', 'lln_01', 'pg_00', 'pg_01', 'pg_02', 'pg_03', 'pg_04',
       'psa_00', 'psa_01', 'psa_02', 'psa_03', 'psa_04', 'stv_00', 'stv_01',
        'rich', 'poor']

In [20]:
# Round the listed columns to two decimals in both feature frames.
X[col] = X[col].round(decimals=2)
test[col] = test[col].round(decimals=2)

In [21]:
# Positions of the columns whose dtype is not float; these are passed to
# CatBoost as categorical features. The builtin ``float`` is used because
# the ``np.float`` alias (which was just the builtin) was deprecated in
# NumPy 1.20 and removed in 1.24.
categorical_features_indices = np.where(X.dtypes != float)[0]; categorical_features_indices

array([48])

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold, TimeSeriesSplit

# 15-fold shuffled cross-validation: a fresh CatBoost model is trained per
# fold, scored on the held-out part and used to predict the test set.
testsplit_store=[]   # validation RMSE of each fold
test_store=[]        # test-set predictions of each fold
fold=KFold(n_splits=15, shuffle=True, random_state=123456)
i=1
for train_index, test_index in fold.split(X,y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    cat = CatBoostRegressor(n_estimators=10000,eval_metric='RMSE', learning_rate=0.0801032, random_seed= 123456, l2_leaf_reg=4, use_best_model=True)
    cat.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=300,verbose=100, cat_features=categorical_features_indices)
    predict = cat.predict(X_test)
    # The fold RMSE is computed once, then both reported and stored.
    fold_rmse = np.sqrt(mean_squared_error(y_test,predict))
    print("err: ",fold_rmse)
    testsplit_store.append(fold_rmse)
    pred = cat.predict(test)
    test_store.append(pred)

In [None]:
# Mean validation RMSE over the folds.
np.mean(testsplit_store)

In [None]:
# Average the per-fold test predictions (axis 0 runs over the folds) and
# pair them with the ward identifiers.
fold_average = np.mean(test_store, axis=0)
submit_prep = {"ward": _id, 'target_pct_vunerable': fold_average}
submission = pd.DataFrame(submit_prep)

In [None]:
# Display the first rows of the submission.
submission.head()

In [None]:
# Write the unclipped CatBoost predictions.
submission.to_csv('Submissions/third.csv', index=False)

In [None]:
# Second file: the same predictions limited to the [0, 90] range.
submission['target_pct_vunerable'] = submission['target_pct_vunerable'].clip(lower=0, upper=90)
submission.to_csv('Submissions/third_cliped.csv', index=False)

In [None]:
## Check for the feature importance
# ``cat`` is the model left over from the last cross-validation fold.
fea_imp = pd.DataFrame({'imp':cat.feature_importances_, 'col': X.columns})
# Ascending by importance (ties: descending name); keep at most the last 60
# rows, i.e. the most important features.
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).tail(60)
_ = fea_imp.plot(x='col', y='imp', kind='barh', figsize=(20, 10))
plt.savefig('catboost_feature_importance.png')

# Ensembling

In [None]:
# Load the four individual submissions and average them with equal weights.
s1 = pd.read_csv("Submissions/first.csv")
s2 = pd.read_csv("Submissions/second.csv")
s3 = pd.read_csv("Submissions/alpha_beta.csv")
s4 = pd.read_csv("Submissions/third.csv")
# NOTE(review): the s3 column name carries a trailing space
# ('target_pct_vunerable ') -- presumably that is how the file was written;
# verify against the CSV header.
uniform_mean = (s1['target_pct_vunerable']+s2['target_pct_vunerable']+s3['target_pct_vunerable ']+s4['target_pct_vunerable'])/4
submission['target_pct_vunerable'] = uniform_mean
submission.to_csv('Submissions/'+'assembling_uniform.csv', index = False)

In [None]:
#Best submission
# Equal-weight mean of the first two submissions, limited to [a_min, a_max].
a_min, a_max = 0.099, 47
first_second_mean = (s1['target_pct_vunerable']+s2['target_pct_vunerable'])/2
submission['target_pct_vunerable'] = np.clip(first_second_mean, a_min=a_min, a_max=a_max)
submission.to_csv('Submissions/'+f'assembling_uniform_first_second_clipped_{a_min}_{a_max}.csv', index = False)

### Other trials

In [None]:
# Load two previously saved voting submissions.
alpha = pd.read_csv("Submissions/alpha_voting.csv")
beta = pd.read_csv("Submissions/beta_voting.csv")

In [None]:
# Limit the alpha-voting predictions to [a_min, a_max] and save them.
a_min, a_max = 0.099, 47
submission['target_pct_vunerable'] = alpha['target_pct_vunerable '].clip(lower=a_min, upper=a_max)
submission.to_csv('Submissions/'+f'alpha_voting_clipped_{a_min}_{a_max}.csv', index = False)

In [None]:
# Limit the beta-voting predictions to [a_min, a_max] and save them.
a_min, a_max = 0.099, 47
submission['target_pct_vunerable'] = beta['target_pct_vunerable '].clip(lower=a_min, upper=a_max)
submission.to_csv('Submissions/'+f'beta_voting_clipped_{a_min}_{a_max}.csv', index = False)

In [None]:
# Weighted blend of the four submissions.
# NOTE(review): the weights add up to 1.2 (0.3 + 0.5 + 0.2 + 0.2), so the
# blend is scaled up relative to a weighted average -- confirm intended.
submission['target_pct_vunerable']=0.3*s1['target_pct_vunerable']+0.5*s2['target_pct_vunerable']+0.2*s3['target_pct_vunerable ']+0.2*s4['target_pct_vunerable']
submission.to_csv('Submissions/'+'assembling_weighted.csv', index = False)

In [None]:
# Display the column names of the fourth submission file.
s4.columns

In [None]:
# Equal-weight mean of the four submissions, limited to [a_min, a_max].
a_min, a_max = 0.099, 47
four_way_mean = (s1['target_pct_vunerable']+s2['target_pct_vunerable']+s3['target_pct_vunerable ']+s4['target_pct_vunerable'])/4
submission['target_pct_vunerable'] = np.clip(four_way_mean, a_min=a_min, a_max=a_max)
submission.to_csv('Submissions/'+f'assembling_uniform_clipped_{a_min}_{a_max}.csv', index = False)

In [None]:
# Weighted blend of the first two submissions: weight ``w`` for the first
# and ``1 - w`` for the second, limited to [0.099, 47].
# The first term previously used ``w1``, which is not defined until a later
# cell (NameError when run top to bottom, or a stale value otherwise); the
# weight named in the output file is ``w``.
w=0.4
submission['target_pct_vunerable']=np.clip(w*s1['target_pct_vunerable']+(1-w)*s2['target_pct_vunerable'], a_min=0.099, a_max=47)
submission.to_csv('Submissions/'+f'assembling_first_second_weighted_{w}_clipped.csv', index = False)

In [None]:
# Equal-weight mean of a TabNet submission and a clipped uniform ensemble.
# NOTE(review): both input files carry a '0.06_47' clipping suffix, whereas
# the cells above write files clipped at 0.099 -- presumably these were
# produced by an earlier run; verify the files exist.
dif_1 = pd.read_csv("Submissions/tabnet_best_params_cliped_0.06_47.csv")
dif_2 = pd.read_csv("Submissions/assembling_uniform_clipped_0.06_47.csv")
submission['target_pct_vunerable']=(dif_1['target_pct_vunerable']+dif_2['target_pct_vunerable'])/2
submission.to_csv('Submissions/'+'assembling_uniform_tabnet_f_ass.csv', index = False)

In [None]:
# Weighted blend: ``w1`` for the TabNet file and ``1 - w1`` for the uniform
# ensemble, limited to [a_min, a_max] (the bounds also name the output file).
w1=0.035
a_min, a_max = 0.06, 47
weighted_blend = w1*dif_1['target_pct_vunerable']+(1-w1)*dif_2['target_pct_vunerable']
submission['target_pct_vunerable'] = np.clip(weighted_blend, a_min=a_min, a_max=a_max)
submission.to_csv('Submissions/'+f'assembling_uniform_tabnet_f_ass_weighted_{w1}_clipped_{a_min}_{a_max}.csv', index = False)