# Submission

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import joblib

In [2]:
# reading the test file
features_to_drop = ['status_group','id']
df_test_X = pd.read_csv('input/feature_sets/base_adv_mean_test.csv')

In [4]:
df_classifiers = pd.read_csv('classification_reports/classification_report.csv')

In [5]:
df_classifiers.head()

Unnamed: 0,classifier,accuracy_score,class,precision,recall,f1_score
0,HalvingGridSearchCV.pkl,0.815216,functional,0.81,0.9,0.85
1,HalvingGridSearchCV.pkl,0.815216,functional needs repair,0.58,0.32,0.41
2,HalvingGridSearchCV.pkl,0.815216,non functional,0.85,0.78,0.82
3,HalvingGridSearchCV_2.pkl,0.813773,functional,0.82,0.89,0.85
4,HalvingGridSearchCV_2.pkl,0.813773,functional needs repair,0.54,0.36,0.43


In [10]:
# condition for class functional needs repair
# this is to findout which class has better score for this minority class
class_filter = df_classifiers['class']=='functional needs repair'
df_classifiers[class_filter][['classifier','recall']].drop_duplicates().sort_values(by='recall', ascending=False).head()


Unnamed: 0,classifier,recall
52,RandomizedSearchCV_ms3.pkl,0.8
55,RandomizedSearchCV_2_ms3.pkl,0.79
49,HalvingGridSearchCV_3_ms3.pkl,0.79
58,GridSearchCV_ms3.pkl,0.78
46,HalvingGridSearchCV_2_ms3.pkl,0.78


so we will take **RandomizedSearchCV_ms3.pkl** CV'S best classifier for this class. 

In [11]:
class_filter = df_classifiers['class']=='functional'
df_classifiers[class_filter][['classifier','recall']].drop_duplicates().sort_values(by='recall', ascending=False).head()

Unnamed: 0,classifier,recall
9,RandomizedSearchCV.pkl,0.91
0,HalvingGridSearchCV.pkl,0.9
21,HalvingGridSearchCV_ms2.pkl,0.9
36,GridSearchCV_ms2.pkl,0.9
27,HalvingGridSearchCV_3_ms2.pkl,0.9


so we will take **RandomizedSearchCV.pkl** CV'S best classifier for this class. 

In [13]:
class_filter = df_classifiers['class']=='non functional'
df_classifiers[class_filter][['classifier','recall']].drop_duplicates().sort_values(by='recall', ascending=False).head()

Unnamed: 0,classifier,recall
41,GridSearchCV_2_ms2.pkl,0.8
20,GridSearchCV_2.pkl,0.8
35,RandomizedSearchCV_2_ms2.pkl,0.8
32,RandomizedSearchCV_ms2.pkl,0.79
8,HalvingGridSearchCV_3.pkl,0.79


so we will take **GridSearchCV_2_ms2.pkl** CV'S best classifier for this class. 

### loading the three cv searchings to get their classifiers

In [22]:
best_fnr_cv = joblib.load('models/RandomizedSearchCV_ms3.pkl')
best_f_cv   = joblib.load('models/RandomizedSearchCV.pkl')
best_nf_cv  = joblib.load('models/GridSearchCV_2_ms2.pkl')

# getting their best classifiers
best_fnr = best_fnr_cv.best_estimator_
best_f   = best_f_cv.best_estimator_
best_nf  = best_nf_cv.best_estimator_

# destroying their references
del best_fnr_cv, best_f_cv, best_nf_cv

### getting the test set

In [23]:
# a little data cleansing
df_test_X.funder_mean_nf.fillna(0, inplace=True) 
df_test_X.funder_mean_fr.fillna(0, inplace=True) 

### getting the prediction for test set from all three classifiers

In [24]:
pred_fnr = best_fnr.predict(df_test_X.drop(features_to_drop, axis=1))
pred_f   = best_f.predict(df_test_X.drop(features_to_drop, axis=1))
pred_nf  = best_nf.predict(df_test_X.drop(features_to_drop, axis=1))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.1s finished


### preparing the file

In [26]:
# preparing the file
df_test_X['status_group'] = ''
df_test_X['status_group_fnr'] = pred_fnr
df_test_X['status_group_f'] = pred_f
df_test_X['status_group_nf'] = pred_nf

In [28]:
df_test_X[['id','status_group_fnr','status_group_f','status_group_nf','status_group']].head()

Unnamed: 0,id,status_group_fnr,status_group_f,status_group_nf,status_group
0,50785,functional,functional,functional,
1,51630,functional,functional,functional,
2,17168,functional,functional,functional,
3,45559,non functional,non functional,non functional,
4,49871,functional,functional,functional,


### preparing the file by weitage

for that, we may need to create a function that would copy the relevant status_group into the status_group column

In [121]:
def get_sg(row):
    ###
    # if class type is 'fnr' then go for the fnr column, rest empty 
    # if class type is 'f' then go for the f column, rest empty
    # if class type is 'nf' then go for the nf column, rest empty
    ###
    
    a = 'functional needs repair'
    b = 'functional'
    c = 'non functional'
    
    if row[0]==a and row[1]==b and row[2]==b:
        return a
    elif row[0]==a and row[1]==a and row[2]==a: 
        return a
    elif row[0]==a and row[1]==c and row[2]==c: 
        return a
    elif row[0]==a and row[1]==b and row[2]==a: 
        return a
    elif row[0]==a and row[1]==c and row[2]==a: 
        return a
    elif row[0]==a and row[1]==b and row[2]==c: 
        return a
    elif row[0]==a and row[1]==c and row[2]==b: 
        return a
    elif row[0]==a and row[1]==a and row[2]==b: 
        return a
    elif row[0]==a and row[1]==a and row[2]==c:         
        return a
    elif row[0]==b and row[1]==b and row[2]==b: 
        return b
    elif row[0]==c and row[1]==b and row[2]==b: 
        return b
    elif row[0]==b and row[1]==b and row[2]==a: 
        return b
    elif row[0]==b and row[1]==c and row[2]==b: 
        return b
    elif row[0]==c and row[1]==c and row[2]==c: 
        return c
    elif row[0]==b and row[1]==c and row[2]==c: 
        return c
    elif row[0]==c and row[1]==b and row[2]==c: 
        return c
    elif row[0]==b and row[1]==b and row[2]==c: 
        return c
    elif row[0]==c and row[1]==c and row[2]==b: 
        return c
    elif row[0]==c and row[1]==c and row[2]==a: 
        return c
        

In [122]:
df_test_X_copy = df_test_X.copy()

In [123]:
df_test_X_copy['status_group'] =''

In [124]:
df_test_X_copy[['status_group_fnr','status_group_f','status_group_nf']].drop_duplicates()

Unnamed: 0,status_group_fnr,status_group_f,status_group_nf
0,functional,functional,functional
3,non functional,non functional,non functional
13,functional,non functional,non functional
15,functional needs repair,functional,functional
17,non functional,functional,non functional
28,functional needs repair,functional needs repair,functional needs repair
32,functional,functional,non functional
41,non functional,functional,functional
48,functional needs repair,non functional,non functional
56,functional needs repair,functional,functional needs repair


In [125]:
df_test_X_copy[['id','status_group_fnr','status_group_f','status_group_nf','status_group']]

Unnamed: 0,id,status_group_fnr,status_group_f,status_group_nf,status_group
0,50785,functional,functional,functional,
1,51630,functional,functional,functional,
2,17168,functional,functional,functional,
3,45559,non functional,non functional,non functional,
4,49871,functional,functional,functional,
...,...,...,...,...,...
14845,39307,non functional,non functional,non functional,
14846,18990,functional,functional,functional,
14847,28749,functional needs repair,functional,functional,
14848,33492,functional,functional,functional,


In [126]:
df_test_X_copy['status_group'] =\
df_test_X_copy[['status_group_fnr','status_group_f','status_group_nf']].apply(get_sg, axis=1)

In [127]:
df_test_X_copy[['status_group_fnr','status_group_f','status_group_nf','status_group']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,status_group_fnr,status_group_f,status_group_nf,status_group
0,functional,functional,functional,functional
1,non functional,non functional,non functional,non functional
2,functional,non functional,non functional,non functional
3,functional needs repair,functional,functional,functional needs repair
4,non functional,functional,non functional,non functional
5,functional needs repair,functional needs repair,functional needs repair,functional needs repair
6,functional,functional,non functional,non functional
7,non functional,functional,functional,functional
8,functional needs repair,non functional,non functional,functional needs repair
9,functional needs repair,functional,functional needs repair,functional needs repair


In [128]:
df_test_X_copy.status_group.value_counts()

functional                 6723
non functional             5052
functional needs repair    3075
Name: status_group, dtype: int64

### saving the file for submission

In [129]:
df_sub_by_weight_11sep21 = df_test_X_copy[['id','status_group']]

In [131]:
df_sub_by_weight_11sep21

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
...,...,...
14845,39307,non functional
14846,18990,functional
14847,28749,functional needs repair
14848,33492,functional


In [132]:
df_sub_by_weight_11sep21.to_csv('submissions/df_sub_by_weight_11sep21.csv', index=False)

# Preparing the file by Best averaging best classifiers

In [149]:
# reading the test file
features_to_drop = ['status_group','id']
df_test_X = pd.read_csv('input/feature_sets/base_adv_mean_test.csv')
# a little data cleansing
df_test_X.funder_mean_nf.fillna(0, inplace=True) 
df_test_X.funder_mean_fr.fillna(0, inplace=True) 

In [150]:
df_classifiers[['classifier','accuracy_score']].drop_duplicates().sort_values(by='accuracy_score', ascending=False).head()

Unnamed: 0,classifier,accuracy_score
6,HalvingGridSearchCV_3.pkl,0.816819
27,HalvingGridSearchCV_3_ms2.pkl,0.816338
30,RandomizedSearchCV_ms2.pkl,0.816017
15,GridSearchCV.pkl,0.815376
36,GridSearchCV_ms2.pkl,0.815216


getting the best three

In [146]:
best_1_cv = joblib.load('models/HalvingGridSearchCV_3.pkl')
best_2_cv   = joblib.load('models/HalvingGridSearchCV_3_ms2.pkl')
best_3_cv  = joblib.load('models/RandomizedSearchCV_ms2.pkl')

# getting their best classifiers
best_1  = best_1_cv.best_estimator_
best_2  = best_2_cv.best_estimator_
best_3  = best_3_cv.best_estimator_

# destroying their references
del best_1_cv, best_2_cv, best_3_cv

In [151]:
df_test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 56 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   region_perf                      14850 non-null  int64  
 1   source_type_perf                 14850 non-null  int64  
 2   amount_tsh_zero                  14850 non-null  int64  
 3   gps_gt_668                       14850 non-null  int64  
 4   cons_year_zero                   14850 non-null  int64  
 5   pump_age_group                   14850 non-null  int64  
 6   all_zeros                        14850 non-null  int64  
 7   district_code                    14850 non-null  int64  
 8   basin                            14850 non-null  int64  
 9   region                           14850 non-null  int64  
 10  region_code                      14850 non-null  int64  
 11  public_meeting_label             14850 non-null  int64  
 12  scheme_management_

getting the prediction

In [152]:
pred_a   = best_1.predict(df_test_X.drop(features_to_drop, axis=1))
pred_b   = best_2.predict(df_test_X.drop(features_to_drop, axis=1))
pred_c   = best_3.predict(df_test_X.drop(features_to_drop, axis=1))

# preparing the file
df_test_X['status_group'] = ''
df_test_X['status_group_a'] = pred_a
df_test_X['status_group_b'] = pred_b
df_test_X['status_group_c'] = pred_c

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 751 out of 751 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 1100 out of 1100 | elapsed:    1.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [155]:
df_test_X[['status_group_a','status_group_b','status_group_c','status_group']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,status_group_a,status_group_b,status_group_c,status_group
0,functional,functional,functional,
1,non functional,non functional,non functional,
2,functional,functional,non functional,
3,functional needs repair,functional needs repair,functional needs repair,
4,non functional,functional,non functional,
5,functional needs repair,functional needs repair,functional,
6,functional needs repair,functional,functional,
7,non functional,non functional,functional,
8,functional,functional,functional needs repair,
9,non functional,functional,functional,


In [174]:
def get_sg_on_avg(cols):
    
    a = 'functional needs repair'
    b = 'functional'
    c = 'non functional'
    
    a_counter = 0
    b_counter = 0
    c_counter = 0
    
    for col in cols:
        if col == a:
            a_counter+=1
        elif col == b:
            b_counter+=1
        elif col == c:
            c_counter+=1
            
            
    if (a_counter > b_counter) and (a_counter > c_counter):
        return a
    elif (b_counter > a_counter) and (b_counter > c_counter):
        return b
    elif (c_counter > a_counter) and (c_counter > b_counter):
        return c
    else:
        return cols[0] # the best estimator result if all are equal    
    

In [175]:
df_test_X['status_group'] = df_test_X[['status_group_a','status_group_b','status_group_c']].apply(get_sg_on_avg, axis=1)

In [176]:
df_test_X[['status_group_a','status_group_b','status_group_c','status_group']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,status_group_a,status_group_b,status_group_c,status_group
0,functional,functional,functional,functional
1,non functional,non functional,non functional,non functional
2,functional,functional,non functional,functional
3,functional needs repair,functional needs repair,functional needs repair,functional needs repair
4,non functional,functional,non functional,non functional
5,functional needs repair,functional needs repair,functional,functional needs repair
6,functional needs repair,functional,functional,functional
7,non functional,non functional,functional,non functional
8,functional,functional,functional needs repair,functional
9,non functional,functional,functional,functional


In [177]:
df_test_X.status_group.value_counts()

functional                 9076
non functional             5219
functional needs repair     555
Name: status_group, dtype: int64

In [178]:
df_sub_by_avg_11sep21 = df_test_X[['id','status_group']]

In [179]:
df_sub_by_avg_11sep21.to_csv('submissions/df_sub_by_avg_11sep21.csv', index=False)

<img src=current_score.PNG>

### So, this score is top 15% of the competitors. I am sure that with some more fine tuning the results, and optimizations, this would lead up to the top 10%. 

# Adding two weak classifiers to the mix

In [183]:
df_classifiers[['classifier','accuracy_score']].drop_duplicates().sort_values(by='accuracy_score', ascending=False)

Unnamed: 0,classifier,accuracy_score
6,HalvingGridSearchCV_3.pkl,0.816819
27,HalvingGridSearchCV_3_ms2.pkl,0.816338
30,RandomizedSearchCV_ms2.pkl,0.816017
15,GridSearchCV.pkl,0.815376
36,GridSearchCV_ms2.pkl,0.815216
0,HalvingGridSearchCV.pkl,0.815216
9,RandomizedSearchCV.pkl,0.815055
21,HalvingGridSearchCV_ms2.pkl,0.814895
3,HalvingGridSearchCV_2.pkl,0.813773
33,RandomizedSearchCV_2_ms2.pkl,0.811688


In [195]:
# Making use of averaging between five classifiers
best_1_cv = joblib.load('models/HalvingGridSearchCV_3.pkl')
best_2_cv   = joblib.load('models/HalvingGridSearchCV_3_ms2.pkl')
best_3_cv  = joblib.load('models/GridSearchCV_ms3.pkl')
best_4_cv  = joblib.load('models/RandomizedSearchCV_2.pkl')
best_5_cv  = joblib.load('models/GridSearchCV_2_ms3.pkl')


# getting their best classifiers
best_1  = best_1_cv.best_estimator_
best_2  = best_2_cv.best_estimator_
best_3  = best_3_cv.best_estimator_
best_4  = best_4_cv.best_estimator_
best_5  = best_5_cv.best_estimator_

# destroying their references
del best_1_cv, best_2_cv, best_3_cv, best_4_cv, best_5_cv

In [196]:
# reading the test file
features_to_drop = ['status_group','id']
df_test_X = pd.read_csv('input/feature_sets/base_adv_mean_test.csv')
# a little data cleansing
df_test_X.funder_mean_nf.fillna(0, inplace=True) 
df_test_X.funder_mean_fr.fillna(0, inplace=True) 


# predictions
pred_a   = best_1.predict(df_test_X.drop(features_to_drop, axis=1))
pred_b   = best_2.predict(df_test_X.drop(features_to_drop, axis=1))
pred_c   = best_3.predict(df_test_X.drop(features_to_drop, axis=1))
pred_d   = best_4.predict(df_test_X.drop(features_to_drop, axis=1))
pred_e   = best_5.predict(df_test_X.drop(features_to_drop, axis=1))

# preparing the file
df_test_X['status_group'] = ''
df_test_X['status_group_a'] = pred_a
df_test_X['status_group_b'] = pred_b
df_test_X['status_group_c'] = pred_c
df_test_X['status_group_d'] = pred_d
df_test_X['status_group_e'] = pred_e

# get the average by voting
df_test_X['status_group'] =\
df_test_X[['status_group_a','status_group_b','status_group_c','status_group_d','status_group_e']].apply(get_sg_on_avg, axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 751 out of 751 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 1100 out of 1100 | elapsed:    3.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.3s
[Paralle

In [197]:
df_test_X.status_group.value_counts()

functional                 8824
non functional             5260
functional needs repair     766
Name: status_group, dtype: int64

In [198]:
df_sub_by_avg_11sep21_5_classifiers = df_test_X[['id','status_group']]

In [199]:
df_sub_by_avg_11sep21.to_csv('submissions/df_sub_by_avg_11sep21_5_clf.csv', index=False)

<img src=current_score.PNG>

same result ! even though the weaker class **functional needs repair** got increased in frequency.