In [None]:
import requests
import time
import re
import warnings
import pickle
import random
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from requests.auth import HTTPBasicAuth
from imblearn.over_sampling import RandomOverSampler

In [None]:
seed = random.randint(1, 100)

In [None]:
print ("LOADING DATASETS...")
try:
    df_train = pd.read_csv("mfalonso__6aQ6IxU7Va__train.csv")
    df_train.head()
except:
    import urllib
    csv_train = urllib.request.urlopen("http://manoelutad.pythonanywhere.com/static/uploads/mfalonso__6aQ6IxU7Va__train.csv")
    csv_train_content = csv_train.read()
    with open("mfalonso__6aQ6IxU7Va__train.csv", 'wb') as f:
            f.write(csv_train_content)
    df_train = pd.read_csv("mfalonso__6aQ6IxU7Va__train.csv")
    
try:
    df_test = pd.read_csv("mfalonso__6aQ6IxU7Va__test.csv")
    df_test.head()
except:
    import urllib
    csv_test = urllib.request.urlopen("http://manoelutad.pythonanywhere.com/static/uploads/mfalonso__6aQ6IxU7Va__test.csv")
    csv_test_content = csv_test.read()
    with open("mfalonso__6aQ6IxU7Va__test.csv", 'wb') as f:
            f.write(csv_test_content)
    df_test = pd.read_csv("mfalonso__6aQ6IxU7Va__test.csv")

In [None]:
df_train = df_train.fillna(df_train.mean())
df_test = df_test.fillna(df_test.mean())

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
in_model = []

list_ib = []
list_icn = []
list_ico = []
list_if = []

list_inputs = set()

for var_name in df_train.columns:
    if re.search('^ib_',var_name):
        list_inputs.add(var_name)      
        list_ib.append(var_name)
    elif re.search('^icn_',var_name):
        list_inputs.add(var_name)      
        list_icn.append(var_name)
    elif re.search('^ico_',var_name):
        list_inputs.add(var_name)      
        list_ico.append(var_name)
    elif re.search('^if_',var_name):
        list_inputs.add(var_name)      
        list_if.append(var_name)
    elif re.search('^ob_',var_name):
        output_var = var_name
    else:
        print ("ERROR: unable to identify the type of:", var_name)

df_train[list_if] = df_train[list_if].astype(float)
df_test[list_if] = df_test[list_if].astype(float)

df_train[list_ib] = df_train[list_ib].astype(str)
df_test[list_ib] = df_test[list_ib].astype(str)

df_train[list_icn] = df_train[list_icn].astype(str)
df_test[list_icn] = df_test[list_icn].astype(str)

df_train[list_ico] = df_train[list_ico].astype(int)
df_test[list_ico] = df_test[list_ico].astype(int)

In [None]:
y_train = df_train["ob_target"]
df_train = df_train.drop('ob_target', axis=1)

In [None]:
columns_to_keep = ['id', 'ico_var_58', 'if_var_67', 'ico_var_47', 'if_var_70', 'ico_var_34', 'ico_var_42', 'ico_var_26', 'ib_var_2', 'if_var_80', 'ico_var_33', 'ib_var_14', 'icn_var_24', 'ico_var_40', 'ico_var_56', 'ico_var_60', 'ico_var_57', 'if_var_81', 'ib_var_18', 'if_var_72', 'ib_var_6', 'ico_var_41', 'ico_var_44', 'ico_var_43', 'ico_var_32', 'if_var_69', 'ico_var_38', 'ico_var_55', 'ico_var_64', 'if_var_79', 'ico_var_53', 'ico_var_36', 'ib_var_21', 'ico_var_31', 'ico_var_39', 'if_var_77', 'if_var_78', 'if_var_74', 'if_var_76', 'ico_var_29', 'ico_var_35']
df_train = df_train[columns_to_keep]
df_test = df_test[columns_to_keep]

In [None]:
#df_test = df_test.drop('contract_date', axis=1)
X_train = df_train
X_test = df_test

In [None]:
oversampler = RandomOverSampler(sampling_strategy=0.6, random_state=seed)

X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

In [None]:
model = RandomForestClassifier(n_estimators=2000)

extra_model = ExtraTreesClassifier(n_estimators=2000)

model.fit(X_train_balanced, y_train_balanced)
extra_model.fit(X_train_balanced, y_train_balanced)

pred_train = (model.predict_proba(X_train_balanced)[:, 1] + extra_model.predict_proba(X_train_balanced)[:, 1]) / 2

model_predictions = model.predict_proba(X_test)[:, 1]
extra_model_predictions = extra_model.predict_proba(X_test)[:, 1]

pred_test = (model_predictions + extra_model_predictions) / 2

In [None]:
print ("STEP 4: ASSESSING THE MODEL...")
gini_score = 2*roc_auc_score(y_train_balanced, pred_train)-1
print ("GINI DEVELOPMENT=", gini_score)

importances = model.feature_importances_
feature_names = df_train.columns.tolist()

indices = np.argsort(importances)[::-1]
sorted_feature_names = [feature_names[i] for i in indices]

print("Feature Importance Ranking:")
for i, feature in enumerate(sorted_feature_names):
    print(f"{i+1}. {feature}: {importances[indices[i]]}")

selected_variables = []

for i, importance in enumerate(importances):
    if importance > 0.006:
        selected_variables.append(X_train.columns[i])

print(len(selected_variables))
print(selected_variables)

In [None]:
print ("STEP 5: SUBMITTING THE RESULTS... DO NOT CHANGE THIS PART!")

df_test['pred'] = pred_test
df_test['id'] = df_test.iloc[:,0]
df_test_tosend = df_test[['id','pred']]

filename = "df_test_tosend.csv"
df_test_tosend.to_csv(filename, sep=',')
url = 'http://manoelutad.pythonanywhere.com/uploadpredictions/6aQ6IxU7Va'
files = {'file': (filename, open(filename, 'rb')),
         'ipynbcode': ('6aQ6IxU7Va.ipynb', open('6aQ6IxU7Va.ipynb', 'rb'))}

rsub = requests.post(url, files=files, auth=HTTPBasicAuth("username", "password"))
resp_str = str(rsub.text)
print ("RESULT SUBMISSION: ", resp_str)

In [None]:
pattern = r"gini = (\d+\.?\d*)"

match = re.search(pattern, resp_str)

current_gini = float(match.group(1))

max_gini = 0.560652

if current_gini > max_gini:
    max_gini = current_gini
    print("New max_gini:", max_gini)
    with open('model.pkl', 'wb') as file:
        pickle.dump(model, file)
else:
      print("Lower than current gini:", max_gini)

In [None]:
num_iterations = 10000
iteration = 0
max_gini = 0.560652

In [None]:
while True:
     try:
          for i in range(num_iterations):
                    seed = random.randint(1, 100)
                    iteration += 1
                    print("Iteration:", iteration)
                    try:
                         df_train = pd.read_csv("mfalonso__6aQ6IxU7Va__train.csv")
                         df_train.head()
                    except:
                         import urllib
                         csv_train = urllib.request.urlopen("http://manoelutad.pythonanywhere.com/static/uploads/mfalonso__6aQ6IxU7Va__train.csv")
                         csv_train_content = csv_train.read()
                         with open("mfalonso__6aQ6IxU7Va__train.csv", 'wb') as f:
                              f.write(csv_train_content)
                         df_train = pd.read_csv("mfalonso__6aQ6IxU7Va__train.csv")
          
                    try:
                         df_test = pd.read_csv("mfalonso__6aQ6IxU7Va__test.csv")
                         df_test.head()
                    except:
                         import urllib
                         csv_test = urllib.request.urlopen("http://manoelutad.pythonanywhere.com/static/uploads/mfalonso__6aQ6IxU7Va__test.csv")
                         csv_test_content = csv_test.read()
                         with open("mfalonso__6aQ6IxU7Va__test.csv", 'wb') as f:
                              f.write(csv_test_content)
                         df_test = pd.read_csv("mfalonso__6aQ6IxU7Va__test.csv")

                    df_train = df_train.fillna(df_train.mean())
                    df_test = df_test.fillna(df_test.mean())
                    
                    in_model = []
                    list_ib = []
                    list_icn = []
                    list_ico = []
                    list_if = []

                    list_inputs = set()

                    for var_name in df_train.columns:
                         if re.search('^ib_',var_name):
                              list_inputs.add(var_name)      
                              list_ib.append(var_name)
                         elif re.search('^icn_',var_name):
                              list_inputs.add(var_name)      
                              list_icn.append(var_name)
                         elif re.search('^ico_',var_name):
                              list_inputs.add(var_name)      
                              list_ico.append(var_name)
                         elif re.search('^if_',var_name):
                              list_inputs.add(var_name)      
                              list_if.append(var_name)
                         elif re.search('^ob_',var_name):
                              output_var = var_name
                         else:
                              print ("ERROR: unable to identify the type of:", var_name)

                    df_train[list_if] = df_train[list_if].astype(float)
                    df_test[list_if] = df_test[list_if].astype(float)

                    df_train[list_ib] = df_train[list_ib].astype(str)
                    df_test[list_ib] = df_test[list_ib].astype(str)

                    df_train[list_icn] = df_train[list_icn].astype(str)
                    df_test[list_icn] = df_test[list_icn].astype(str)

                    df_train[list_ico] = df_train[list_ico].astype(int)
                    df_test[list_ico] = df_test[list_ico].astype(int)

                    y_train = df_train["ob_target"]
                    df_train = df_train.drop('ob_target', axis=1)

                    df_train = df_train[columns_to_keep]
                    df_test = df_test[columns_to_keep]

                    #df_test = df_test.drop('contract_date', axis=1)

                    X_train = df_train
                    X_test = df_test
          
                    oversampler = RandomOverSampler(sampling_strategy=0.7, random_state=seed)
                    X_train_balanced, y_train_balanced = oversampler.fit_resample(X_train, y_train)

                    model = RandomForestClassifier(n_estimators=5000)

                    extra_model = ExtraTreesClassifier(n_estimators=5000)

                    model.fit(X_train_balanced, y_train_balanced)
                    extra_model.fit(X_train_balanced, y_train_balanced)

                    pred_train = (model.predict_proba(X_train_balanced)[:, 1] + extra_model.predict_proba(X_train_balanced)[:, 1]) / 2

                    model_predictions = model.predict_proba(X_test)[:, 1]
                    extra_model_predictions = extra_model.predict_proba(X_test)[:, 1]

                    pred_test = (model_predictions + extra_model_predictions) / 2

                    df_test['pred'] = pred_test
                    df_test['id'] = df_test.iloc[:,0]
                    df_test_tosend = df_test[['id','pred']]

                    filename = "df_test_tosend.csv"
                    df_test_tosend.to_csv(filename, sep=',')
                    url = 'http://manoelutad.pythonanywhere.com/uploadpredictions/6aQ6IxU7Va'
                    files = {'file': (filename, open(filename, 'rb')),
                         'ipynbcode': ('6aQ6IxU7Va.ipynb', open('6aQ6IxU7Va.ipynb', 'rb'))}

                    rsub = requests.post(url, files=files, auth=HTTPBasicAuth("username", "password"))
                    resp_str = str(rsub.text)
                    print ("RESULT SUBMISSION: ", resp_str)

                    pattern = r"gini = (\d+\.?\d*)"

                    match = re.search(pattern, resp_str)

                    current_gini = float(match.group(1))

                    if current_gini > max_gini:
                         max_gini = current_gini
                         print("New max_gini:", max_gini)
                         with open('model.pkl', 'wb') as file:
                              pickle.dump(model, file)
                    else:
                         print("Lower than current gini", max_gini)

                    print("\n")
                    time.sleep(20)
     except Exception as e:
        # Print the error message
        print(f"Error occurred: {str(e)}")
        print("Retrying...")