In [9]:
# Execute the helper notebook in this kernel's namespace.
# NOTE(review): make_experiment(), called further down, is not defined in this
# notebook and is presumably provided by this %run — confirm.
%run mlflow_make_experiment.ipynb

import pandas as pd
import numpy as np
import catboost as ctb
from itertools  import combinations
from datetime import date

# Read the raw train and test churn tables from their HDF5 files (train first).
df_train, df_test = (
    pd.read_hdf(path)
    for path in ("../input/train_churn_pred.h5", "../input/test_churn_pred.h5")
)

# Sanity check: (rows, columns) of both frames.
print(df_train.shape, df_test.shape)

def prepare_submit(df_train, df_test, model, feats, out_filename, treshold=0.2):
    """Fit ``model`` on the training frame and write a thresholded submission CSV.

    Parameters
    ----------
    df_train : DataFrame
        Must contain the ``feats`` columns and the ``"churn_probability"`` target.
    df_test : DataFrame
        Must contain the ``feats`` columns and an ``"id"`` column. It is NOT
        modified by this function.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of the
        returned probabilities is treated as the positive class.
    feats : sequence of column labels
        Feature columns used for both fitting and prediction.
    out_filename : str
        File stem; the CSV is written to ``../output/<out_filename>.csv``.
    treshold : float, default 0.2
        Probability cut-off; predictions strictly above it become 1, else 0.
        (The misspelled name is kept for backward compatibility with callers.)

    Side effects
    ------------
    Fits ``model`` in place and writes the CSV file. The output directory is
    expected to exist already.
    """
    # Missing values are encoded as -1 for both the train and test matrices.
    X_train = df_train[feats].fillna(-1).values
    y_train = df_train["churn_probability"].values
    X_test = df_test[feats].fillna(-1).values

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > treshold).astype("int")

    # Build the submission on a fresh frame instead of writing the prediction
    # column into the caller's df_test: mutating it would leak a
    # "churn_probability" column into any feature list derived from that frame
    # afterwards.
    submission = df_test[["id"]].assign(churn_probability=y_pred)

    out_path = "../output/{}.csv".format(out_filename)
    submission.to_csv(out_path, index=False)
    
    
def fe(df):
    """Add engineered features to ``df`` in place and return the same frame.

    Three groups of columns are added:

    1. For every metric that has a column whose name contains ``"mou_6"``
       (the metric stem is that name without its trailing ``_6``), using the
       month-8/7/6 columns of the metric:
       row-wise ``mean_``/``sum_``/``min_``/``max_`` aggregates, pairwise
       month differences and ratios (``"A - B"``, ``"A / B"``), and the
       difference/ratio between each month's column mean and the row value
       (``"mean_A - A"``, ``"mean_A / A"``).
    2. The six ``date_of_last_rech*`` columns are parsed with the
       ``%m/%d/%Y`` format and ``_year``/``_month``/``_day`` columns are added.
    3. ``_log1p`` and ``_sqrt`` transforms of a fixed list of features.

    Notes
    -----
    * Column means are computed on the frame passed in, so train and test
      frames processed separately each use their own means.
    * Ratios are not sanitised: a zero denominator yields inf/NaN, and
      log1p/sqrt yield NaN for inputs outside their domain.
    * Derived column names themselves contain ``"mou_6"``, so this function is
      meant to be applied once to a raw frame, not re-applied to its own output.

    Parameters
    ----------
    df : DataFrame
        Raw frame containing the metric, date and "most important" columns
        referenced below. It is modified in place.

    Returns
    -------
    DataFrame
        The same object as ``df``, with the new columns appended.
    """
    # Take metric names from the frame being transformed (not from a notebook
    # global) and sort them: iterating a bare set of strings gives an order
    # that can change between kernel restarts, which would reorder the
    # generated columns and any feature list derived from them.
    metrics = sorted(
        {"_".join(x.split("_")[:-1]) for x in df.columns if "mou_6" in x}
    )

    for metric in metrics:
        metric_months = ["{}_{}".format(metric, x) for x in [8, 7, 6]]
        monthly = df[metric_months]

        # Mean of each monthly column over all rows of this frame.
        mean_by_month = monthly.mean(axis=0).to_dict()

        # Row-wise aggregates across the three months.
        df[f"mean_{metric}"] = monthly.mean(axis=1)
        df[f"sum_{metric}"] = monthly.sum(axis=1)
        df[f"min_{metric}"] = monthly.min(axis=1)
        df[f"max_{metric}"] = monthly.max(axis=1)

        # Month-over-month change, as difference and as ratio.
        for first, second in combinations(metric_months, 2):
            df[f"{first} - {second}"] = df[first] - df[second]
            df[f"{first} / {second}"] = df[first] / df[second]

        # Deviation of each row from the month's column mean. Computed once
        # per month column (every month appears in two pairs, so doing this
        # inside the pair loop would only repeat identical work).
        for col in metric_months:
            df[f"mean_{col} - {col}"] = mean_by_month[col] - df[col]
            df[f"mean_{col} / {col}"] = mean_by_month[col] / df[col]

    # Convert the date columns to datetime and extract year, month and day.
    date_cols = [
        'date_of_last_rech_6',
        'date_of_last_rech_7',
        'date_of_last_rech_8',
        'date_of_last_rech_data_6',
        'date_of_last_rech_data_7',
        'date_of_last_rech_data_8',
    ]

    for col in date_cols:
        df[col] = pd.to_datetime(df[col], format='%m/%d/%Y')
        df[col+"_year"] = df[col].dt.year
        df[col+"_month"] = df[col].dt.month
        df[col+"_day"] = df[col].dt.day

    # Features that ranked high in the eli5 importance ranking.
    most_imp = [
        'last_day_rch_amt_8',
        'total_ic_mou_8 / total_ic_mou_7',
        'roam_og_mou_8 / roam_og_mou_7',
        'mean_total_ic_mou_8 - total_ic_mou_8',
        'total_ic_mou_8 / total_ic_mou_6',
        'roam_og_mou_8',
        'total_rech_num_7',
        'fb_user_8',
        'total_rech_num_8',
        'aon',
    ]

    # Logarithm and square root of the most important features (vectorised).
    for feat in most_imp:
        df[feat+'_log1p'] = np.log1p(df[feat])
        df[feat+'_sqrt'] = np.sqrt(df[feat])

    # NOTE: a "days between the latest of date_of_last_rech_6/7/8 and today"
    # feature was also tried; it did not improve the score and was dropped.

    return df
  
  
# Engineer features on copies so the raw frames loaded above are not modified
# (train is processed first, then test).
df_train_fe, df_test_fe = (fe(frame.copy()) for frame in (df_train, df_test))

model = ctb.CatBoostClassifier(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.085,
    random_state=0,
    verbose=False,
)

# Numeric columns of the engineered test frame. This list is not handed to
# make_experiment below (feats=None); it is consumed by the later
# prepare_submit call.
# NOTE(review): presumably taken from the test frame so the target column is
# excluded — confirm.
feats = df_test_fe.select_dtypes("number").columns

_ = make_experiment(df_train_fe, model, feats=None, make_random_feats=True)




(34999, 171) (35000, 170)


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

In [7]:
# Best score was obtained with a threshold of 0.18.
prepare_submit(
    df_train_fe,
    df_test_fe,
    model,
    feats,
    "ctb_feats_dates_log_sqrt",
    treshold=0.18,
)