In [1]:
# Notebook-wide imports and display setup.
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn import preprocessing
import lightgbm as lgb
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from tqdm import tqdm
# Initialise plotly's offline notebook mode.
init_notebook_mode(connected=True)
import math 

# Let wide frames (hundreds of var_* columns) display without column truncation.
pd.options.display.max_columns = 500
#import pandas_profiling

def display_importances(feature_importance_df_, top_n=50):
    """Plot the features with the highest mean importance as a horizontal bar chart.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain a "feature" and an "importance" column. Presumably holds
        one row per feature per fold (see the disabled collection code in the
        training cell); the mean is taken over all rows of a feature.
    top_n : int, default 50
        How many features, ranked by mean importance, to show.

    Returns
    -------
    None
        The chart is drawn onto a newly created matplotlib figure.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:top_n].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    # Pin the bar order to the mean ranking. Sorting the raw rows instead would
    # order the bars by each feature's single largest value, which does not
    # match the "avg over folds" title.
    sns.barplot(x="importance", y="feature", data=best_features, order=list(cols))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    #plt.savefig('lgbm_importances01.png')

In [2]:
# Read the two competition tables; both paths are relative to the notebook.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Every train column whose name contains the substring 'var' is a raw feature.
var_list = [col_name for col_name in train_df.columns if 'var' in col_name]

In [3]:
# One "<var>_freq" column name per raw feature.
freq_cols = [f'{name}_freq' for name in var_list]

# For each table: replace every feature value by how often that value occurs
# in the same column of the same table, then keep the smallest such count
# found in each row.
for frame in (test_df, train_df):
    frame[freq_cols] = frame[var_list].apply(lambda col: col.map(col.value_counts()))
    frame['min_freq'] = frame[freq_cols].min(1)

# Split the test rows on whether at least one of their feature values occurs
# exactly once in the test table (min_freq == 1).
has_unique_value = test_df.min_freq == 1
real_test = test_df.loc[has_unique_value].copy()
fake_test = test_df.loc[~has_unique_value].copy()

In [6]:
# NOTE(review): in the visible notebook all_real_df is first assigned in the
# following cell (In [7]), so on a fresh kernel this cell raises NameError;
# it only ran thanks to leftover kernel state. Move it below that cell.
all_real_df.tail()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_0_freq,var_1_freq,var_2_freq,var_3_freq,var_4_freq,var_5_freq,var_6_freq,var_7_freq,var_8_freq,var_9_freq,var_10_freq,var_11_freq,var_12_freq,var_13_freq,var_14_freq,var_15_freq,var_16_freq,var_17_freq,var_18_freq,var_19_freq,var_20_freq,var_21_freq,var_22_freq,var_23_freq,var_24_freq,var_25_freq,var_26_freq,var_27_freq,var_28_freq,var_29_freq,var_30_freq,var_31_freq,var_32_freq,var_33_freq,var_34_freq,var_35_freq,var_36_freq,var_37_freq,var_38_freq,var_39_freq,var_40_freq,va
r_41_freq,var_42_freq,var_43_freq,var_44_freq,var_45_freq,var_46_freq,var_47_freq,var_48_freq,var_49_freq,var_50_freq,var_51_freq,var_52_freq,var_53_freq,var_54_freq,var_55_freq,var_56_freq,var_57_freq,var_58_freq,var_59_freq,var_60_freq,var_61_freq,var_62_freq,var_63_freq,var_64_freq,var_65_freq,var_66_freq,var_67_freq,var_68_freq,var_69_freq,var_70_freq,var_71_freq,var_72_freq,var_73_freq,var_74_freq,var_75_freq,var_76_freq,var_77_freq,var_78_freq,var_79_freq,var_80_freq,var_81_freq,var_82_freq,var_83_freq,var_84_freq,var_85_freq,var_86_freq,var_87_freq,var_88_freq,var_89_freq,var_90_freq,var_91_freq,var_92_freq,var_93_freq,var_94_freq,var_95_freq,var_96_freq,var_97_freq,var_98_freq,var_99_freq,var_100_freq,var_101_freq,var_102_freq,var_103_freq,var_104_freq,var_105_freq,var_106_freq,var_107_freq,var_108_freq,var_109_freq,var_110_freq,var_111_freq,var_112_freq,var_113_freq,var_114_freq,var_115_freq,var_116_freq,var_117_freq,var_118_freq,var_119_freq,var_120_freq,var_121_freq,var_122_freq,var_123_freq,var_124_freq,var_125_freq,var_126_freq,var_127_freq,var_128_freq,var_129_freq,var_130_freq,var_131_freq,var_132_freq,var_133_freq,var_134_freq,var_135_freq,var_136_freq,var_137_freq,var_138_freq,var_139_freq,var_140_freq,var_141_freq,var_142_freq,var_143_freq,var_144_freq,var_145_freq,var_146_freq,var_147_freq,var_148_freq,var_149_freq,var_150_freq,var_151_freq,var_152_freq,var_153_freq,var_154_freq,var_155_freq,var_156_freq,var_157_freq,var_158_freq,var_159_freq,var_160_freq,var_161_freq,var_162_freq,var_163_freq,var_164_freq,var_165_freq,var_166_freq,var_167_freq,var_168_freq,var_169_freq,var_170_freq,var_171_freq,var_172_freq,var_173_freq,var_174_freq,var_175_freq,var_176_freq,var_177_freq,var_178_freq,var_179_freq,var_180_freq,var_181_freq,var_182_freq,var_183_freq,var_184_freq,var_185_freq,var_186_freq,var_187_freq,var_188_freq,var_189_freq,var_190_freq,var_191_freq,var_192_freq,var_193_freq,var_194_freq,var_195_freq,var_196_freq,var_197_freq,var_198_freq,var_199
_freq,min_freq
199986,test_199986,,19.2884,-2.8384,11.9149,6.6611,12.3112,12.9244,5.6492,16.0449,5.3597,8.2981,3.1869,-4.643,14.3275,9.1249,7.0293,14.4337,6.3944,-5.051,23.4822,-0.7194,3.7487,19.3418,4.4676,3.1266,10.5192,13.4962,6.9382,-0.2445,4.6302,6.3713,-0.7122,10.1779,-1.0252,18.3985,12.028,13.5464,3.5534,6.2831,18.38,-0.6706,-8.0934,1.934,11.3936,11.4913,-2.9115,-19.4675,11.0961,-12.3188,-1.9951,22.5925,11.7256,20.637,-8.0507,7.3796,-17.6328,10.7221,13.2778,5.1672,3.4804,8.6019,5.0281,-3.935,-0.543,-0.145,8.4064,5.8188,4.0994,8.4571,5.0136,-2.6873,28.7269,0.4391,-0.5189,17.3968,-2.4433,3.4765,-7.8663,15.7739,4.8272,13.2096,-0.7101,17.8015,9.7218,-5.4593,-11.1326,16.7775,9.5105,3.7602,8.6193,8.7704,-19.2018,6.9746,4.339,11.166,6.915,-0.2164,17.5327,39.4358,1.0662,0.1621,-14.2318,6.1709,27.6916,1.5836,10.6221,4.4757,7.7402,21.9683,14.3204,13.1442,1.6157,7.6355,4.4302,9.4249,2.8111,0.1159,2.1413,33.1694,-4.1329,5.6159,11.6713,14.5749,-12.0028,2.3387,5.2454,12.4992,12.4063,5.3644,-2.5717,12.2485,11.9573,1.2151,8.7814,7.1168,3.3208,-16.5573,23.2789,19.393,7.799,13.0668,6.1201,6.0189,7.7225,12.8592,10.5988,10.2846,12.9023,-8.8618,3.8534,-8.4942,18.464,6.7443,12.3359,16.0837,2.4796,-15.303,11.8232,-9.7278,15.9729,12.2997,37.5144,5.8036,4.1583,2.9002,2.8712,16.3983,2.167,-15.1138,7.5135,6.2927,-5.9864,5.8684,20.0642,-6.272,28.0249,9.4063,-2.592,16.7074,-11.4458,3.5884,-5.1156,11.9522,-7.3733,0.9166,15.8242,2.4801,5.3698,-7.7987,15.2795,0.4011,-3.0704,3.9924,2.8872,3.3142,22.5225,0.9812,0.102,8.3441,14.5823,0.7454,1,5,6,9,9,1,22,3,2,11,6,2,20,3,5,31,4,2,1,2,2,1,5,15,1,34,1,10,6,8,1,8,6,4,14,2,5,6,3,1,2,5,12,30,2,1,4,1,2,2,9,2,3,1,2,2,4,12,1,15,1,1,5,7,7,4,8,1,1324,3,3,47,4,3,1,2,1,4,7,6,2,3,1,1,3,1,5,3,10,1,4,73,1,21,3,18,2,1,12,3,1,2,3,53,9,20,10,5,59,4,2,8,4,3,12,2,5,2,3,1,2,2,1,1,8,25,18,2,2,4,14,15,11,27,7,1,3,1,5,3,5,5,4,3,1,3,6,1,55,3,6,5,2,7,5,1,2,3,1,2,2,46,7,2,1,4,9,1,8,11,1,1,3,2,4,2,3,3,5,3,5,7,2,2,3,1,3,1,2,11,6,3,9,4,3,8,5,20,3,3,1
199993,test_199993,,14.6764,-8.1066,7.1167,2.4138,10.3845,-11.9327,4.7563,16.0455,0.451,8.7944,-3.9173,-6.5391,14.184,12.4859,11.6117,14.3171,8.7637,3.0406,7.203,3.9212,14.2879,5.0927,8.2215,2.824,10.494,13.5207,-8.8797,-0.0327,4.8206,3.7615,0.8842,7.5475,1.4878,15.4758,11.2331,-2.8339,1.5236,7.4809,8.224,-0.3935,-0.0408,10.4346,11.7438,11.2241,7.5504,-2.2057,14.6628,0.3455,14.1002,25.8961,13.0586,12.3107,-4.5285,5.2606,-1.6487,4.431,13.8982,6.1899,11.6932,8.1573,6.4952,-2.1516,0.9917,7.0542,6.4252,0.185,5.2323,9.0955,5.0079,-7.5099,21.2623,0.8219,-3.4378,23.677,14.4819,3.0792,7.312,21.5892,4.4884,17.0566,-0.0614,16.8361,9.3114,-9.9386,-13.9102,25.7293,2.9035,9.2097,5.0275,0.8415,-18.5912,7.1695,8.5847,10.9903,10.2209,0.347,12.1064,1.6045,2.5605,-2.2526,7.8704,7.8797,25.8146,1.5433,10.9763,4.5544,9.8445,6.4871,14.3026,24.5891,8.8249,7.9327,5.1458,16.1681,2.1964,1.8897,4.2775,-5.2348,-10.9547,3.7724,4.7872,12.528,3.3829,3.1407,0.7311,12.0853,13.7389,-0.4409,1.1428,20.3194,12.5439,0.796,7.28,6.9867,0.3569,-6.0601,33.8089,14.4737,6.244,6.7573,5.6872,-3.2821,13.499,10.7475,8.783,3.7361,9.3125,4.3744,4.049,-2.9895,15.335,12.7271,2.6548,18.761,1.4137,7.1311,12.6628,-2.7828,27.1292,16.4812,29.7107,5.4092,5.7543,10.1169,-5.9091,15.5819,3.299,-7.9507,-1.8029,5.6008,3.5663,-0.9549,19.2635,-1.7153,20.7123,12.9913,1.4086,15.4677,-3.1972,3.3401,-6.0621,8.8245,-12.1243,1.7728,15.7449,0.3312,12.4182,-12.199,12.6614,0.8436,5.0961,7.7472,2.8127,6.6012,15.3706,-0.4293,6.8485,10.427,17.497,-13.0074,4,2,2,1,8,2,12,4,3,13,1,4,63,2,1,13,5,4,1,2,2,2,6,25,1,37,4,8,13,6,3,5,9,3,17,1,3,3,5,7,2,5,15,25,5,2,2,1,1,1,18,2,7,13,5,4,6,16,1,7,2,2,8,1,9,7,6,3,485,6,2,34,1,2,3,1,3,2,9,1,5,6,1,2,1,1,3,6,4,4,1,48,1,21,10,18,2,1,13,5,4,3,1,51,6,18,8,1,65,4,4,1,5,2,10,6,4,1,2,3,1,4,2,2,3,22,10,2,1,3,17,23,6,25,1,2,3,3,2,2,2,2,3,3,14,3,6,4,70,3,2,4,2,7,1,1,9,3,1,1,1,41,7,1,3,1,27,2,1,34,6,4,2,5,2,2,4,5,3,5,5,8,1,4,3,1,3,3,4,9,3,5,6,3,8,6,3,7,6,1,1
199995,test_199995,,13.1678,1.0136,10.4333,6.7997,8.5974,-4.1641,4.8579,14.7625,-2.7239,6.9937,2.6802,6.1565,14.3201,17.4594,5.3712,14.8984,5.8064,-10.0334,16.422,30.7786,22.7696,9.8526,4.0207,3.8469,13.1617,13.2522,-12.4547,-1.9127,5.3696,6.9779,-17.9649,10.7856,0.2776,5.8071,10.4332,8.6681,0.9646,11.2181,11.866,6.3732,-2.7215,13.342,11.1627,11.6436,18.4112,-21.308,10.0185,-32.6883,3.5985,16.4779,13.5237,19.5375,-9.2097,6.442,-2.308,5.9196,8.8332,7.2774,2.6084,9.0486,10.3683,-25.9187,-1.0702,-1.3391,6.1586,-1.8552,4.7364,15.6292,5.0223,-2.6639,11.0281,0.0306,1.896,24.9906,32.5007,8.3094,7.6126,25.6503,7.2437,15.0479,7.7879,13.9172,-9.0753,4.8331,4.4553,15.6388,5.5637,4.2547,12.684,0.0995,-1.8135,6.8214,9.3799,11.1513,9.6868,-0.1093,23.1655,8.3491,1.4743,-2.3265,0.0951,9.7517,28.6119,1.7091,13.6924,5.9843,7.0253,22.2816,14.2617,25.2567,1.9588,6.5321,2.993,13.3917,0.4961,-0.6465,0.2973,9.7944,3.2861,-1.2859,29.1451,13.9596,-3.4051,13.9743,3.3887,12.2799,13.9287,4.0643,-0.3375,19.0097,12.9933,0.0775,6.6729,6.9553,6.798,-16.6444,35.3246,21.9734,-2.6651,14.0376,-0.1133,11.9015,16.8878,12.5924,8.2578,6.1113,8.0605,8.7344,3.9054,3.1489,20.0401,15.7083,5.069,20.4789,5.6559,7.086,11.3302,-4.682,12.0228,11.1629,21.4193,5.3269,6.851,15.7062,-3.0267,21.8848,3.5995,-2.5442,3.7888,6.6096,-6.3101,2.2246,29.031,5.0331,26.2206,14.3811,-4.5834,9.8823,12.3426,7.3865,-11.881,10.4412,-0.9849,10.976,12.3183,-2.7719,5.882,8.5245,17.5356,1.0903,2.0544,9.6849,4.6734,-1.366,12.8721,1.2013,-4.6195,9.1568,18.2102,4.8801,5,4,1,6,2,3,8,3,2,14,4,1,29,1,11,27,7,1,2,2,2,3,4,12,3,20,1,3,12,4,2,5,8,2,4,2,3,1,5,3,4,1,16,29,1,2,3,2,1,1,14,3,3,9,2,4,1,9,6,7,3,2,5,6,8,4,5,2,1439,4,1,2,4,3,3,2,4,2,12,15,3,6,2,2,2,5,3,2,1,1,1,49,3,12,3,23,1,2,14,9,1,4,4,42,1,5,5,2,59,3,3,10,9,4,1,2,4,2,1,2,2,6,1,2,7,29,10,5,4,1,17,17,5,27,3,1,1,1,2,2,1,2,3,5,9,3,2,3,52,3,1,2,5,2,5,1,4,3,2,2,1,28,8,3,4,2,12,3,4,2,2,3,1,4,4,3,3,5,1,1,1,4,2,3,3,5,4,1,1,19,1,6,3,2,1,7,2,15,3,1,1
199996,test_199996,,9.7171,-9.1462,7.3443,9.1421,12.8936,3.0191,5.6888,18.8862,5.0915,6.3545,3.2618,-2.0445,13.8246,6.6547,5.0309,14.1999,8.5685,3.819,11.4428,19.6761,22.4473,16.6921,5.5064,3.616,10.4145,13.5557,0.0592,-1.8571,6.3351,6.4782,-15.0057,11.0107,1.1463,11.1037,11.7266,2.2244,1.3367,3.8213,2.7258,4.1475,-12.1092,23.2927,11.1598,11.4884,4.9641,-48.3446,12.154,8.2999,10.1927,9.3731,11.2994,26.2671,1.9017,6.2642,10.4324,18.2401,17.9493,5.4887,-1.4498,7.9868,6.1265,-9.8509,3.6689,-0.1774,5.5691,2.9599,5.3306,14.8157,5.0154,-3.0839,11.1069,0.3795,8.5277,4.1178,41.1888,14.107,7.8092,14.2418,4.2794,15.5445,12.1609,15.5275,-4.609,7.3667,8.6763,18.618,10.0517,19.1785,6.242,7.0502,-28.9851,6.7225,17.3362,10.8515,16.2477,0.5921,22.7872,25.731,1.7975,-2.0057,-6.1583,20.4441,21.8421,1.7256,8.5803,4.9388,8.8325,7.6675,14.0468,17.116,6.3677,5.4146,1.9411,2.6129,3.4406,3.426,2.3695,-8.9106,-13.8603,-2.0931,31.8407,13.8584,-3.226,6.3128,5.8228,12.7894,12.2272,-5.2897,1.6418,21.5449,12.9118,0.9432,7.1812,6.6422,-12.4054,3.2377,13.9731,3.7137,0.7522,16.9374,-0.7161,-0.3948,7.7641,10.755,9.0705,1.6564,9.9044,-13.0327,4.0327,-7.0027,14.8,9.1898,4.2972,20.3867,13.5508,-7.4707,13.2292,-3.5045,9.0808,8.0147,25.5856,5.6828,3.9202,10.5955,-2.9063,18.2834,3.0946,-13.0401,10.6693,5.4541,0.0289,1.5879,8.3102,6.5891,20.1379,14.4782,-7.6099,8.8498,-11.0107,-1.7399,-1.4862,10.0511,-15.125,7.5211,26.2435,-0.8879,6.4135,-1.1382,15.4816,1.7106,5.0071,6.6548,1.8197,2.4104,18.9037,-0.9337,2.9995,9.1112,18.174,-20.7689,3,1,2,3,6,2,13,4,1,7,3,4,28,4,4,20,4,2,3,2,3,2,3,13,1,24,4,5,12,7,1,6,4,2,18,5,5,6,1,2,1,1,20,35,2,1,5,1,1,3,4,2,1,15,1,1,4,7,3,10,2,2,6,8,6,3,9,1,1468,4,1,20,1,2,1,2,2,3,5,10,1,4,4,3,2,2,2,2,4,4,1,27,5,19,2,13,2,2,17,6,4,2,2,41,4,13,2,2,55,2,5,10,10,2,11,4,11,1,2,9,1,4,4,3,4,28,12,3,6,1,11,15,4,20,1,1,3,1,4,2,6,2,2,7,11,4,7,1,71,1,6,1,4,1,1,1,14,4,1,6,2,55,7,5,3,6,23,2,1,30,2,5,1,2,1,3,2,2,1,2,2,11,1,5,3,1,3,1,3,13,1,6,10,3,2,11,3,15,6,1,1
199999,test_199999,,10.4664,1.807,10.2277,6.0654,10.0258,1.0789,4.8879,14.4892,-0.5902,7.8362,8.4796,-5.896,13.8333,2.459,7.8881,14.8566,10.4665,-11.8235,25.243,20.9039,19.0743,15.5896,-2.1275,2.9672,2.0289,13.9641,-2.7192,0.9102,4.1522,3.9101,-8.6119,11.5158,2.8218,11.5082,11.2149,5.4528,-0.5887,2.878,7.0659,5.823,-2.1438,1.5189,11.0499,11.6297,9.2466,-21.8501,11.9929,-24.7056,13.693,31.2491,13.5394,-0.0924,-4.5169,6.0381,-1.164,10.3343,13.0089,5.8252,4.8179,9.4205,16.9195,-22.2258,0.9074,2.0655,7.148,-2.7835,5.1201,20.625,5.0179,2.5964,19.1312,0.8608,2.1197,14.4495,25.5517,18.2896,-0.6072,19.7737,8.0756,15.2295,8.011,11.3898,-3.7269,-4.8575,2.085,17.8978,7.3186,9.8698,10.1636,5.395,-20.9045,6.7966,9.3417,10.2155,11.5941,1.3084,7.8346,12.8029,0.9685,-0.6401,0.3854,12.125,27.8602,1.224,9.9291,5.412,9.764,27.7455,14.1016,25.9169,3.3216,5.5989,5.056,10.9571,1.1325,-0.7894,3.8041,23.0863,-24.5122,4.6938,31.4476,9.4732,5.5884,18.5127,0.1348,12.516,12.8744,1.9396,-3.025,10.3312,11.208,-0.2407,8.2985,7.1313,-8.0724,-8.7638,18.8991,28.2134,8.7092,-1.6528,5.9567,-1.7683,16.0498,12.5814,9.6512,3.5681,9.817,-12.1692,4.2543,-10.9932,17.4201,12.3167,8.9068,20.6013,13.3664,-10.3179,12.5572,-9.7303,9.5033,12.4379,26.701,5.883,3.8301,15.0473,4.8121,17.1152,2.4527,-6.0539,5.3711,6.1772,0.9131,0.3922,15.0415,-1.4703,30.1926,10.3408,-5.4143,13.9104,1.621,2.0273,-2.9853,10.4163,-12.1494,10.016,13.3454,-8.1344,4.6123,3.4033,16.831,0.6155,0.1398,9.2828,1.3601,4.8985,20.0926,-1.3048,-2.5981,10.3378,14.334,-7.7094,5,1,3,5,5,4,17,6,5,8,1,1,48,1,4,23,2,2,1,1,3,4,3,21,1,28,3,4,3,3,3,9,3,3,13,3,2,6,3,2,2,2,20,30,3,1,3,1,2,1,9,1,7,11,1,1,5,8,5,11,3,4,4,4,7,4,6,2,1602,2,3,21,3,2,2,3,3,4,2,7,4,2,4,2,4,6,2,3,5,9,2,58,4,14,4,8,2,3,9,12,2,2,5,16,8,7,4,3,48,1,2,10,9,1,3,3,5,1,1,5,2,5,2,2,2,37,17,4,2,3,9,9,5,30,4,2,2,1,2,1,1,1,4,4,12,5,8,5,23,1,5,6,7,2,2,1,14,3,2,5,1,27,5,2,3,3,11,4,4,17,4,1,3,5,2,2,3,6,3,8,2,9,1,2,2,1,3,1,2,17,4,6,5,4,4,8,3,6,6,1,1


In [7]:
# Recompute the frequency encoding over the train rows and the real_test rows
# taken together, so each count reflects the combined table instead of a
# single source table. (A disabled earlier variant of this cell re-counted two
# train halves and the real/fake test subsets separately.)
all_real_df = pd.concat([train_df, real_test], sort=False)
all_real_df[freq_cols] = all_real_df[var_list].apply(lambda col: col.map(col.value_counts()))

# Split the combined table back apart on whether a target is present. The
# original index labels are kept on purpose (no ignore_index), because the
# test table is later put back into order by sorting on them.
is_unlabelled = all_real_df.target.isna()
train_df = all_real_df.loc[~is_unlabelled]
real_test = all_real_df.loc[is_unlabelled]


In [8]:
# Stitch the two test subsets back into a single table and restore the order
# given by their index labels. (The matching re-assembly of train_df from two
# halves is disabled together with the split that produced them.)
test_df = pd.concat([real_test, fake_test], sort=False).sort_index()

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Run configuration: number of CV folds, the seed handed to the fold splitter,
# and a tag that becomes part of the saved file names.
n_folds, random_seed = 5, 26
model = 'z_score_freq_merged_real_only'

model_name = f"{model}_{n_folds}_folds"
print(f"Model: {model_name}")

Model: z_score_freq_merged_real_only_5_folds


In [62]:
# Columns that are never predictors. The *_freq columns are deliberately NOT
# excluded at this stage: the training cell selects train_df[feats] and then
# reads "<var>_freq" from that selection to build its z-score features.
# (A preceding assignment that also excluded freq_cols was overwritten on the
# very next line, so it had no effect and has been removed.)
exclusion = ['ID_code', 'target']

feats = [c for c in train_df.columns if c not in exclusion]



In [63]:
def add_z_score_features(frames, stats_frame, var_names):
    """Write per-variable z-score columns into every frame of ``frames``.

    For each variable the mean and standard deviation of the rows with
    ``target == 1`` and of the rows with ``target == 0`` are taken once from
    ``stats_frame`` and then applied to every frame as::

        z = (value - class_mean) / (class_std / sqrt(<var>_freq))

    Three columns are written per variable: ``z_<var>_pos``, ``z_<var>_neg``
    and ``z_<var>_where`` (1 where ``|z_pos| > |z_neg|``, otherwise 0).

    Parameters
    ----------
    frames : iterable of pandas.DataFrame
        Frames to extend in place; each needs ``<var>`` and ``<var>_freq``.
        Existing z-score columns of the same name are overwritten.
    stats_frame : pandas.DataFrame
        Source of the class statistics; needs ``<var>`` and ``target``.
        It may also be one of ``frames``.
    var_names : list of str
        Raw feature names to process.

    Returns
    -------
    tuple of (list, list, list)
        Names of the created pos, neg and where columns, in ``var_names`` order.
    """
    is_pos = stats_frame.target == 1
    is_neg = stats_frame.target == 0
    pos_cols, neg_cols, where_cols = [], [], []
    for var in var_names:
        # The statistics depend only on stats_frame, so compute them once per
        # variable instead of once per frame and output column.
        pos_values = stats_frame.loc[is_pos, var]
        neg_values = stats_frame.loc[is_neg, var]
        pos_mean, pos_std = pos_values.mean(), pos_values.std()
        neg_mean, neg_std = neg_values.mean(), neg_values.std()

        pos_name, neg_name, where_name = f'z_{var}_pos', f'z_{var}_neg', f'z_{var}_where'
        for frame in frames:
            root_freq = np.sqrt(frame[f'{var}_freq'])
            frame[pos_name] = (frame[var] - pos_mean) / (pos_std / root_freq)
            frame[neg_name] = (frame[var] - neg_mean) / (neg_std / root_freq)
            frame[where_name] = np.where(frame[pos_name].abs() > frame[neg_name].abs(), 1, 0)

        pos_cols.append(pos_name)
        neg_cols.append(neg_name)
        where_cols.append(where_name)
    return pos_cols, neg_cols, where_cols


clfs = []
folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
oof_preds = np.zeros((len(train_df), 1))
test_preds = np.zeros((len(test_df), 1))


# Explicit copies: columns are added to these frames below, which must not
# write into (or depend on being a view of) train_df / test_df.
X = train_df[feats].copy()
y = train_df['target']
X_test = test_df[feats].copy()
test_ids = test_df.ID_code.values
# The target travels with X so each fold can compute class-conditional stats.
X['target'] = train_df['target']

parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    #'scale_pos_weight': 400,
    #'device' : 'gpu' ,
    'boosting': 'gbdt',
    'num_leaves': 4, #31
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 6,
    'learning_rate': 0.05, #0.05
    'verbose': 30,
    #'min_data_in_leaf': 200,
    #"max_depth" : -1,
}

feature_importance_df = pd.DataFrame()
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Current Fold: {}".format(fold_+1))
    # folds.split yields positions, so index positionally; plain y[trn_] would
    # look the numbers up as index labels.
    trn_x, trn_y = X.iloc[trn_, :].copy(), y.iloc[trn_]
    val_x, val_y = X.iloc[val_, :].copy(), y.iloc[val_]

    # Statistics come from this fold's training rows only and are applied to
    # the training, validation and test frames alike. X_test's z-score columns
    # are therefore overwritten on every fold.
    z_pos_cols, z_neg_cols, z_diff_cols = add_z_score_features(
        frames=(trn_x, val_x, X_test), stats_frame=trn_x, var_names=var_list)

    # Fold-local names: the notebook-level `feats` / `exclusion` are left alone
    # so this cell can be re-run (X = train_df[feats] must keep resolving).
    # Only the raw variables, min_freq and the *_neg z-scores reach the model.
    fold_exclusion = ['ID_code', 'target'] + z_pos_cols + z_diff_cols + freq_cols
    model_feats = [c for c in trn_x.columns if c not in fold_exclusion]

    trn_lgb = lgb.Dataset(trn_x[model_feats], trn_y)
    val_lgb = lgb.Dataset(val_x[model_feats], val_y)
    clf = lgb.train(parameters,
                     train_set=trn_lgb,
                     #valid_sets=[valid_data_lgb,holdout_data_lgb],
                     valid_sets=[trn_lgb, val_lgb],
                     num_boost_round=30000,
                     early_stopping_rounds=150,
                     verbose_eval=500)

    val_pred = clf.predict(val_x[model_feats])
    test_fold_pred = clf.predict(X_test[model_feats])

    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))

   # print('getting feature importance')

#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = model_feats
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


# Average the per-fold test predictions and score the out-of-fold predictions.
test_preds /= n_folds
roc_score = roc_auc_score(y, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))


print("Saving submission file")
sample = pd.read_csv('../data/sample_submission.csv')
# test_preds has shape (n, 1); flatten it so a plain 1-D column is stored.
sample['target'] = test_preds.ravel()
sample['ID_code'] = test_ids
sample.to_csv('../submissions/{}_{}.csv'.format(model_name,str(roc_score)), index=False)

#display_importances(feature_importance_df)


Current Fold: 1
Training until validation scores don't improve for 150 rounds.
[500]	training's auc: 0.894532	valid_1's auc: 0.879511
[1000]	training's auc: 0.91688	valid_1's auc: 0.899201
[1500]	training's auc: 0.92648	valid_1's auc: 0.906363
[2000]	training's auc: 0.932336	valid_1's auc: 0.909974
[2500]	training's auc: 0.936783	valid_1's auc: 0.912106
[3000]	training's auc: 0.940727	valid_1's auc: 0.913332
[3500]	training's auc: 0.944547	valid_1's auc: 0.913582
Early stopping, best iteration is:
[3413]	training's auc: 0.943908	valid_1's auc: 0.913737
AUC = 0.9137371492776521
Current Fold: 2
Training until validation scores don't improve for 150 rounds.
[500]	training's auc: 0.894796	valid_1's auc: 0.878697
[1000]	training's auc: 0.917264	valid_1's auc: 0.898466
[1500]	training's auc: 0.926872	valid_1's auc: 0.905674
[2000]	training's auc: 0.932654	valid_1's auc: 0.909418
[2500]	training's auc: 0.937209	valid_1's auc: 0.91094
[3000]	training's auc: 0.941194	valid_1's auc: 0.9114
Early

In [45]:
%%javascript
var nb = IPython.notebook;
var kernel = IPython.notebook.kernel;
var command = "NOTEBOOK_FULL_PATH = '" + nb.base_url + nb.notebook_path + "'";
kernel.execute(command);

<IPython.core.display.Javascript object>

In [46]:


# Archive a copy of this notebook under ../models, named after the model tag
# and the overall AUC. Only the file-name part of NOTEBOOK_FULL_PATH is used,
# so the source is resolved relative to the current working directory.
notebook_file = os.path.basename(NOTEBOOK_FULL_PATH)
snapshot_path = '../models/{}_{}.ipynb'.format(model_name, str(roc_score))
shutil.copyfile(notebook_file, snapshot_path)


'../models/z_score_freq_merged_real_only_5_folds_0.9142066009824421.ipynb'

In [57]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so this
# cell raises NameError (as its recorded output shows). It looks like a
# leftover scratch cell -- confirm and delete it.
a

NameError: name 'a' is not defined