# Tabular Playground Series - Feb 2022

In [None]:
import warnings
# Install an "ignore" filter so warnings raised from here on are not shown.
warnings.filterwarnings('ignore')
#https://stackoverflow.com/questions/9031783/hide-all-warnings-in-ipython
# Alternative kept for reference (disabled): report each warning once instead.
#warnings.filterwarnings(action='once')

In [None]:
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
# Apply the scikit-learn-intelex patch. It is called here, ahead of the
# scikit-learn imports in the following cell.
# NOTE(review): the sklearnex docs ask for this ordering — confirm it is kept
# if cells are rearranged.
patch_sklearn()

In [None]:
import seaborn as sns
import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns


from scipy.stats import mode
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Directory prefix of the competition CSVs when running on Kaggle.
KAGGLE_DIR = r'../input/tabular-playground-series-feb-2022/'
# Directory prefix used when KAGGLE is False (empty string = paths relative
# to the current working directory).
LOCAL_DIR = r''
# Chooses which of the two prefixes the data-loading cell reads from.
KAGGLE = True
# NOTE(review): presumably the random seed for the modelling steps; it is not
# referenced in this part of the notebook — confirm.
RS = 69420

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds are inclusive, so e.g. 127 still fits ``int8``).
    * Float columns are cast to ``float16`` or ``float32`` when their range
      fits, otherwise to ``float64``.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes, ...) is left untouched.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as ``float16``. ``float16`` keeps only about three
        significant decimal digits, so pass False to stop at ``float32``
        when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with ``.pipe``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns can be range-checked and
        # downcast safely; bool, datetime, categorical and extension dtypes
        # are skipped instead of being forced through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. an empty column whose min/max are NaN): keep as is.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range, infinite or all-NaN columns stay at full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
%%time
# The Kaggle and local runs differ only in where the CSVs live, so resolve the
# directory prefix once and load the three files from it.
data_dir = KAGGLE_DIR if KAGGLE else LOCAL_DIR

print(f"{'*'*10} Loading Training Data... {'*'*10}")
train = reduce_mem_usage(pd.read_csv(data_dir + 'train.csv', index_col=0))

print(f"{'*'*10} Loading Testing Data... {'*'*10}")
test = reduce_mem_usage(pd.read_csv(data_dir + 'test.csv', index_col=0))

# The sample submission keeps its first column as a regular column.
sub = reduce_mem_usage(pd.read_csv(data_dir + 'sample_submission.csv'))

## Remove Duplicated Rows

In [None]:
#https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364

## Feature Engineering - for each Row, produce Total A, T, C, G

- For each column, produce 4 new columns: the column's value multiplied by each of the four integers in its name (assigned, in order, to A, T, G and C)

In [None]:
# Feature names: every column of train except the label.
columnss = list(train.columns)
columnss.remove('target')

# Keep the first feature name (and display it) to prototype the name parsing.
sandbox = columnss[0]
sandbox

In [None]:
import re

# Pull the runs of digits out of the first feature's name.
a = re.findall(r'\d+', sandbox)

# Scale the first feature by each of those integers; the first four integers
# are assigned, in order, to A, T, G and C.
first_feature = train[columnss[0]]
for slot, nucleotide in enumerate('ATGC'):
    train['column_1_' + nucleotide] = first_feature * int(a[slot])

In [None]:
# For every feature of train, add four columns holding the feature's value
# multiplied by each of the four integers found in the feature's name.
for position, sandbox in enumerate(columnss, start=1):
    a = re.findall(r'\d+', sandbox)
    stringg = str(position)
    feature_values = train[sandbox]
    # The integers are assigned, in order, to A, T, G and C.
    for slot, nucleotide in enumerate('ATGC'):
        train['column_' + stringg + '_' + nucleotide] = feature_values * int(a[slot])

In [None]:
# Same expansion for the test set: four columns per feature, each holding the
# feature's value multiplied by one of the integers in the feature's name.
for position, sandbox in enumerate(columnss, start=1):
    a = re.findall(r'\d+', sandbox)
    stringg = str(position)
    feature_values = test[sandbox]
    # The integers are assigned, in order, to A, T, G and C.
    for slot, nucleotide in enumerate('ATGC'):
        test['column_' + stringg + '_' + nucleotide] = feature_values * int(a[slot])

### Sum A Columns Together, T Columns Together, etc

In [None]:
# Names of the 286 per-feature "A" columns: column_1_A ... column_286_A.
A_Columns = [f'column_{position}_A' for position in range(1, 287)]

In [None]:
# Names of the 286 per-feature "T" columns: column_1_T ... column_286_T.
T_Columns = [f'column_{position}_T' for position in range(1, 287)]

In [None]:
# Names of the 286 per-feature "C" columns: column_1_C ... column_286_C.
C_Columns = [f'column_{position}_C' for position in range(1, 287)]

In [None]:
# Names of the 286 per-feature "G" columns: column_1_G ... column_286_G.
G_Columns = [f'column_{position}_G' for position in range(1, 287)]

#### Train Feature Engineering

In [None]:
# Row-wise total of each nucleotide's per-feature columns (order A, T, C, G).
for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                    ('C', C_Columns), ('G', G_Columns)):
    train[nucleotide + '_sum'] = train[nucleotide_cols].sum(axis=1)

In [None]:
# Row-wise mean of each nucleotide's per-feature columns (order A, T, C, G).
for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                    ('C', C_Columns), ('G', G_Columns)):
    train[nucleotide + '_mean'] = train[nucleotide_cols].mean(axis=1)

In [None]:
# Row-wise standard deviation of each nucleotide's per-feature columns
# (order A, T, C, G).
for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                    ('C', C_Columns), ('G', G_Columns)):
    train[nucleotide + '_std'] = train[nucleotide_cols].std(axis=1)

In [None]:
# Row-wise median of each nucleotide's per-feature columns (order A, T, C, G).
for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                    ('C', C_Columns), ('G', G_Columns)):
    train[nucleotide + '_med'] = train[nucleotide_cols].median(axis=1)

In [None]:
# Row-wise lower-tail quantiles (1%, 5%, 10%) per nucleotide. Columns are
# created quantile by quantile, each in the order A, T, C, G.
for quantile_label, quantile_level in (('q01', 0.01), ('q05', 0.05), ('q10', 0.10)):
    for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                        ('C', C_Columns), ('G', G_Columns)):
        train[nucleotide + '_' + quantile_label] = train[nucleotide_cols].quantile(
            q=quantile_level, axis=1)

In [None]:
#     df['tails'] = df['range'] / df['iqr']
#     df['dispersion'] = df['std'] / df['mean']
#     df['dispersion_2'] = df['iqr'] / df['median']

In [None]:
# Every feature family below is created in the order A, T, C, G, one family
# after another, so the resulting column order matches a line-by-line version.
nucleotide_groups = (('A', A_Columns), ('T', T_Columns),
                     ('C', C_Columns), ('G', G_Columns))

# Row-wise quantiles.
for quantile_label, quantile_level in (('q25', 0.25), ('q75', 0.75), ('q90', 0.90),
                                       ('q95', 0.95), ('q99', 0.99),
                                       ('q40', 0.40), ('q60', 0.60)):
    for nucleotide, nucleotide_cols in nucleotide_groups:
        train[nucleotide + '_' + quantile_label] = train[nucleotide_cols].quantile(
            q=quantile_level, axis=1)

# Row-wise maximum, minimum and skewness.
# (Row-wise kurtosis via `.kurt(axis=1)` is currently disabled.)
for stat_name in ('max', 'min', 'skew'):
    for nucleotide, nucleotide_cols in nucleotide_groups:
        train[nucleotide + '_' + stat_name] = getattr(train[nucleotide_cols], stat_name)(axis=1)

# Differences between two previously computed statistics:
# <nucleotide>_<name> = <nucleotide>_<minuend> - <nucleotide>_<subtrahend>.
for spread_name, minuend, subtrahend in (('range', 'max', 'min'),
                                         ('iqr', 'q75', 'q25'),
                                         ('median_max', 'med', 'max'),
                                         ('median_min', 'med', 'min'),
                                         ('q99-q95', 'q99', 'q95'),
                                         ('q01-q10', 'q01', 'q10'),
                                         ('q01-q05', 'q01', 'q05'),
                                         ('q99-q90', 'q99', 'q90')):
    for nucleotide in 'ATCG':
        train[nucleotide + '_' + spread_name] = (
            train[nucleotide + '_' + minuend] - train[nucleotide + '_' + subtrahend])

#### Test Feature Engineering

In [None]:
# Row-wise sum, mean, standard deviation and median per nucleotide for the
# test set. Columns are created statistic by statistic, each in the order
# A, T, C, G.
for stat_suffix, method_name in (('sum', 'sum'), ('mean', 'mean'),
                                 ('std', 'std'), ('med', 'median')):
    for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                        ('C', C_Columns), ('G', G_Columns)):
        test[nucleotide + '_' + stat_suffix] = getattr(test[nucleotide_cols], method_name)(axis=1)

In [None]:
# Row-wise lower-tail quantiles (1%, 5%, 10%) per nucleotide for the test set.
# Columns are created quantile by quantile, each in the order A, T, C, G.
for quantile_label, quantile_level in (('q01', 0.01), ('q05', 0.05), ('q10', 0.10)):
    for nucleotide, nucleotide_cols in (('A', A_Columns), ('T', T_Columns),
                                        ('C', C_Columns), ('G', G_Columns)):
        test[nucleotide + '_' + quantile_label] = test[nucleotide_cols].quantile(
            q=quantile_level, axis=1)

In [None]:
# Every feature family below is created in the order A, T, C, G, one family
# after another, so the resulting column order matches a line-by-line version.
nucleotide_groups = (('A', A_Columns), ('T', T_Columns),
                     ('C', C_Columns), ('G', G_Columns))

# Row-wise quantiles.
for quantile_label, quantile_level in (('q25', 0.25), ('q75', 0.75), ('q90', 0.90),
                                       ('q95', 0.95), ('q99', 0.99),
                                       ('q40', 0.40), ('q60', 0.60)):
    for nucleotide, nucleotide_cols in nucleotide_groups:
        test[nucleotide + '_' + quantile_label] = test[nucleotide_cols].quantile(
            q=quantile_level, axis=1)

# Row-wise maximum, minimum and skewness.
# (Row-wise kurtosis via `.kurt(axis=1)` is currently disabled.)
for stat_name in ('max', 'min', 'skew'):
    for nucleotide, nucleotide_cols in nucleotide_groups:
        test[nucleotide + '_' + stat_name] = getattr(test[nucleotide_cols], stat_name)(axis=1)

# Differences between two previously computed statistics:
# <nucleotide>_<name> = <nucleotide>_<minuend> - <nucleotide>_<subtrahend>.
for spread_name, minuend, subtrahend in (('range', 'max', 'min'),
                                         ('iqr', 'q75', 'q25'),
                                         ('median_max', 'med', 'max'),
                                         ('median_min', 'med', 'min'),
                                         ('q99-q95', 'q99', 'q95'),
                                         ('q01-q10', 'q01', 'q10'),
                                         ('q01-q05', 'q01', 'q05'),
                                         ('q99-q90', 'q99', 'q90')):
    for nucleotide in 'ATCG':
        test[nucleotide + '_' + spread_name] = (
            test[nucleotide + '_' + minuend] - test[nucleotide + '_' + subtrahend])

In [None]:
# Remove the per-feature A/T/C/G columns from train.
train = train.drop(columns=[*A_Columns, *T_Columns, *C_Columns, *G_Columns])

In [None]:
# Remove the per-feature A/T/C/G columns from test.
test = test.drop(columns=[*A_Columns, *T_Columns, *C_Columns, *G_Columns])

In [None]:
#train.columns.to_list()

### Label Encode Train's Target Variable

In [None]:
# Fit a label encoder on the target column and replace the labels with their
# integer codes; the fitted encoder is kept so predictions can be decoded
# later with inverse_transform.
target_encoder = LabelEncoder().fit(train["target"])
train["target"] = target_encoder.transform(train["target"])

# Split into the encoded label vector and the feature matrix.
y = train["target"]
X = train.drop(columns="target")

##### Look at remaining Columns

In [None]:
#X.columns.to_list()

In [None]:
# Snapshot of train's current column names as a plain Python list.
col_list = list(train.columns)

In [None]:
# Keep every column name from position 287 onward.
# NOTE(review): the offset 287 is hard-coded and col_list is taken from `train`
# (which still contains 'target'); confirm it still matches the column layout
# after the drop above, since train_cols is later used to index X and test.
train_cols=col_list[287:]

### Target 0, 1, 2, 3's distributions of A T C G Sums

In [None]:
#https://stackoverflow.com/questions/37911731/seaborn-histogram-with-4-panels-2-x-2-in-python
#https://seaborn.pydata.org/tutorial/axis_grids.html
#https://seaborn.pydata.org/tutorial/color_palettes.html

In [None]:
# 2x2 grid of histograms of the A_sum / T_sum / C_sum / G_sum columns,
# restricted to encoded target classes 0-3 and coloured by class.
rows_for_classes = train[train['target'].isin([0, 1, 2, 3])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_sum", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

### Target 4, 5, 6, 7's distributions of A T C G Sums

In [None]:
# 2x2 grid of histograms of the A_sum / T_sum / C_sum / G_sum columns,
# restricted to encoded target classes 4-7 and coloured by class.
rows_for_classes = train[train['target'].isin([4, 5, 6, 7])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_sum", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

### Target 0, 1, 2, 3's distributions of A T C G Means

In [None]:
# 2x2 grid of histograms of the A_mean / T_mean / C_mean / G_mean columns,
# restricted to encoded target classes 0-3 and coloured by class.
rows_for_classes = train[train['target'].isin([0, 1, 2, 3])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_mean", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

### Target 4, 5, 6, 7's distributions of A T C G Means

In [None]:
# 2x2 grid of histograms of the A_mean / T_mean / C_mean / G_mean columns,
# restricted to encoded target classes 4-7 and coloured by class.
rows_for_classes = train[train['target'].isin([4, 5, 6, 7])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_mean", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

### Target 0, 1, 2, 3's distributions of A T C G Stds

In [None]:
# 2x2 grid of histograms of the A_std / T_std / C_std / G_std columns,
# restricted to encoded target classes 0-3 and coloured by class.
rows_for_classes = train[train['target'].isin([0, 1, 2, 3])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_std", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

### Target 4, 5, 6, 7's distributions of A T C G Stds

In [None]:
# 2x2 grid of histograms of the A_std / T_std / C_std / G_std columns,
# restricted to encoded target classes 4-7 and coloured by class.
rows_for_classes = train[train['target'].isin([4, 5, 6, 7])]
nucleotide_panels = [("A", "Adenine"), ("T", "Thymine"), ("C", "Cytosine"), ("G", "Guanine")]

for panel_no, (letter, nucleotide) in enumerate(nucleotide_panels, start=1):
    plt.subplot(2, 2, panel_no)
    sns.histplot(data=rows_for_classes, x=f"{letter}_std", hue="target")
    plt.title(f"Distribution of {nucleotide}")
    plt.xlabel(nucleotide)
    plt.ylabel("Count")

plt.gcf().set_size_inches(18, 15)
plt.show()

# Model training

##### Ensure there aren't Null Values in any newly engineered features

In [None]:
# Preview the first rows of the train_cols subset of the training features.
X.loc[:, train_cols].head()

In [None]:
# Two-column frame: each feature name of X alongside its count of missing values.
dff = X.isna().sum().rename_axis('name').reset_index(name='null_ct')

In [None]:
# How many columns of X share each null count (a single entry at 0 means no missing values).
dff['null_ct'].value_counts()

In [None]:
# List only the columns of X that contain at least one missing value.
dff.loc[dff['null_ct'] > 0]

In [None]:
# Preview the first rows of the train_cols subset of the test set.
test.loc[:, train_cols].head()

In [None]:
# Overall largest and smallest value across the train_cols subset of test.
# A notebook cell only echoes its final expression, so with the two
# expressions on separate lines the maximum was computed and silently
# discarded; returning them together as a (max, min) tuple shows both.
test[train_cols].max().max(), test[train_cols].min().min()

In [None]:
# Total number of missing values across every column of test.
test.isna().sum().sum()

In [None]:
# Two-column frame: each column name of test alongside its count of missing values.
dfff = test.isna().sum().rename_axis('name').reset_index(name='null_ct')

In [None]:
# List only the columns of test that contain at least one missing value.
dfff.loc[dfff['null_ct'] > 0]

### ExtraTreesClassifier training

In [None]:
# 5-fold stratified cross-validation of an ExtraTreesClassifier.
# For every fold the validation accuracy is stored in `scores` and the class
# probabilities predicted for the full test set are stored in `y_probs`
# (they are averaged across folds in a later cell).
y_probs = []
scores = []

# Seed the shuffle with the notebook-wide RS constant so the fold assignment,
# and therefore the reported accuracies, are reproducible between runs.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RS)

estimators = 2000
for fold, (train_id, test_id) in enumerate(folds.split(X, y)):
    # All columns of X are used; index X (and test below) with a column
    # subset such as train_cols to experiment with fewer features.
    X_train, y_train = X.iloc[train_id], y.iloc[train_id]
    X_valid, y_valid = X.iloc[test_id], y.iloc[test_id]

    model = ExtraTreesClassifier(
        n_estimators=estimators,
        n_jobs=-1,
        random_state=RS,  # seeded for the same reproducibility reason as the folds
    )
    model.fit(X_train, y_train)

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)
    print("Fold:", fold + 1, "Accuracy:", valid_score)
    scores.append(valid_score)

    # Save this fold's test-set probabilities to later submit the mean values.
    y_probs.append(model.predict_proba(test))

    # Each successive fold trains a slightly larger forest (+10 trees).
    estimators = estimators + 10

In [None]:
# Average validation accuracy over all folds.
print("Mean accuracy score:", np.mean(scores))

In [None]:
# Average the per-fold test-set probabilities into a single matrix.
y_prob = sum(y_probs) / len(y_probs)
# The explanations for these numbers are in AMBROSM's code
#y_prob += np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])

# Pick the most probable class per row and decode it back to the original label.
predicted_codes = y_prob.argmax(axis=1)
y_pred_tuned = target_encoder.inverse_transform(predicted_codes)

# Share of test rows assigned to each predicted label, in percent.
predicted_label_counts = pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index()
predicted_label_counts / len(test) * 100

#### ExtraTreesClassifier - Submitting results

In [None]:
# Put the decoded predictions into the `target` column of `sub` and write the
# frame to CSV without the index column.
# NOTE(review): `sub` is presumably the sample-submission frame loaded earlier
# in the notebook - confirm its row order matches `test`.
sub["target"] = y_pred_tuned
sub.to_csv("X_trees_4_Features_submission.csv", index=False)

### RandomForestClassifier  training

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# y_probs = []
# scores = []

# folds = StratifiedKFold(n_splits=5, shuffle=True)

# estimators = 1000
# for fold, (train_id, test_id) in enumerate(folds.split(X, y)):  
#     #X_train = X[['A_sum', 'T_sum', 'C_sum', 'G_sum','A_std','T_std','C_std','G_std','A_mean','T_mean','C_mean','G_mean']].iloc[train_id]
#     #X_train = X[train_cols].iloc[train_id]
#     X_train = X.iloc[train_id]
#     y_train = y.iloc[train_id]
#     #X_valid = X[['A_sum', 'T_sum', 'C_sum', 'G_sum','A_std','T_std','C_std','G_std','A_mean','T_mean','C_mean','G_mean']].iloc[test_id]
#     #X_valid = X[train_cols].iloc[test_id]
#     X_valid = X.iloc[test_id]
#     y_valid = y.iloc[test_id]
    
#     model = RandomForestClassifier(
#         n_estimators=estimators,
#         n_jobs=-1
#     )

#     model.fit(X_train, y_train)
    
#     valid_pred = model.predict(X_valid)
#     valid_score = accuracy_score(y_valid, valid_pred)
    
#     print("Fold:", fold + 1, "Accuracy:", valid_score)
    
#     scores.append(valid_score)
    
#     # Save predictions to later submit the mean values
#     #if submission: 
#     #y_probs.append(model.predict_proba(test[['A_sum', 'T_sum', 'C_sum', 'G_sum','A_std','T_std','C_std','G_std','A_mean','T_mean','C_mean','G_mean']]))
#     #y_probs.append(model.predict_proba(test[train_cols]))
#     y_probs.append(model.predict_proba(test))
    
#     estimators = estimators + 100

In [None]:
#print("Mean accuracy score:", np.array(scores).mean())

In [None]:
# y_prob = sum(y_probs) / len(y_probs)
# # The explanations for these numbers are in AMBROSM's code
# #y_prob += np.array([0, 0, 0.01, 0.03, 0, 0, 0, 0, 0, 0])
# y_pred_tuned = target_encoder.inverse_transform(np.argmax(y_prob, axis=1))
# pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100

#### RF - Submitting results

In [None]:
# sub["target"] = y_pred_tuned
# sub.to_csv("RF_4_Features_submission.csv", index=False)

### Logistic Regression - GridSearchCV

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# import time
# t_start = time.time()

# #default max iters and C=9.6 gives 0.428224 accuracy

# #max_iter=500 and C': 19.21 gives 0.434685 accuracy

# param_grid = {'C':[.01,1,10, 100, 1000, 2500]
#              }

# clf = LogisticRegression(random_state=0, max_iter=1000)
# LR_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=1)

# #LR_search.fit(X[['A_sum', 'T_sum', 'C_sum', 'G_sum','A_std','T_std','C_std','G_std','A_mean','T_mean','C_mean','G_mean']], y)
# #LR_search.fit(X[train_cols], y)
# LR_search.fit(X, y)
# #search.fit(X_encoded, y) #since we're using CV, don't use train, use entire data set



# t_stop = time.time()
# print('Time elapsed: {:.3f} seconds'.format(t_stop - t_start))

In [None]:
#LR_search.best_params_

In [None]:
#LR_search.best_estimator_

In [None]:
#LR_search.best_score_

In [None]:
#y_pred_test=LR_search.best_estimator_.predict(test[['A_sum', 'T_sum', 'C_sum', 'G_sum','A_std','T_std','C_std','G_std','A_mean','T_mean','C_mean','G_mean']])
#y_pred_test=LR_search.best_estimator_.predict(test[train_cols])
#y_pred_test=LR_search.best_estimator_.predict(test)

In [None]:
#y_pred_tuned = target_encoder.inverse_transform(y_pred_test)

#### Logistic Regression - Submitting results

In [None]:
# sub["target"] = y_pred_tuned
# sub.to_csv("LR_4_Features_submission.csv", index=False)

# Submitting results

In [None]:
# sub["target"] = y_pred_tuned
# sub.to_csv("RF_4_Features_submission.csv", index=False)