In [None]:
import warnings
# Silence every Python warning for the rest of the session, including any
# raised by the libraries used further down.
warnings.filterwarnings('ignore')

In [None]:
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

import re

from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Directory prefix of the competition CSV files when running on Kaggle.
KAGGLE_DIR = r'../input/tabular-playground-series-feb-2022/'
# Directory prefix of the same CSV files when running locally
# (empty string = current working directory).
LOCAL_DIR = r''
# Selects which of the two prefixes the data-loading cell uses.
KAGGLE = True
# Random seed for reproducibility; presumably meant to be passed as
# `random_state` to the stochastic steps — verify it is actually used.
RS = 69420

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and print the memory saved.

    * ``object`` columns become ``category``.
    * Signed / unsigned integer columns become the narrowest integer type of
      the same signedness whose range contains the column's min and max.
    * Float columns become ``float16`` or ``float32`` when their min and max
      fit, else ``float64``. The choice looks at the value range only, so the
      narrower float types can lose precision.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can be used with ``.pipe``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'MEMORY USAGE: {start_mem:.2f} MB')

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_dtype = df[col].dtype

        if col_dtype == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast; min/max range
        # checks are meaningless (or raise) for anything else.
        if not isinstance(col_dtype, np.dtype) or col_dtype.kind not in 'iuf':
            continue

        col_min = df[col].min()
        col_max = df[col].max()

        if col_dtype.kind == 'f':
            # Fallback matches an all-NaN / infinite column: comparisons
            # against NaN or inf are False, so nothing narrower is chosen.
            new_dtype = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if col_min >= limits.min and col_max <= limits.max:
                    new_dtype = candidate
                    break
        else:
            # None means "no candidate fits" (e.g. an empty column whose
            # min/max are NaN); the column then keeps its dtype.
            new_dtype = None
            candidates = signed_types if col_dtype.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit fits.
                if col_min >= limits.min and col_max <= limits.max:
                    new_dtype = candidate
                    break

        if new_dtype is not None:
            df[col] = df[col].astype(new_dtype)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'MEMORY USAGE: {end_mem:.2f} MB')
    # Avoid a 0/0 when the frame occupies no memory at all.
    saved_pct = (start_mem - end_mem) / start_mem * 100 if start_mem else 0.0
    print(f'DECREASED BY: {saved_pct}')

    return df

In [None]:
%%time

# Resolve the directory prefix once; the three reads are otherwise identical
# for the Kaggle and the local environment.
data_dir = KAGGLE_DIR if KAGGLE else LOCAL_DIR

print(f"{'*'*10}LOADING TRAIN DATA{'*'*10}")
train = pd.read_csv(data_dir + 'train.csv', index_col=0).pipe(reduce_mem_usage)
print(f"{'*'*10}LOADING TEST DATA{'*'*10}")
test = pd.read_csv(data_dir + 'test.csv', index_col=0).pipe(reduce_mem_usage)
# The first CSV column becomes the index of the submission template.
sub = pd.read_csv(data_dir + 'sample_submission.csv', index_col=0).pipe(reduce_mem_usage)

In [None]:
# Names of the feature columns: every column of `train` except the label.
columns_tolist = train.columns.tolist()
columns_tolist.remove('target')

# Suffix letters for the engineered columns. The order matters: each letter is
# paired by position with the integers parsed out of a feature column's name.
nucleotides = ['A', 'T', 'G', 'C']

def feature_engineering(df):
    """Add one weighted copy of every feature column per nucleotide, in place.

    For the i-th name in ``columns_tolist`` the integers embedded in the name
    are extracted in order of appearance. The j-th integer is paired with the
    j-th entry of ``nucleotides``, and a new column named
    ``f"{i}{nucleotide}"`` is set to the original column multiplied by that
    integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``columns_tolist``.
        It is modified in place; nothing is returned.

    Raises
    ------
    ValueError
        If a column name contains fewer integers than there are nucleotides.
    """
    for i, col in enumerate(columns_tolist):
        counts = [int(digits) for digits in re.findall(r'\d+', col)]
        if len(counts) < len(nucleotides):
            raise ValueError(
                f'column {col!r} contains {len(counts)} integer(s); '
                f'expected at least {len(nucleotides)}'
            )
        prefix = str(i)
        # zip stops at the shorter sequence, so surplus integers are ignored.
        for nucleotide, count in zip(nucleotides, counts):
            df[prefix + nucleotide] = df[col] * count

feature_engineering(train)
feature_engineering(test)

In [None]:
def statistics(df):
    """Add row-wise summary statistics per nucleotide to ``df`` in place.

    For every letter in ``nucleotides`` the columns of ``df`` whose name ends
    with that letter are selected, and the following columns, prefixed with
    the letter, are appended in this order: sum, mean, std, median, ten
    quantiles, max, min, skew, then eight differences between those values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-nucleotide columns. It is modified in place;
        nothing is returned.
    """
    # Insertion order is kept exactly as listed because later cells select
    # the resulting columns by position.
    quantile_levels = {
        'q01': 0.01, 'q05': 0.05, 'q10': 0.10, 'q25': 0.25, 'q75': 0.75,
        'q90': 0.90, 'q95': 0.95, 'q99': 0.99, 'q40': 0.40, 'q60': 0.60,
    }

    for nucleotide in nucleotides:
        # Select from the frame being processed, not from the global `train`.
        math_feats = [col for col in df.columns if col[-1] == nucleotide]
        # Taken once per nucleotide; the columns added below never end in a
        # nucleotide letter, so they would not have been selected anyway.
        values = df[math_feats]

        df[nucleotide + '_sum'] = values.sum(axis=1)
        df[nucleotide + '_mean'] = values.mean(axis=1)
        df[nucleotide + '_std'] = values.std(axis=1)
        df[nucleotide + '_median'] = values.median(axis=1)

        for label, level in quantile_levels.items():
            df[nucleotide + '_' + label] = values.quantile(q=level, axis=1)

        df[nucleotide + '_max'] = values.max(axis=1)
        df[nucleotide + '_min'] = values.min(axis=1)
        df[nucleotide + '_skew'] = values.skew(axis=1)

        df[nucleotide + '_range'] = df[nucleotide + '_max'] - df[nucleotide + '_min']
        df[nucleotide + '_iqr'] = df[nucleotide + '_q75'] - df[nucleotide + '_q25']
        df[nucleotide + '_median_max'] = df[nucleotide + '_median'] - df[nucleotide + '_max']
        df[nucleotide + '_median_min'] = df[nucleotide + '_median'] - df[nucleotide + '_min']
        df[nucleotide + '_q99_q95'] = df[nucleotide + '_q99'] - df[nucleotide + '_q95']
        df[nucleotide + '_q01_q10'] = df[nucleotide + '_q01'] - df[nucleotide + '_q10']
        df[nucleotide + '_q01_q05'] = df[nucleotide + '_q01'] - df[nucleotide + '_q05']
        df[nucleotide + '_q99_q90'] = df[nucleotide + '_q99'] - df[nucleotide + '_q90']

statistics(train)
statistics(test)

In [None]:
def drop_columns(df):
    """Return ``df`` without the columns whose name ends in a nucleotide letter.

    The input frame is not modified; a new frame is returned.
    """
    # One pass over the column names instead of one drop per nucleotide.
    per_nucleotide_cols = [name for name in df.columns if name[-1] in nucleotides]
    return df.drop(columns=per_nucleotide_cols)

train = drop_columns(train)
test = drop_columns(test)

In [None]:
# Show the column names from position 287 onward — presumably the engineered
# statistics that follow the original features and 'target' (TODO confirm offset).
train.columns[287:]

In [None]:
# Same peek for the test frame; the offset is one lower, presumably because
# `test` has no 'target' column (TODO confirm offset).
test.columns[286:]

In [None]:
# Replace the label column with integer codes; the fitted encoder is kept so
# predictions can be mapped back to the original labels later.
target_encoder = LabelEncoder()
train['target'] = target_encoder.fit_transform(train['target'])

# Label vector and feature matrix (all columns except the label).
y = train['target']
X = train.drop(columns=['target'])

In [None]:
def plot_nucleo_distrib(statistic, targets):
    """Draw a 2x2 grid of histograms of one statistic, one panel per nucleotide.

    Parameters
    ----------
    statistic : str
        Suffix of the column to plot; panel columns are ``'<letter>_<statistic>'``
        where the letter is the first character of the nucleotide name.
    targets : list
        Encoded target values to keep; rows of the global ``train`` frame with
        other targets are excluded. Bars are coloured by target.
    """
    # The row filter is the same for all four panels, so apply it once.
    selected_rows = train[train['target'].isin(targets)]

    full_names = ('Adenine', 'Thymine', 'Guanine', 'Cytosine')
    for panel, full_name in enumerate(full_names, start=1):
        plt.subplot(2, 2, panel)
        sns.histplot(data=selected_rows, x=full_name[0] + '_' + statistic, hue='target')
        plt.title(f'DISTRIBUTION OF {full_name}')
        plt.xlabel(full_name)
        plt.ylabel('Count')

    figure = plt.gcf()
    figure.set_size_inches(18, 15)
    plt.show()

In [None]:
plot_nucleo_distrib('sum', [0, 1, 2, 3])

In [None]:
plot_nucleo_distrib('sum', [4, 5, 6, 7])

In [None]:
plot_nucleo_distrib('mean', [0, 1, 2, 3])

In [None]:
plot_nucleo_distrib('mean', [4, 5, 6, 7])

In [None]:
plot_nucleo_distrib('std', [0, 1, 2, 3])

In [None]:
plot_nucleo_distrib('std', [4, 5, 6, 7])

In [None]:
cols_tolist = train.columns.tolist()
# Model features are the columns added after loading: everything that is
# neither an original feature (`columns_tolist`) nor the label. Deriving them
# by name avoids a hard-coded positional offset that silently breaks if the
# number or order of the original columns changes.
_original_cols = set(columns_tolist) | {'target'}
train_cols = [col for col in cols_tolist if col not in _original_cols]

In [None]:
X[train_cols].head()

In [None]:
X_isna_sum = X.isna().sum().to_frame().reset_index()
X_isna_sum

In [None]:
X_isna_sum.columns = ['nucleotide', 'null_count']
X_isna_sum

In [None]:
X_isna_sum['null_count'].value_counts()

In [None]:
test[train_cols].head()

In [None]:
test[train_cols].max().max()

In [None]:
test[train_cols].min().min()

In [None]:
test_isna_sum = test.isna().sum().to_frame().reset_index()
test_isna_sum.columns = ['nucleotide', 'null_count']
test_isna_sum

In [None]:
test_isna_sum.null_count.value_counts()

In [None]:
test_isna_sum[test_isna_sum['null_count'] > 0]

In [None]:
# 5-fold stratified CV with an ExtraTrees model per fold. Each fold's model
# also predicts class probabilities for the test set so they can be averaged.
scores = []
y_probs = []
estimators = 500

# Seeded so the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RS)

# The column selection does not depend on the fold, so do it once.
X_selected = X[train_cols]
test_selected = test[train_cols]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X_selected.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X_selected.iloc[val_idx], y.iloc[val_idx]
    
    # Seeded so the randomised tree construction is reproducible.
    model = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1, random_state=RS)
    model.fit(X_train, y_train)
    
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)
    
    print(f'FOLD: {fold + 1}, ACCURACY: {valid_score}')
    
    scores.append(valid_score)
    y_probs.append(model.predict_proba(test_selected))
    
    # Each successive fold trains a slightly larger forest.
    estimators += 10

In [None]:
print(f'MEAN ACCURACY SCORE: {np.array(scores).mean()}')

In [None]:
# Soft voting: average the per-fold class-probability matrices.
y_prob = sum(y_probs) / len(y_probs)
# The column index of the highest averaged probability is used directly as the
# encoded label and mapped back to the original target labels.
y_pred_tuned = target_encoder.inverse_transform(np.argmax(y_prob, axis=1))
# Percentage of test rows assigned to each predicted label.
pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100

In [None]:
sub['target'] = y_pred_tuned
# `sub` was loaded with index_col=0, so the id column of the sample submission
# lives in the index. It must be written out (pandas' default); index=False
# would produce a file containing only the 'target' column.
sub.to_csv('extra_trees_four_features_01.csv')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified CV with a RandomForest model per fold. Each fold's model
# also predicts class probabilities for the test set so they can be averaged.
scores = []
y_probs = []
estimators = 1000

# Seeded so the fold assignment is the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RS)

# The column selection does not depend on the fold, so do it once.
X_selected = X[train_cols]
test_selected = test[train_cols]

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X_selected.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X_selected.iloc[val_idx], y.iloc[val_idx]
    
    # Seeded so bootstrapping and feature sampling are reproducible.
    model = RandomForestClassifier(n_estimators=estimators, n_jobs=-1, random_state=RS)
    model.fit(X_train, y_train)
    
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)
    
    print(f'FOLD: {fold + 1}, ACCURACY: {valid_score}')
    
    scores.append(valid_score)
    y_probs.append(model.predict_proba(test_selected))
    
    # Each successive fold trains a larger forest.
    estimators += 100

In [None]:
print(f'MEAN ACCURACY SCORE: {np.array(scores).mean()}')

In [None]:
# Soft voting: average the per-fold class-probability matrices.
y_prob = sum(y_probs) / len(y_probs)
# The column index of the highest averaged probability is used directly as the
# encoded label and mapped back to the original target labels.
y_pred_tuned = target_encoder.inverse_transform(np.argmax(y_prob, axis=1))
# Percentage of test rows assigned to each predicted label.
pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100

In [None]:
sub['target'] = y_pred_tuned
# `sub` was loaded with index_col=0, so the id column of the sample submission
# lives in the index. It must be written out (pandas' default); index=False
# would produce a file containing only the 'target' column.
sub.to_csv('rand_forest_four_feat_01.csv')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import time

# Wall-clock timer around the whole search (setup included).
time_start = time.time()

# Candidate values for the `C` parameter of LogisticRegression.
param_grid = {'C': [.01, 1, 10, 100, 1000, 2500]}

clf = LogisticRegression(random_state=0, max_iter=1000)

# 5-fold cross-validated search over `param_grid`, run in a single process.
LR_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=1)
LR_search.fit(X[train_cols], y)

time_stop = time.time()
print(f'TIME ELAPSED: {time_stop - time_start:.3f} seconds')

In [None]:
LR_search.best_params_

In [None]:
LR_search.best_estimator_

In [None]:
LR_search.best_score_

In [None]:
y_pred_test = LR_search.best_estimator_.predict(test[train_cols])

In [None]:
y_pred_tuned = target_encoder.inverse_transform(y_pred_test)

In [None]:
sub['target'] = y_pred_tuned
# `sub` was loaded with index_col=0, so the id column of the sample submission
# lives in the index. It must be written out (pandas' default); index=False
# would produce a file containing only the 'target' column.
sub.to_csv('log_regr_four_feat_01.csv')