In [None]:
!pip install scikit-learn-intelex -q

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import gc
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

#Modeling
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier

from sklearnex import patch_sklearn
patch_sklearn()

## *EDA & Feature Engineering*

Reducing memory

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to the smallest fitting dtype.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are narrowed to the smallest integer
      type of the same signedness whose range contains the column's min/max
      (bounds are inclusive, so e.g. -128..127 fits ``int8``).
    * Float columns are narrowed to ``float16`` or ``float32`` when the value
      range fits, otherwise stored as ``float64``.
    * Every other dtype (bool, datetime, existing categoricals, nullable
      extension dtypes, ...) is left untouched instead of being coerced to
      float or raising on ``min()``/``max()``.

    Memory usage before/after is printed in MB.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for use with ``DataFrame.pipe``.

    Notes
    -----
    ``float16`` keeps only ~3 significant decimal digits; the range check
    guards against overflow, not against loss of precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
%%time
print(f"{'*'*10} Loading Training Data... {'*'*10}")
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv", index_col=0).pipe(reduce_mem_usage)
print(f"{'*'*10} Loading Testing Data... {'*'*10}")
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv", index_col=0).pipe(reduce_mem_usage)
sub = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv').pipe(reduce_mem_usage)

In [None]:
%%time
train.head()

In [None]:
%%time
test.head()

In [None]:
# Report size and total missing-value count for train, then test.
for frame in (train, test):
    n_rows, n_cols = frame.shape
    n_missing = sum(frame.isna().sum())
    print(f'Number of rows: {n_rows};  Number of columns: {n_cols}; No of missing values: {n_missing}')

In [None]:
# Learn the label -> integer mapping, then replace the labels with their codes.
# The fitted encoder is kept so predictions can be mapped back to labels later.
target_encoder = LabelEncoder().fit(train["target"])
train["target"] = target_encoder.transform(train["target"])

In [None]:
# Class balance of the (encoded) target.
plt.rcParams['figure.dpi'] = 100
# Pass the variable by keyword: the first positional argument of countplot is
# `data` in current seaborn releases, not `x`.
sns.countplot(x=train['target'])
plt.show()

## *Variance Threshold, checking for constant cols*

In [None]:
# Every column except the row identifier and the label is a candidate feature.
excluded_columns = ("row_id", 'target')
useful_features = [name for name in train.columns if name not in excluded_columns]

The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples.

In [None]:
# threshold=0 keeps every feature whose variance is non-zero.
var_threshold = VarianceThreshold(threshold=0).fit(train[useful_features])
var_threshold

In [None]:
# Constant columns are the ones the fitted variance filter did NOT keep.
tr_kept_columns = train[useful_features].columns[var_threshold.get_support()]
tr_constant_column = [name for name in train[useful_features].columns if name not in tr_kept_columns]
print(list(tr_constant_column))

In [None]:
# Fit a separate variance filter on the test set: reusing the mask fitted on
# `train` would only repeat the training result and never inspect `test`.
te_var_threshold = VarianceThreshold(threshold=0).fit(test[useful_features])
te_constant_column = [
    name
    for name, kept in zip(useful_features, te_var_threshold.get_support())
    if not kept
]
print(te_constant_column)

In [None]:
# Number of constant columns found in train, then in test.
for constant_columns in (tr_constant_column, te_constant_column):
    print(len(constant_columns))

Cool, we don't have constant columns

## *Finding & Dropping columns having high-correlation*

In [None]:
def finding_correlation(data, threshold, features=None):
    """Return the columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``data[features]`` is computed and,
    for every pair whose absolute correlation exceeds ``threshold``, the
    column that appears later in ``features`` is flagged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns.
    threshold : float
        Absolute correlation above which a pair counts as correlated
        (strict ``>``).
    features : list of str, optional
        Columns to examine. Defaults to the notebook-level
        ``useful_features`` list, which was the original behaviour.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    if features is None:
        features = useful_features
    correlation_matrix = data[features].corr()
    # Strictly-lower triangle: each unordered pair once, diagonal excluded.
    # NaN correlations compare False and are therefore never flagged.
    exceeds = np.tril(correlation_matrix.abs().to_numpy() > threshold, k=-1)
    return set(correlation_matrix.columns[exceeds.any(axis=1)])

We are going to remove features whose absolute correlation with another feature is greater than 70%

In [None]:
# Highly correlated columns as measured on the test set.
te_correlated_cols = finding_correlation(data=test, threshold=0.7)

In [None]:
te_correlated_count = len(te_correlated_cols)
print(te_correlated_count)

In [None]:
# Highly correlated columns as measured on the training set.
tr_correlated_cols = finding_correlation(data=train, threshold=0.7)
tr_correlated_count = len(tr_correlated_cols)
print(tr_correlated_count)

Let's take the intersection of these two lists

In [None]:
# Keep only the columns flagged in BOTH train and test.
common_correlated = set(tr_correlated_cols).intersection(te_correlated_cols)
cor_cols = list(common_correlated)

In [None]:
cor_cols_count = len(cor_cols)
print(cor_cols_count)

In [None]:
# Drop the shared correlated columns from both frames.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second execution no longer raises KeyError.
train = train.drop(columns=cor_cols, errors="ignore")
test = test.drop(columns=cor_cols, errors="ignore")

## *Using mutual information, kbest*

In [None]:
# Rebuild the feature list from the columns currently in `train`.
useful_features = [name for name in train.columns if name not in ("row_id", 'target')]
X_feat, Y_feat = train[useful_features], train["target"]
print(X_feat.shape, Y_feat.shape)

In [None]:
%time
mutual_info = mutual_info_classif(X_feat, Y_feat)

mutual_info = pd.Series(mutual_info)
mutual_info.index = X_feat.columns
mutual_info.sort_values(ascending=False)

Let's select the best 260 features

In [None]:
# Score every feature by mutual information with the target and keep the top 260.
select_k_features = SelectKBest(score_func=mutual_info_classif, k=260).fit(X_feat, Y_feat)
select_k_features

In [None]:
# Names of the features retained by the selector.
selected_mask = select_k_features.get_support()
cols = X_feat.columns[selected_mask]

In [None]:
# Restrict both frames to the selected features (train also keeps its label).
# Selecting all wanted columns at once and copying avoids assigning a column
# on a slice, which triggered SettingWithCopyWarning.
selected_columns = list(cols)
train = train[selected_columns + ["target"]].copy()
test = test[selected_columns]
# The previous, wider training frame is now unreferenced; reclaim it.
gc.collect()

# *Modeling*

In [None]:
# Number of training rows per encoded class.
train["target"].value_counts()

In [None]:
# Column bookkeeping for modelling: label column, columns that must never be
# used as predictors, and the resulting list of predictor columns.
target = ['target']
not_features = ['row_id','kfold', 'target']
cols = train.columns.tolist()
features = [name for name in cols if name not in not_features]

In [None]:
# Single-column frame holding the label, used for stratification.
train_targets = train[target]
# -1 marks rows that have not been assigned to a fold yet.
train["kfold"] = -1

In [None]:
# initialize stratified k-fold
kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# split() yields POSITIONAL row indices, while `train` is indexed by the first
# CSV column; write through .iloc so the assignment does not depend on the
# index labels happening to equal 0..n-1.
kfold_position = train.columns.get_loc("kfold")
for fold, (_, valid_indicies) in enumerate(kf.split(X=train, y=train_targets)):
    train.iloc[valid_indicies, kfold_position] = fold

In [None]:
%%time
train.head()

# *Extra-Trees CLF*

In [None]:
%%time
y_probs = []
scores = []
estimators = 1500
for fold in range(10):
    xtrain =  train[train.kfold != fold].reset_index(drop=True)
    xvalid = train[train.kfold == fold].reset_index(drop=True)
    xtest = test.copy()
    
    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[features]
    xvalid = xvalid[features]
    xtest = xtest[features]
    
    model = ExtraTreesClassifier(
        n_estimators=estimators,
        n_jobs=-1,
        bootstrap = True,
    )
    model.fit(xtrain, ytrain)
    valid_pred = model.predict(xvalid)
    valid_score = accuracy_score(yvalid, valid_pred)
    
    print("Fold:", fold + 1, "Accuracy:", valid_score)
    scores.append(valid_score)
    y_probs.append(model.predict_proba(xtest))
    
    estimators = estimators + 350

print(np.mean(scores))

Credit to [AmbrosM](https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants) for this part

In [None]:
# Average the per-fold test probabilities into a single matrix.
fold_count = len(y_probs)
y_prob = sum(y_probs) / fold_count
# Additive per-class offsets applied before the arg-max.
class_offsets = np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0.027, 0, 0])
y_prob += class_offsets
# Most probable class per row, mapped back to the original label values.
predicted_codes = np.argmax(y_prob, axis=1)
y_pred_tuned = target_encoder.inverse_transform(predicted_codes)
# Share (in percent) of test rows assigned to each label.
predicted_share = pd.Series(y_pred_tuned, index=test.index).value_counts().sort_index() / len(test) * 100
predicted_share

In [None]:
# Fill the submission template with the predicted labels and write it out.
sub["target"] = y_pred_tuned
output_path = "./ET_CLF_SUB.csv"
sub.to_csv(output_path, index=False)

In [None]:
# Preview the first five rows of the submission.
sub.head(n=5)