# Imports

In [None]:
# For my cloud instance
# !pip install kaggle
# !kaggle competitions download -c tabular-playground-series-feb-2022
# import shutil
# shutil.unpack_archive('tabular-playground-series-feb-2022.zip')

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import ExtraTreesClassifier

import warnings
import gc
warnings.simplefilter('ignore')

# --- Notebook configuration ---
# Directory the CSV files are read from when KAGGLE is True.
KAGGLE_DIR = '../input/tabular-playground-series-feb-2022/'
# Directory the CSV files are read from when KAGGLE is False
# (an empty prefix means the current working directory).
LOCAL_DIR = ''
# Switch between the two directories above.
KAGGLE = True
# Seed used for the train/test split and the baseline classifier.
RS = 69420

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      signed / unsigned integer type whose range contains the column's
      minimum and maximum (bounds inclusive).
    * NumPy float columns are cast to the narrowest float type whose range
      contains the column's minimum and maximum, falling back to ``float64``.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.
    allow_float16 : bool, default True
        Whether ``float16`` may be chosen for float columns. The choice is
        made on value range only, so precision can be lost (``float16``
        stores roughly three significant decimal digits). Pass ``False`` to
        use ``float32`` as the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the function works with ``.pipe``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Bool, datetime,
        # categorical and extension dtypes have no meaningful numeric range
        # here, so they are skipped instead of being forced into a float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                # An all-NaN column fails both comparisons and ends up float64.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
        else:
            # Keep signedness so an unsigned column never needs a wider type.
            candidates = signed_candidates if col_type.kind == 'i' else unsigned_candidates
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

# Preprocessing

In [None]:
%%time
if KAGGLE:
    print(f"{'*'*10} Loading Training Data... {'*'*10}")
    df = pd.read_csv(KAGGLE_DIR+'train.csv', index_col=0).pipe(reduce_mem_usage)
    print(f"{'*'*10} Loading Testing Data... {'*'*10}")
    test = pd.read_csv(KAGGLE_DIR+'test.csv', index_col=0).pipe(reduce_mem_usage)
    sub = pd.read_csv(KAGGLE_DIR+'sample_submission.csv').pipe(reduce_mem_usage)
else:
    print(f"{'*'*10} Loading Training Data... {'*'*10}")
    df = pd.read_csv(LOCAL_DIR+'train.csv', index_col=0).pipe(reduce_mem_usage)
    print(f"{'*'*10} Loading Testing Data... {'*'*10}")
    test = pd.read_csv(LOCAL_DIR+'test.csv', index_col=0).pipe(reduce_mem_usage)
    sub = pd.read_csv(LOCAL_DIR+'sample_submission.csv').pipe(reduce_mem_usage)

In [None]:
# Remove exact duplicate rows from the training frame (first occurrence kept),
# printing the duplicate count before and after the drop.
# https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364

dup_report = 'Duplicates in train data: {0}'.format

duplicates_train = df.duplicated().sum()
print(dup_report(duplicates_train))

df.drop_duplicates(keep='first', inplace=True)
duplicates_train = df.duplicated().sum()

print('Train data shape:', df.shape)
print(dup_report(duplicates_train))

In [None]:
# Labels: the 'target' column run through a LabelEncoder.
# Features: every column except the last one, as a NumPy array.
lb = LabelEncoder()
y = lb.fit_transform(df['target'])
X = df.iloc[:, :-1].to_numpy()

In [None]:
# Display the shapes of the feature matrix and the encoded label vector.
X.shape, y.shape

In [None]:
# X and y were extracted above, so drop the DataFrame name and trigger a
# garbage-collection pass.
del df
gc.collect()

# Simple Model

In [None]:
# Hold out 20% of the rows (shuffled, seeded with RS) for scoring the models below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RS,
)

In [None]:
# Display the shapes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Baseline classifier: 100 trees, n_jobs=-1, seeded with RS, verbose output on.
baseline_params = dict(
    n_estimators=100,
    n_jobs=-1,
    random_state=RS,
    verbose=1,
)
clf = ExtraTreesClassifier(**baseline_params)

In [None]:
%%time
clf.fit(
    X_train, y_train,
)

In [None]:
# Predict labels for the held-out split with the fitted baseline classifier.
y_pred = clf.predict(X_test)

In [None]:
# Accuracy of the baseline on the held-out split.
# accuracy_score's documented argument order is (y_true, y_pred); the value is
# the same either way for plain accuracy, but the call now matches the API.
acc = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {round(acc, 6)}")

# Tuning Number Of Estimators

In [None]:
# Sweep configuration.
# Seeds to repeat the sweep with.
seeds = [69420, 42, 69, 666]
# Forest sizes to evaluate: 100, 200, ..., 5000.
estimator_list = np.arange(100, 5001, 100)
# Empty frame that the tuning loop fills with one column of accuracies per seed.
scores = pd.DataFrame()

In [None]:
# For every seed, score each forest size in estimator_list on the held-out
# split and store the accuracies as one column of `scores`.
#
# Rather than refitting a brand-new forest for every size, one forest per seed
# is grown incrementally with warm_start=True: raising n_estimators and calling
# fit again only builds the additional trees. scikit-learn advances the seeded
# random stream past the existing trees on a warm-started fit, so each size
# should score the same as a fresh fit with that seed.
# NOTE(review): confirm that equivalence on the installed scikit-learn version.
for seed in seeds:
    print(f"{'*'*10} SEED: {seed} {'*'*10}")
    seed_scores = []
    clf = None
    for i in estimator_list:
        print(f"Testing {i} Estimators")
        n_trees = int(i)
        if clf is None or n_trees < clf.n_estimators:
            # First size for this seed, or a size smaller than the current
            # forest (warm start cannot shrink a forest): start from scratch.
            clf = ExtraTreesClassifier(
                n_estimators=n_trees,
                n_jobs=-1,
                random_state=seed,
                warm_start=True,
            )
        else:
            clf.set_params(n_estimators=n_trees)

        clf.fit(
            X_train, y_train,
        )

        y_pred = clf.predict(X_test)

        # Documented argument order is (y_true, y_pred).
        acc = accuracy_score(y_test, y_pred)
        seed_scores.append(acc)
        print(f"Classifier Accuracy: {round(acc, 6)}")
        _ = gc.collect()

    scores[f'{seed}_scores'] = (seed_scores)

In [None]:
# Label each row with its forest size, add the row-wise average of the score
# columns as 'mean', and display the table.
scores.set_index(estimator_list, inplace=True)
scores['mean'] = scores.mean(axis='columns')
scores

In [None]:
import matplotlib.pyplot as plt

# Plot the 'mean' accuracy column against the forest size.
mean_scores = scores['mean']
plt.plot(scores.index, mean_scores)
plt.title('Accuracy Sensitivity to Estimators')
plt.xlabel("Number Of Estimators")
plt.ylabel("Accuracy")
plt.tight_layout()

# Forest size with the highest mean accuracy, and that accuracy rounded to 6 places.
best_est = estimator_list[np.argmax(mean_scores)]
best_scores = round(mean_scores[best_est], 6)
print(f"Best Accuracy: {best_scores}, Number Of Estimators: {best_est}")

In [None]:
# Draw one accuracy curve per column of `scores`.
seed_plot_kwargs = dict(
    figsize=(8, 6),
    title='Variation Of Scores Across Random Seeds',
    xlabel='Number Of Estimators',
    ylabel='Accuracy',
)
scores.plot(**seed_plot_kwargs)
plt.tight_layout()

# Per column: the forest size at which it peaks, and the peak value itself.
best_sizes = scores.idxmax()
best_values = scores.max()
print("Highest Scoring Number Of Estimators Per Seed \n", best_sizes)
print("Highest Score Per Seed \n", best_values)

In [None]:
# Box plot of the accuracies at each forest size: after the transpose, every
# forest size is a column and so gets its own box.
# The derived 'mean' column is dropped first so that each box summarises only
# the individual runs and the average is not counted as an extra data point.
# errors='ignore' keeps the cell working when no 'mean' column exists.
plot_df = scores.drop(columns='mean', errors='ignore').transpose()
ax = plot_df.plot(
    kind='box',
    figsize=(10, 6),
    title='Variation Of Scores Across Random Seeds',
)
# Axis labels are set on the Axes directly so they do not depend on how
# plot(kind='box') treats the xlabel/ylabel keywords.
ax.set_xlabel('Number of Estimators')
ax.set_ylabel('Accuracy')
plt.xticks(rotation=90)
plt.show()