## This notebook applies the Random Forest technique to the current problem
### Preprocessing is taken from the notebook of @ambrosm
### https://www.kaggle.com/ambrosm/tpsfeb22-02-postprocessing-against-the-mutants

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import factorial

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [None]:
# Load the competition's training and test tables from the read-only input directory.
train_path = '../input/tabular-playground-series-feb-2022/train.csv'
test_path = '../input/tabular-playground-series-feb-2022/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)



In [None]:
# Feature columns: every column of the training frame except the identifier, the label,
# and the helper columns this notebook adds to train_df ('target_num' just below, 'gcd'
# in a later cell). Excluding the helpers keeps this cell idempotent: re-running it after
# those columns exist must not turn them into features.
non_feature_columns = {'row_id', 'target', 'target_num', 'gcd'}
elements = [e for e in train_df.columns if e not in non_feature_columns]

# Convert the 10 bacteria names to the integers 0 .. 9
le = LabelEncoder()
train_df['target_num'] = le.fit_transform(train_df['target'])

train_df.shape, test_df.shape

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) as a float.

    When the four counts sum to 10 this is the multinomial probability of
    drawing exactly those counts in 10 uniform draws over four symbols.

    Args:
        w, x, y, z: Non-negative integer counts.

    Returns:
        The value of the expression above.

    Raises:
        ValueError: If any count is negative (raised by ``math.factorial``).
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return ``bias(w, x, y, z)`` for a column name shaped like ``'A<w>T<x>G<y>C<z>'``.

    The first character is skipped and the integers between the markers
    'T', 'G' and 'C' (and after 'C') are parsed as the four counts.

    Args:
        s: Column name, e.g. ``'A1T2G3C4'``.

    Returns:
        The bias value for the parsed counts.

    Raises:
        ValueError: If a marker is missing or a count is not an integer.
    """
    # Locate each marker once instead of re-scanning the string for every slice.
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])
    x = int(s[t_pos + 1:g_pos])
    y = int(s[g_pos + 1:c_pos])
    z = int(s[c_pos + 1:])
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(w, x, y, z)

def _as_integer_spectrum(df):
    """Add bias_of(col) to each feature column, multiply by 1,000,000 and round to int."""
    return pd.DataFrame({
        col: ((df[col] + bias_of(col)) * 1000000).round().astype(int)
        for col in elements
    })


# Integer versions of the feature columns for the training and the test set.
train_i = _as_integer_spectrum(train_df)
test_i = _as_integer_spectrum(test_df)
train_i

In [None]:
def gcd_of_all(df_i, columns=None):
    """Compute the row-wise greatest common divisor over several integer columns.

    Args:
        df_i: DataFrame holding integer values.
        columns: Names of the columns to reduce over. Defaults to the
            notebook-level ``elements`` list, which keeps existing
            ``gcd_of_all(df)`` calls working unchanged.

    Returns:
        The result of folding ``np.gcd`` over the selected columns, one value
        per row of ``df_i``.

    Raises:
        IndexError: If ``columns`` is empty.
    """
    if columns is None:
        columns = elements
    columns = list(columns)

    gcd = df_i[columns[0]]
    for col in columns[1:]:
        gcd = np.gcd(gcd, df_i[col])
    return gcd

# Attach each row's GCD to the original frames ...
for frame, frame_i in ((train_df, train_i), (test_df, test_i)):
    frame['gcd'] = gcd_of_all(frame_i)

# ... and show how often every GCD value occurs in train and in test.
train_gcd_counts = np.unique(train_df['gcd'], return_counts=True)
test_gcd_counts = np.unique(test_df['gcd'], return_counts=True)
train_gcd_counts, test_gcd_counts

In [None]:
for scale in np.sort(train_df['gcd'].unique()):
    # Training rows that share this GCD value (computed once instead of four times)
    in_scale = train_df['gcd'] == scale
    X_scale = train_i.loc[in_scale, elements]

    # Compute the PCA on this subset only
    pca = PCA(whiten=True, random_state=1)
    pca.fit(X_scale)

    # Transform the data so that the components can be analyzed.
    # The test subset is deliberately not projected here: this loop never plotted it,
    # and transforming an empty subset (a GCD value absent from test) would fail.
    Xt_tr = pca.transform(X_scale)

    # Plot a scattergram, projected to two PCA components, colored by classification target
    plt.figure(figsize=(6,6))
    plt.scatter(Xt_tr[:,0], Xt_tr[:,1], c=train_df.loc[in_scale, 'target_num'], s=1)
    plt.title(f"{1000000 // scale} decamers ({in_scale.sum()} samples with gcd = {scale})")
    plt.show()

In [None]:
def plot_duplicates_per_gcd(df, title):
    """Draw one pie chart per distinct GCD value showing how many rows are duplicates.

    A row counts as a duplicate when its feature columns (the notebook-level
    ``elements`` list) repeat an earlier row with the same GCD value.

    Args:
        df: DataFrame with a ``gcd`` column and all columns named in ``elements``.
        title: Figure-level title.
    """
    gcd_values = np.unique(df.gcd)
    # Keep the original 1x4 grid and only widen it when there are more than four
    # GCD values; a fixed grid of four would make plt.subplot raise in that case.
    n_cols = max(4, len(gcd_values))

    # No tight_layout() call here: the figure has no axes yet, so it had no effect.
    plt.figure(figsize=(14, 3))
    for i, gcd in enumerate(gcd_values):
        plt.subplot(1, n_cols, i+1)
        # Filter once and reuse the subset for both counts
        subset = df.loc[df.gcd == gcd, elements]
        duplicates = subset.duplicated().sum()
        non_duplicates = len(subset) - duplicates
        plt.pie([non_duplicates, duplicates],
                labels=['not duplicate', 'duplicate'],
                colors=['gray', 'r'],
                startangle=90)
        plt.title(f"GCD = {gcd}")
    plt.subplots_adjust(wspace=0.8)
    plt.suptitle(title)
    plt.show()
        
# Same chart for the training and the test data.
for frame, caption in ((train_df, "Duplicates in Training"), (test_df, "Duplicates in Test")):
    plot_duplicates_per_gcd(frame, title=caption)

In [None]:
# Show how the spectra are scaled
# The original spectrum gives a peak at 1.0;
# the seven error rates give seven other peaks in the histograms
# The eight peaks in the histograms correspond to the eight clusters 
# in the scattergram above
# Per-row sum of absolute feature values; its distribution shows how the spectra are scaled
v = train_df[elements].abs().sum(axis=1)
chosen_gcd = 1

# No tight_layout() call here: the figure has no axes yet, so it had no effect.
plt.figure(figsize=(18, 14))
for t in range(10): # loop over the ten bacteria species
    plt.subplot(5, 2, t+1)
    plt.title(le.inverse_transform([t])[0])

    # Select a single GCD and a single species
    vt = v[(train_df['gcd'] == chosen_gcd) & (train_df['target_num'] == t)]
    vt_column = vt.values.reshape(-1, 1)  # KMeans expects a 2-D (n_samples, 1) array

    # Do a one-dimensional clustering to get the cluster centers and the cluster sizes.
    # The seed makes the printed centers/sizes and the tick positions reproducible.
    km = KMeans(n_clusters=8, random_state=1)
    km.fit(vt_column)
    centers = km.cluster_centers_.ravel()
    cluster_max = centers.max() # label this cluster with 1.0 (no simulated errors)
    relative_centers = (centers / cluster_max).round(2)
    cluster_sizes = np.unique(km.predict(vt_column), return_counts=True)[1]
    # Centers in ascending order, followed by the cluster sizes in that same order
    print(sorted(relative_centers), cluster_sizes[np.argsort(centers)])

    # Plot a histogram of the eight clusters
    plt.hist(vt / cluster_max, bins=np.linspace(0, (vt / cluster_max).max(), 200), color='m', density=True)
    # Pass the ticks as a flat 1-D array rather than the (8, 1) center matrix
    plt.xticks(ticks=relative_centers)
    plt.xlabel('scale')
    plt.ylabel('density')
plt.subplots_adjust(hspace=0.5)
plt.show()

In [None]:
scale = 1

# Rows of each data set that belong to the chosen GCD value
train_rows = train_df['gcd'] == scale
test_rows = test_df['gcd'] == scale
X_tr = train_i[elements][train_rows]
X_te = test_i[elements][test_rows]

# Fit the PCA on the training subset and project both subsets with it
pca = PCA(whiten=True, random_state=1)
pca.fit(X_tr)
Xt_tr = pca.transform(X_tr)
Xt_te = pca.transform(X_te)

# Scattergram of the first two PCA components: training in blue, test in red on top
plt.figure(figsize=(6,6))
for projected, colour, name in ((Xt_tr, 'b', 'Train'), (Xt_te, 'r', 'Test')):
    plt.scatter(projected[:,0], projected[:,1], c=colour, s=1, label=name)
plt.title("The test data deviate from the training data")
plt.legend()
plt.show()

In [None]:
# Work on an independent copy so that later changes to X_train leave train_df untouched.
X_train = train_df.copy(deep=True)

X_train.head()

In [None]:
# Label for the first experiment: the species name with the row's GCD appended as text.
y_train = train_df["target"] + train_df["gcd"].astype(str)

# Feature table: everything except the label columns and the row identifier.
# Built from train_df rather than from X_train itself, so the cell can be re-run
# without failing on columns that an earlier run already dropped.
X_train = train_df.drop(columns=['target', 'row_id', 'target_num'])

X_train.head()

## Try-1: use a Random Forest classifier with pre or post processing 
### Hyperparameters were selected with a grid search and are applied here
### Public leaderboard score is 0.84....

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Random Forest Classifier
#clf=RandomForestClassifier(n_estimators=100, min_samples_split=12, max_depth=12,min_samples_leaf=4,max_features=0.34966)
#clf.fit(X_train,y_train)
#test_df2=test_df.drop(['row_id'], axis = 1)
#y_pred=clf.predict(test_df2)
#output = pd.DataFrame({'row_id': test_df2.index, 'target': y_pred})
#output.to_csv('submission.csv', index=False)

## Try2: applying a strong cross-validation technique for Random Forest and deduplicating the data with sample weights
### As suggested by @ambrosm, the training dataframe is converted to a new dataframe without the duplicated rows. To compensate for dropping the duplicates, we add a column sample_weight to the dataframe. This step is extremely important, as most Kagglers remove the duplicates without applying sample weights.

In [None]:

# Reload the raw tables, this time with row_id as the index instead of a regular column.
train_path = '../input/tabular-playground-series-feb-2022/train.csv'
test_path = '../input/tabular-playground-series-feb-2022/test.csv'
train_df2 = pd.read_csv(train_path, index_col='row_id')
test_df2 = pd.read_csv(test_path, index_col='row_id')

# Number of training rows that exactly repeat an earlier row (all columns, target included)
train_df2.duplicated().sum()

In [None]:
# Create a new dataframe without duplicates, but with an additional sample_weight column
vc = train_df2.value_counts()
dedup_train = pd.DataFrame([list(tup) for tup in vc.index.values], columns=train_df2.columns)
dedup_train['sample_weight'] = vc.values
dedup_train.head()


### After deduplicating the training data, we apply two small changes to the training loop:

### When calling fit(), we add the sample weights of the training data.
### When calling accuracy_score(), we add the sample weights of the validation data.

In [None]:
from sklearn.preprocessing import LabelEncoder
# The label is the column that exists in the training table but not in the test table;
# every other training column is a feature.
TARGET = train_df2.columns.difference(test_df2.columns)[0]
is_feature = train_df2.columns != TARGET
features = train_df2.columns[is_feature]

# Encode the target labels as integers (le is kept to decode predictions later)
le = LabelEncoder()
encoded_target = le.fit_transform(dedup_train[TARGET])

X_dedup = dedup_train[features]
y_dedup = pd.DataFrame(encoded_target, columns=[TARGET])
sample_weight = dedup_train['sample_weight']

In [None]:
#%%time
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
N_SPLITS = 10
SEED = 1  # fixes the fold assignment and the forests so scores are reproducible across runs
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y_pred_list, y_proba_list, scores = [], [], []

for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X_dedup, y_dedup), total=N_SPLITS)):
    print('####### Fold: ', fold)

    # Splitting
    X_train, X_valid = X_dedup.iloc[train_id], X_dedup.iloc[valid_id]
    # np.ravel hands the estimator a 1-D label vector whether y_dedup is a Series or a
    # one-column DataFrame.
    y_train, y_valid = np.ravel(y_dedup.iloc[train_id]), np.ravel(y_dedup.iloc[valid_id])
    sample_weight_train, sample_weight_valid = sample_weight.iloc[train_id], sample_weight.iloc[valid_id]

    model = RandomForestClassifier(n_estimators=100, min_samples_split=12, max_depth=26,
                                   min_samples_leaf=1, max_features=0.34966, random_state=SEED)
    # Each deduplicated row is weighted by the number of original rows it stands for
    model.fit(X_train, y_train, sample_weight=sample_weight_train)

    # Validation (weighted the same way, so the score refers to the original rows)
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight_valid)
    print(f'Accuracy score: {valid_score:5f}\n')
    scores.append(valid_score)

    # Prediction for submission: score the test set once. The forest's predicted class
    # is the class with the highest mean probability, so the labels follow from the
    # probabilities without a second pass over the test data.
    test_proba = model.predict_proba(test_df2)
    y_proba_list.append(test_proba)
    y_pred_list.append(model.classes_[np.argmax(test_proba, axis=1)])

score = np.array(scores).mean()
print(f'Mean accuracy score: {score:6f}')

In [None]:
# Majority vote
from scipy.stats import mode
# Stack the per-fold predictions into (n_folds, n_test) and take the most frequent label
# per test row. Depending on the SciPy version, .mode either keeps or drops the reduced
# axis; np.ravel yields the flat (n_test,) vector in both cases, whereas indexing with
# [0] would return a single scalar on versions that drop the axis.
y_pred = np.ravel(mode(np.asarray(y_pred_list), axis=0).mode)
y_pred = le.inverse_transform(y_pred)

# Class distribution of the original (non-deduplicated) training data
target_counts = train_df2[TARGET].value_counts()
target_distrib = pd.DataFrame({
    'count': target_counts,
    'share': target_counts / train_df2.shape[0] * 100
})

# Predicted class distribution, aligned on the class name; classes that were never
# predicted get an explicit 0 instead of NaN.
pred_counts = pd.Series(y_pred, index=test_df2.index).value_counts()
target_distrib['pred_count'] = pred_counts.reindex(target_distrib.index, fill_value=0)
target_distrib['pred_share'] = target_distrib['pred_count'] / len(test_df2) * 100
target_distrib.sort_index()

In [None]:
# Average the class probabilities over all folds
mean_proba = sum(y_proba_list) / len(y_proba_list)

# One additive offset per class, applied before taking the argmax
# (presumably hand-tuned to adjust the predicted class shares; verify with the author)
class_offsets = np.array([0.01, 0.02, 0.01, 0.015, 0.0015, 0.01, 0.01, 0.01, 0.00001, 0.0015])
y_proba = mean_proba + class_offsets

# Most probable class per test row, decoded back to the label names
y_pred_tuned = le.inverse_transform(y_proba.argmax(axis=1))

# Share (in percent) of each predicted class in the test set
tuned_share = pd.Series(y_pred_tuned, index=test_df2.index).value_counts().sort_index() / len(test_df2) * 100
tuned_share

In [None]:
# Fill the sample submission's target column with the tuned predictions and save it.
sample_submission_path = '../input/tabular-playground-series-feb-2022/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission[TARGET] = y_pred_tuned
submission.to_csv('submission.csv', index=False)
submission

## The public leaderboard score is 0.93052. I have noticed that most of the other Kagglers also have similar scores.
## Next step: use the clustering information collected in another notebook of mine
https://www.kaggle.com/abdulravoofshaik/clustering-extra-tree-lb-98-34

In [None]:
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import factorial

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [None]:
# Read the sample submission plus the train/test tables stored in the tpsfeb2022-v2 dataset.
sample_submission_path = '/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv'
train_cluster_path = '../input/tpsfeb2022-v2/train_cluster_target.csv'
test_cluster_path = '../input/tpsfeb2022-v2/test_cluster_target.csv'

output = pd.read_csv(sample_submission_path)
dedup_train = pd.read_csv(train_cluster_path)
test_df2 = pd.read_csv(test_cluster_path)

In [None]:
# Preview the first rows of the test table loaded from test_cluster_target.csv
test_df2.head()

In [None]:
# Keep the per-row weights in their own Series and remove them from the training table,
# so that they are not used as a feature.
sample_weight = dedup_train['sample_weight']
dedup_train = dedup_train.drop(columns=['sample_weight'])


In [None]:
# Replace the target labels with integer codes; le2 is kept to decode predictions later.
# NOTE: this overwrites the column in place, so running the cell twice would re-fit le2
# on the integer codes instead of the original labels.
le2 = LabelEncoder()
encoded_target = le2.fit(dedup_train['target']).transform(dedup_train['target'])
dedup_train['target'] = encoded_target
dedup_train['target']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Name of the label column; every other column of dedup_train is used as a feature.
TARGET = 'target'
is_feature = dedup_train.columns != TARGET
features = dedup_train.columns[is_feature]

# Feature matrix and label vector (the target column is taken as-is, without re-encoding)
X_dedup = dedup_train.loc[:, features]
y_dedup = dedup_train.loc[:, TARGET]



In [None]:
# Preview the first values of the label vector used for training
y_dedup.head()

In [None]:
#%%time
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
N_SPLITS = 5
SEED = 1  # fixes the fold assignment and the forests so scores are reproducible across runs
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
y_pred_list, y_proba_list, scores = [], [], []

for fold, (train_id, valid_id) in enumerate(tqdm(folds.split(X_dedup, y_dedup), total=N_SPLITS)):
    print('####### Fold: ', fold)

    # Splitting
    X_train, X_valid = X_dedup.iloc[train_id], X_dedup.iloc[valid_id]
    # np.ravel hands the estimator a 1-D label vector whether y_dedup is a Series or a
    # one-column DataFrame.
    y_train, y_valid = np.ravel(y_dedup.iloc[train_id]), np.ravel(y_dedup.iloc[valid_id])
    sample_weight_train, sample_weight_valid = sample_weight.iloc[train_id], sample_weight.iloc[valid_id]

    model = RandomForestClassifier(n_estimators=100, min_samples_split=12, max_depth=26,
                                   min_samples_leaf=1, max_features=0.34966, random_state=SEED)
    # Each training row is weighted by its sample_weight value
    model.fit(X_train, y_train, sample_weight=sample_weight_train)

    # Validation (weighted the same way as training)
    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred, sample_weight=sample_weight_valid)
    print(f'Accuracy score: {valid_score:5f}\n')
    scores.append(valid_score)

    # Prediction for submission: score the test set once. The forest's predicted class
    # is the class with the highest mean probability, so the labels follow from the
    # probabilities without a second pass over the test data.
    test_proba = model.predict_proba(test_df2)
    y_proba_list.append(test_proba)
    y_pred_list.append(model.classes_[np.argmax(test_proba, axis=1)])

score = np.array(scores).mean()
print(f'Mean accuracy score: {score:6f}')

In [None]:
# Majority vote
from scipy.stats import mode
# Stack the per-fold predictions into (n_folds, n_test) and take the most frequent label
# per test row. Depending on the SciPy version, .mode either keeps or drops the reduced
# axis; np.ravel yields the flat (n_test,) vector in both cases, whereas indexing with
# [0] would return a single scalar on versions that drop the axis.
y_pred_encoded = np.ravel(mode(np.asarray(y_pred_list), axis=0).mode)
y_pred = le2.inverse_transform(y_pred_encoded)

# Class distribution of the training table; its index holds the integer-encoded classes
target_counts = dedup_train[TARGET].value_counts()
target_distrib2 = pd.DataFrame({
    'count': target_counts,
    'share': target_counts / dedup_train.shape[0] * 100
})

# Count the predictions per *encoded* class and align them on the class index.
# Assigning value_counts().values positionally would pair the n-th most frequent
# predicted class with the n-th most frequent training class (whichever classes those
# happen to be) and would raise if some class were never predicted.
pred_counts = pd.Series(y_pred_encoded, index=test_df2.index).value_counts()
target_distrib2['pred_count'] = pred_counts.reindex(target_distrib2.index, fill_value=0)
target_distrib2['pred_share'] = target_distrib2['pred_count'] / len(test_df2) * 100
target_distrib2.sort_index()

# As you can see above, even with clustering information the model failed to predict Escherichia_coli (see the index=4 row) properly.
# The conclusion is that Random Forest is not suitable for this dataset