**The Experiment:** They did one of four things:

1. They put 1000000 decamers into the machine and saved the machine's output.
2. They put 100000 decamers into the machine and multiplied the machine's output by 10.
3. They put 1000 decamers into the machine and multiplied the machine's output by 1000.
4. They put 100 decamers into the machine and multiplied the machine's output by 10000.

With this procedure, the row sums are always 1000000.
When they start with 1000000 decamers, they get accurate estimates of the true frequencies. When they start with 100 decamers and multiply by 10000, they still get unbiased estimates of the true frequencies, but less accurate ones. The Greatest Common Divisor (GCD) of a row tells us which of the four experiments was carried out. So, a high GCD means lower accuracy, because a smaller amount of data was fed into the machine and the results were then scaled up.

In [None]:
import numpy as np
import pandas as pd
from math import factorial


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler , LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import mode



import warnings
# Notebook display configuration: no row/column truncation, and floats are
# rendered in fixed-point notation instead of scientific notation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = '{:f}'.format

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Load the competition data.
train = pd.read_csv("../input/tabular-playground-series-feb-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-feb-2022/test.csv")

**Below function is taken from:** https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense <br />
*(Strongly recommend a read)* <br />
**TL;DR:** The [paper](https://www.frontiersin.org/articles/10.3389/fmicb.2020.00257/full) describes the process of converting the float values to integers and gives the formula for the additive constant, which they call bias. With the help of this formula, we can convert the floating point numbers back to the original integers:

In [None]:
def bias(w, x, y, z):
    """Return the additive bias constant for a base composition.

    Computes ``10! / (w! * x! * y! * z! * 4**10)``.

    Parameters
    ----------
    w, x, y, z : int
        Non-negative counts of the four bases.

    Returns
    -------
    float
        The bias value for that composition.

    Raises
    ------
    ValueError
        If any count is negative (raised by ``math.factorial``).
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return the bias constant for a feature column name such as ``'A3T2G1C4'``.

    Parameters
    ----------
    s : str
        Column name of the form ``A<w>T<x>G<y>C<z>`` where each count is a
        non-negative integer.

    Returns
    -------
    float
        ``bias(w, x, y, z)`` for the counts encoded in the name.

    Raises
    ------
    ValueError
        If ``s`` does not have the expected form.
    """
    import re

    # One anchored pattern replaces the chained str.index() slicing and
    # rejects non-feature columns with a clear message.
    match = re.fullmatch(r'A(\d+)T(\d+)G(\d+)C(\d+)', s)
    if match is None:
        raise ValueError(f"not a feature column name of the form A<w>T<x>G<y>C<z>: {s!r}")
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(*(int(count) for count in match.groups()))

# Every column except the row identifier and the label is a feature column.
elements = [name for name in train.columns if name not in ('row_id', 'target')]

# Add the per-column bias back, scale by 1,000,000 and round to integers.
train_i = pd.DataFrame({
    name: train[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in elements
})
test_i = pd.DataFrame({
    name: test[name].add(bias_of(name)).mul(1000000).round().astype(int)
    for name in elements
})





In [None]:
def gcd_of_all(df_i):
    """Return the row-wise greatest common divisor over all columns of ``df_i``.

    Parameters
    ----------
    df_i : pandas.DataFrame
        Frame of integer values; every column of the frame takes part.

    Returns
    -------
    pandas.Series
        One GCD per row, carrying the same index as ``df_i``.
    """
    # Reduce over the columns of the frame that was passed in, rather than
    # over a module-level column list, so the function is self-contained.
    row_gcd = np.gcd.reduce(df_i.to_numpy(), axis=1)
    return pd.Series(row_gcd, index=df_i.index)
# Store the row-wise GCD of the integer counts as an extra 'gcd' column.
train['gcd'] = gcd_of_all(train_i)
test['gcd'] = gcd_of_all(test_i)
# Preview the first rows of the integer-valued training frame.
train_i.head()

In [None]:
# Replace the float feature columns with their integer-valued versions.
train[train_i.columns] = train_i[train_i.columns] 
test[test_i.columns] = test_i[test_i.columns]

In [None]:
# Report the number of rows in each frame and the train/test size ratio.
print(f'Number of observations in TRAIN:{train.shape[0]}')
print(f'Number of observations in TEST:{test.shape[0]}')
print(f'Ratio of train to test: {train.shape[0]/test.shape[0]}')

# Report the total number of missing cells in each frame.
print(f'Number of null values in train: {train.isnull().sum().sum()}')
print(f'Number of null values in test: {test.isnull().sum().sum()}')

OBS: **Train set is large enough!** 

In [None]:
# Count how many training rows belong to each target class.
train.target.value_counts()

OBS: **Classes have equal distribution, can use smaller dataset for visualization**

In [None]:
# Take a random 20% sample of the training rows for exploration.
# No random_state is set, so the sample differs between runs.
eda_df = train.sample(frac=.2)
eda_df.head()

In [None]:
# Drop the identifier column if it is still present. errors='ignore' makes a
# re-run of this cell a no-op without hiding unrelated failures the way a
# bare `except: pass` would.
eda_df.drop('row_id', axis=1, inplace=True, errors='ignore')
# Class counts in the sample, then the per-class column sums.
print(eda_df.target.value_counts())
eda_df.groupby(['target']).sum()

In [None]:
# Number of sampled rows per target class (counted on one feature column).
eda_df.groupby(['target'])['A0T0G0C10'].count().values

# Per-class column sums divided by the per-class row count, transposed so
# that features are rows and target classes are columns.
score_df = (
    eda_df.groupby(['target']).sum()
    .div(eda_df.groupby(['target'])['A0T0G0C10'].count(), axis=0)
    .transpose()
)
score_df.head(10)

**Getting all important indicators**

In [None]:
# Collect, for every target class, the 10 lowest- and the 10 highest-scoring
# feature names.
indicators = []
for val in score_df.columns:
    indicators.extend(score_df.sort_values(by=val).head(10).index.tolist())
    indicators.extend(score_df.sort_values(by=val).tail(10).index.tolist())

# De-duplicate the collected names (np.unique also sorts them) and show them.
indicators = np.unique(np.array(indicators))
indicators.tolist()

OBS: **We can try playing with this knowledge later maybe.**

In [None]:
# Number of rows whose feature columns (those in `elements`) repeat an
# earlier row, for train and for test.
train[elements].duplicated().sum(), test[elements].duplicated().sum()

In [None]:
# Encode the class names as integer labels; `enc` is kept so predictions can
# be mapped back with inverse_transform later.
enc = LabelEncoder()
# LabelEncoder works on a 1-D array of labels, so pass the column directly
# instead of reshaping it into an (n, 1) column vector.
train['target'] = enc.fit_transform(train['target'])
# The row identifier carries no signal for the model.
train = train.drop('row_id',axis=1)
test = test.drop('row_id',axis=1)


In [None]:
# Standardise the feature columns; `scaler` stays fitted on the training data.
scaler = StandardScaler()
cont_features = train_i.columns
train[cont_features] = scaler.fit_transform(train[cont_features])
train.drop_duplicates(inplace=True)
# Scale gcd by its maximum BEFORE building X. Previously X was created first
# and therefore kept the raw gcd values, while cleaner_func feeds the model
# a max-scaled gcd at prediction time.
train['gcd'] = train['gcd']/train['gcd'].max()
X = train.drop('target',axis=1)
y = train['target']


In [None]:
# Hold out 33% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# First model: random forest on all features, entropy split criterion, fixed seed.
clf = RandomForestClassifier(max_depth=500, random_state=0, criterion='entropy')

In [None]:
# Fit on the training split and report accuracy on the held-out split.
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')


def cleaner_func(train, scaler=None):
    """Prepare a raw competition frame the same way the training data was prepared.

    Steps: rebuild the integer counts from the float features, add the
    row-wise ``gcd`` column, drop ``row_id``, standardise the feature columns
    and divide ``gcd`` by its maximum.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame containing ``row_id`` and the feature columns. It is not
        modified; a new frame is returned.
    scaler : fitted scaler with a ``transform`` method, optional
        When given, it is used to transform the feature columns, so test data
        is scaled with the statistics learned from the training data. When
        omitted, a new ``StandardScaler`` is fitted on ``train`` itself.

    Returns
    -------
    pandas.DataFrame
        The prepared frame: feature columns followed by ``gcd``.

    Raises
    ------
    KeyError
        If ``train`` has no ``row_id`` column.
    """
    elements = [e for e in train.columns if e != 'row_id' and e != 'target']
    counts = pd.DataFrame({col: ((train[col] + bias_of(col)) * 1000000).round().astype(int) for col in elements})

    # drop() returns a new frame, so the caller's frame stays untouched.
    prepared = train.drop('row_id', axis=1)
    prepared['gcd'] = gcd_of_all(counts)
    prepared[counts.columns] = counts

    feature_cols = counts.columns
    if scaler is None:
        # No fitted scaler supplied: fit one on this frame.
        scaler = StandardScaler()
        prepared[feature_cols] = scaler.fit_transform(prepared[feature_cols])
    else:
        # Reuse the statistics learned elsewhere (e.g. on the training set).
        prepared[feature_cols] = scaler.transform(prepared[feature_cols])

    prepared['gcd'] = prepared['gcd']/prepared['gcd'].max()
    return prepared


# Scale the test features with the scaler that was fitted on the training data.
final_X_test = cleaner_func(test_df, scaler=scaler)

In [None]:
# Predict on the prepared test features and map the encoded labels back to
# the original class names.
final_pred = clf.predict(final_X_test)
final_pred_vals = enc.inverse_transform(final_pred)

# Build the submission frame (columns: target, row_id) and write it out.
final_output_df = pd.DataFrame({
    'target': final_pred_vals,
    'row_id': test_df.row_id,
})
final_output_df.to_csv('Submission.csv',index=False)

In [None]:
# Second model: a shallower random forest (max_depth=30), fixed seed.
clf2 = RandomForestClassifier(max_depth=30, random_state=0)

In [None]:

# Add the gcd feature to the selected indicator columns exactly once, so that
# re-running this cell does not select the 'gcd' column twice.
if 'gcd' not in indicators:
    indicators = np.append(indicators,['gcd'])
# Train the second model on the indicator subset only.
clf2.fit(X_train[indicators], y_train)

In [None]:
# Accuracy of the indicator-only model on the held-out split.
pred = clf2.predict(X_test[indicators])
accuracy_score(y_test, pred)

In [None]:
# Predict on the test set with the indicator-only model.
final_pred_2 = clf2.predict(final_X_test[indicators])
# Decode clf2's own predictions. Previously `final_pred` (the first model's
# output) was decoded here, so the second submission repeated the first.
final_pred_vals_2 = enc.inverse_transform(final_pred_2)

In [None]:
# Submission frame for the second model (columns: target, row_id).
final_output_df_2 = pd.DataFrame({
    'target': final_pred_vals_2,
    'row_id': test_df.row_id,
})

In [None]:
# Write the second submission without the index column.
final_output_df_2.to_csv('Submission2.csv',index=False)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Third model: k-nearest neighbours with k=3, fitted on the training split.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

In [None]:
# Accuracy of the k-NN model on the held-out split.
pred = neigh.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
# Re-read the raw test file, prepare it with cleaner_func and predict with
# the k-NN model, then map the encoded labels back to class names.
# NOTE: this reuses (and overwrites) the names final_pred_2 and
# final_pred_vals_2 that the clf2 cells assigned earlier.
# NOTE(review): the decoded k-NN predictions are not written to a file in
# the cells shown here.
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')
final_X_test = cleaner_func(test_df)
final_pred_2 = neigh.predict(final_X_test)
final_pred_vals_2 = enc.inverse_transform(final_pred_2)

In [None]:
#WIP