In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
%matplotlib inline

# Importing data

In [None]:
# Load the competition files. For train/test the first CSV column becomes the
# DataFrame index (index_col=0); the sample submission keeps a default index.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col=0)
submission = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Data type of every training column.
train.dtypes

In [None]:
# Summary statistics for the numeric columns.
train.describe()

In [None]:
# Summary (count / unique / top / freq) restricted to object-dtype columns.
train.describe(include='object')

In [None]:
# Print the frame's structural summary (columns, dtypes, memory usage).
train.info()

In [None]:
# Number of missing values in each column.
train.isnull().sum() # per-column null count

In [None]:
# Total number of missing values across the whole training frame.
train.isnull().sum().sum()

In [None]:
# Separate the feature matrix (every column except 'target') from the label column.
X = train.drop(columns=['target'])
y = train['target']

# Exploring target data

In [None]:
# Number of training rows per target class.
y.value_counts()

In [None]:
# Bar chart of how many training rows belong to each target class.
# (The former `.transpose()` call was dropped: transposing a Series returns it unchanged.)
y.value_counts().plot(kind='bar')
plt.title('Target data')
plt.ylabel('Frequency')
plt.xlabel('Bacteria type')  # fixed typo: was 'Bactrria type'
plt.show()

# Using Model: RandomForestClassifier

In [None]:
# Mean of every feature column within each target class.
avg = train.groupby(['target']).mean()
# After transposing, features run along the x-axis and each class is one line.
ax = avg.T.plot(kind='line', figsize=(25, 10))
ax.set_title('Contribution to the target')
ax.set_ylabel('Average contribution')
ax.set_xlabel('ATGC combination')
plt.show()


In [None]:
# Hold out a validation set using train_test_split's default split proportions;
# random_state is fixed so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

In [None]:
# Random forest with library-default hyperparameters; only the seed is pinned.
rf_model = RandomForestClassifier(random_state=1)
# Fit on the training portion of the split only.
rf_model.fit(train_X, train_y)

In [None]:
# Predicted class labels for the held-out validation rows.
rf_val_predictions = rf_model.predict(val_X)
# Show the first 25 predictions as a quick sanity check.
rf_val_predictions[:25]

# Validation

In [None]:
# Accuracy of the validation predictions against the true validation labels.
a_s = accuracy_score(val_y, rf_val_predictions)
print('Accuracy score:',a_s)

# Visualizing accuracy

In [None]:
# Class names in order of first appearance in y; this order fixes the row and
# column order of the confusion matrix.
labl = y.unique().tolist()
# Rows are true labels, columns are predicted labels (sklearn convention).
cm = confusion_matrix(val_y, rf_val_predictions, labels=labl)
print(cm)
# Row-normalise so each row sums to 1 (share of a true class predicted as each label).
# keepdims=True keeps the row sums as a column vector. Without it the 1-D sums
# broadcast across columns, dividing cell (i, j) by the total of row j instead of row i.
cmn = cm.astype('float') / cm.sum(axis=1, keepdims=True)

In [None]:
# Heatmap of the normalised confusion matrix.
fig, ax = plt.subplots(figsize=(12,10))
# Draw on the axes created above instead of relying on the implicit current axes.
# The colour scale is capped at vmax=0.005, so every cell at or above that value
# gets the darkest shade -- presumably to make small off-diagonal rates visible.
sns.heatmap(cmn, annot=True, fmt='.5f', cmap="Blues", vmin=0.0, vmax=0.005, ax=ax)
ax.set_xticklabels(labl, rotation=90)
ax.set_yticklabels(labl, rotation=0)
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Normalised confusion matrix (validation set)')
plt.show()

# Retrain the model on the whole training data set

In [None]:
# Refit the same estimator object on all labelled rows (training + validation portions).
rf_model.fit(X,y)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Predicted class labels for the test set (despite the "val" in the variable name).
rf_val_predictions_test = rf_model.predict(test)
# Show the first 25 predictions.
rf_val_predictions_test[:25]

# Preparing submission

In [None]:
# Build the submission frame: test index plus the predicted 'target' column.
col_nam = test.columns  # feature columns to strip again below
test_pred = test.assign(target=rf_val_predictions_test)  # new frame; `test` itself is untouched
# Drop every feature column, then turn the index back into a regular column.
output = test_pred.drop(columns=col_nam).reset_index()
output.head()

# Writing the submission to a CSV file

In [None]:
# Write the submission without the DataFrame index (the id is already a column).
output.to_csv('submission.csv', index=False)
# NOTE: this only confirms the file was written locally, not that it was accepted anywhere.
print("submission was successful")

In [None]:
from xgboost import XGBClassifier
import optuna

In [None]:
# Reload the raw files for the XGBoost pipeline. Unlike the earlier load there is
# no index_col, so row_id stays an ordinary column and the index is the default one.
df_train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')

In [None]:
# Keep the test row ids: df_test is later narrowed to feature columns only,
# and the ids are needed again when the submission frame is built.
df_test_row_id = df_test.row_id 
# Untouched copy of the training data; df_train itself is filtered and transformed
# below, while this copy is used later to look up labels by row_id.
train_data = df_train.copy()

In [None]:
# Per-class row counts with their share of the training set.
# Columns: 'target' (row count), 'species' (class name), 'percentage'.
target_counts = df_train['target'].value_counts()
# Name the columns explicitly instead of relying on the name pandas gives the
# value_counts() result ('target' before pandas 2.0, 'count' from 2.0 onwards).
df_tar = pd.DataFrame({
    'target': target_counts.to_numpy(),
    'species': target_counts.index,
})
# One vectorised assignment; the former loop recomputed this same column on every pass.
df_tar['percentage'] = df_tar['target'] / df_tar['target'].sum() * 100
df_tar

In [None]:
# Horizontal bar chart: one bar per species, bar length = number of training rows.
plt.figure(figsize=(12,6))
sns.barplot(y='species',x='target',data=df_tar)
# Rotate the x tick labels (the count axis) by 90 degrees.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Every test column except the id, i.e. the feature columns shared with the training data.
# Compare by equality: the former `e not in ('row_id')` tested against a plain string
# (the parentheses do not make a tuple), so it was a substring check that would also
# have excluded any column named e.g. 'row' or 'id'.
cols = [e for e in df_test.columns if e != 'row_id']

In [None]:
# Inner-join train and test on all feature columns: each result row pairs a training
# row with a test row whose feature values match exactly. Both frames have row_id,
# so the merge yields row_id_x (from df_train) and row_id_y (from df_test).
s1 = pd.merge(df_train, df_test, how='inner', on=cols)
s1.head()

In [None]:
# Map each test row_id to the row_id of a training row with identical features.
# Built straight from the two id columns; the former loop materialised every full
# row with s1.loc[i] just to read two scalars. If a test id matches several
# training rows, the last pair wins, as it did in the loop.
dic = dict(zip(s1['row_id_y'].tolist(), s1['row_id_x'].tolist()))

In [None]:
# Row ids of training rows whose feature vector also occurs in the test set.
s1 = s1.set_index('row_id_x')
s1_idx = s1.index.to_list()
# Remove those rows by matching on the row_id column. The former `df_train.drop(s1_idx)`
# interpreted the ids as index labels, which is only right while the index happens to
# coincide with row_id.
df_train = df_train[~df_train['row_id'].isin(s1_idx)]

# Collapse training rows with identical feature values, keeping the first occurrence,
# then renumber the index so later positional indexing lines up with the labels.
df_train = df_train.drop_duplicates(subset=cols, keep='first')
df_train = df_train.reset_index(drop=True)

In [None]:
# Tag every training row with the fold (0-4) in which it serves as validation data.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (_, holdout_rows) in enumerate(kf.split(X=df_train)):
    # holdout_rows are positional indices; they match the labels because df_train
    # was re-indexed with reset_index(drop=True) beforehand.
    df_train.loc[holdout_rows, "kfold"] = fold

In [None]:
# Model inputs: everything except the id, the label and the fold marker.
useful_features = [c for c in df_train.columns if c not in ("row_id", "target", "kfold")]
# Apply log(1 + x) to all feature columns in one vectorised call per frame.
# np.log1p returns a new DataFrame, so df_test is rebound to a fresh object rather than
# having columns assigned into a slice (which can raise SettingWithCopyWarning).
# NOTE: running this cell twice applies the transform twice to df_train.
df_train[useful_features] = np.log1p(df_train[useful_features])
df_test = np.log1p(df_test[useful_features])

In [None]:
def xgboost(trial):
    """Optuna objective: train an XGBClassifier on folds 1-4 and score it on fold 0.

    Reads the module-level ``df_train`` (must contain ``target`` and ``kfold``
    columns) and ``useful_features``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: RMSE between the integer-encoded true and predicted validation
        labels (lower is better).
        NOTE(review): RMSE over encoded class ids depends on the arbitrary label
        order; an error rate such as 1 - accuracy may be a better objective.
    """
    fold = 0

    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches how learning_rate is sampled.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.1, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 3, 25)

    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    le = preprocessing.LabelEncoder()
    ytrain = le.fit_transform(ytrain)
    # Reuse the mapping learned on the training labels. Re-fitting on the validation
    # labels could assign different integers if a class is absent from this fold.
    yvalid = le.transform(yvalid)

    # Configured for GPU training (gpu_hist on device 0).
    model = XGBClassifier(random_state=fold,
                          tree_method='gpu_hist',
                          gpu_id=0, predictor="gpu_predictor",
                          use_label_encoder =False,
                          n_jobs=-1,
                          n_estimators=1000,
                          eval_metric='rmse',
                          learning_rate=learning_rate,
                          reg_lambda=reg_lambda,
                          reg_alpha=reg_alpha,
                          subsample=subsample,
                          colsample_bytree=colsample_bytree,
                          max_depth=max_depth,
                         )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    # RMSE computed with numpy: the former mean_squared_error call referenced a name
    # that is never imported in this notebook and raised NameError.
    residuals = np.asarray(yvalid, dtype=float) - np.asarray(preds_valid, dtype=float)
    rmse = float(np.sqrt(np.mean(residuals ** 2)))
    return rmse

In [None]:
#study = optuna.create_study(direction="minimize")
#study.optimize(xgboost, n_trials=20,gc_after_trial=True)

In [None]:
#study.best_params

In [None]:
# 5-fold cross-validation with fixed hyperparameters; prints the accuracy on each
# held-out fold. `model` and `le` are rebound on every iteration, so after the loop
# they refer to the fit from the last fold (fold 4) only.
for fold in range(5):

    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    le = preprocessing.LabelEncoder()
    ytrain = le.fit_transform(ytrain)
    # Encode validation labels with the mapping learned from the training labels.
    # Re-fitting here could yield a different label-to-integer mapping if a class
    # is missing from the fold, silently corrupting the score.
    yvalid = le.transform(yvalid)

    # Configured for GPU training (gpu_hist on device 0); the seed varies per fold.
    model = XGBClassifier(random_state=fold,
                          tree_method='gpu_hist',
                          gpu_id=0, predictor="gpu_predictor",
                          use_label_encoder =False,
                          n_jobs=-1,
                          n_estimators=1000,
                          eval_metric='rmse',
                          learning_rate=0.0958614407371858,
                          reg_lambda= 0.013277441840190538,
                          reg_alpha= 0.0078106631860548935,
                          subsample= 0.5644494238856632,
                          colsample_bytree=0.854776309994251,
                          max_depth=15,
                         )
    model.fit(xtrain, ytrain)
    # score() is the classifier's mean accuracy on the held-out fold.
    print(f'fold{fold} ',model.score(xvalid,yvalid))

In [None]:
# Predict with `model` as left by the cross-validation loop, i.e. the estimator
# fitted in the final iteration only; the earlier folds' models are not used.
preds = model.predict(df_test)
# Convert the integer class ids back to the original target names, using the
# encoder from that same final iteration.
res = le.inverse_transform(preds)
# Pair the predictions with the test row ids saved before df_test lost its id column.
df = pd.DataFrame({'row_id': df_test_row_id,'target': res})

In [None]:
# For every test row whose features also occur in the training data, replace the
# model's prediction with that training row's label.
for test_id, train_id in dic.items():
    # Label of the first row in the untouched training copy carrying this row_id.
    known_label = train_data.loc[train_data['row_id'] == train_id, 'target'].iloc[0]
    df.loc[df['row_id'] == test_id, 'target'] = known_label

In [None]:
# Write the XGBoost submission (row_id, target) without the DataFrame index.
df.to_csv('Submission_01.csv',index=False)