# TPS Feb 2022 - Bacteria Species (Keras)

* Data Observation / データ観察　
* Modeling with Keras / Kerasでモデリング
* Analysis / 検証
* Submission / 提出

# Data Observation / データ観察

## Goal
The goal of this competition is to classify 10 different bacteria species using data from a genomic analysis technique that has some data compression and data loss.

## 目的　
このコンペティションの目的は、ゲノム解析技術のデータから10種類の細菌を分類すること。

In [None]:
# Import basic libraries.
# 最初に必要なライブラリをインポート。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Keep notebook output compact: set pandas' row and column display limits to 10.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 10)

In [None]:
# Load the competition's train, test and sample-submission CSV files.
# All three live in the same Kaggle input directory.
_INPUT_DIR = "/kaggle/input/tabular-playground-series-feb-2022"
train = pd.read_csv(f"{_INPUT_DIR}/train.csv")
test = pd.read_csv(f"{_INPUT_DIR}/test.csv")
sample_submission = pd.read_csv(f"{_INPUT_DIR}/sample_submission.csv")


In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Preview the first rows of the sample submission to see the expected output format.
sample_submission.head()

In [None]:
# Keep the test set's "row_id" column aside; it is needed later to build
# the submission file.
row_id = test['row_id']

In [None]:
# Stack train and test into one frame so both can be inspected together.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat([train,test], ignore_index = True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

Check the meaning of each column in df.
* row_id : Row number  
* A0T0G0C10~A10T0G0C0 : histogram of bases  
* target : bacteria species 

The Target is "target", and the columns other than "row_id" are the features.


dfの各列に何が記載されているか確認します。
* row_id : 行番号  
* A0T0G0C10~A10T0G0C0 : 塩基のヒストグラム
* target : 細菌の種類  

targetが目的変数で、row_id以外のその他の列が特徴量になる。  




In [None]:
# Show column dtypes, non-null counts and memory usage of the combined frame.
df.info()

In [None]:
# Summarize the size and completeness of the combined frame.
# `df` holds the train and test rows together, so the labels say "combined"
# rather than "train".
n_rows, n_cols = df.shape
print(f'Number of rows in combined data: {n_rows}')
print(f'Number of columns in combined data: {n_cols}')
print(f'Number of values in combined data: {df.count().sum()}')
print(f'Number of missing values in combined data: {df.isna().sum().sum()}')

In [None]:
# Show descriptive statistics for the columns of the combined frame.
df.describe()

In [None]:
# Check how the target is distributed: class counts as a horizontal bar
# chart (left) and class shares as a pie chart (right).
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(14, 5))

ax_bar.set_title("Bar of target")
sns.countplot(y='target', data=df, ax=ax_bar)
# No legend on the bar chart: none of its artists carries a label, so
# calling plt.legend() here only triggered a "no handles with labels" warning.

species = df.groupby('target').size()
ax_pie.set_title("Pie of target")
ax_pie.pie(x=species,
           labels=species.index,
           counterclock=False, startangle=90,
           autopct='%1.1f%%', pctdistance=0.7)
plt.show()

# Modeling with Keras / Kerasでモデリング

In [None]:
# Split df back into train_new and test_new: rows that have a target value
# go to train_new, rows whose target is missing go to test_new.
has_target = df['target'].notna()
train_new = df[has_target]
test_new = df[~has_target]

In [None]:
# Display train_new.
train_new

In [None]:
# Display test_new.
test_new

In [None]:
# Display train_new once more before splitting it into X and y.
train_new

In [None]:
# Split train_new into the feature frame X and the label series y.
# Both row_id and target are removed from X.
y = train_new['target']
X = train_new.drop(columns=['row_id', 'target'])

In [None]:
# Display the feature frame.
X

In [None]:
# Encode the string values in the target as integer labels.
# The fitted encoder `le` is kept so predictions can be mapped back to names.
from sklearn import preprocessing

targets = y.unique()
le = preprocessing.LabelEncoder().fit(targets)
y = le.transform(y)

In [None]:
# Split X and y into training and validation sets for modeling.
# test_size=0.2 holds out 20% of the rows for validation; random_state=0
# fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2,  random_state=0)


In [None]:
# Check the per-sample input shape Keras needs: only the number of feature
# columns, without the row (sample) count that X_train.shape also contains.
input_shape = [X_train.shape[1]]
print(f"Input shape: {input_shape}")

In [None]:
# Build the Keras model: a small 1D convolutional network over the
# feature vector, ending in a 10-way softmax (one output per species).
from tensorflow import keras
from tensorflow.keras import layers

# Take the feature count from the data instead of hard-coding it.
n_features = X_train.shape[1]

model = keras.Sequential([
    # `shape` must be a tuple; `(286)` without a trailing comma is the int 286.
    layers.Input(shape=(n_features,)),
    # Add a trailing channel axis so the Conv1D layers receive (steps, channels).
    layers.Reshape(target_shape=(n_features, 1)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.5),
    layers.Dense(units=10, activation="softmax"),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Render the model architecture, including layer shapes, to a PNG file.
# NOTE(review): the file name says "ticket_classifier", which does not match
# this bacteria model -- presumably a leftover from another example; consider renaming.
keras.utils.plot_model(model, "ticket_classifier_with_shape_info.png", show_shapes=True)

In [None]:
# Compile the model with Keras.
# The sparse variant of categorical cross-entropy is used because y holds
# integer class labels (output of the LabelEncoder), not one-hot vectors.
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [None]:
# Callbacks used during training.
# Only checkpointing is active: with save_best_only=True the model is written
# to "checkpoint_path.keras" when the monitored val_loss is the best so far,
# so the best epoch can be reloaded after training.
# An EarlyStopping callback (monitor="val_accuracy", patience=10) is
# deliberately left disabled here.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="checkpoint_path.keras",
    monitor="val_loss",
    save_best_only=True,
)
callbacks_list = [best_model_checkpoint]

In [None]:
# Train the model with Keras.
# (X_val, y_val) is passed as validation data and callbacks_list supplies the
# checkpoint callback. The returned History object is kept in `history` so
# the loss/accuracy curves can be plotted afterwards.
history = model.fit(X_train,
                    y_train,
                    epochs=100,
                    batch_size=512,
                    callbacks=callbacks_list,
                    validation_data=(X_val, y_val))

# Analysis / 検証

In [None]:
# Plot training and validation loss per epoch.
import matplotlib.pyplot as plt

history_dict = history.history
loss_values = history_dict["loss"]
val_loss_values = history_dict["val_loss"]
# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(loss_values) + 1)

plt.plot(epochs, loss_values, "bo", label="Training loss")
# Legend label spelling fixed ("Varidation" -> "Validation").
plt.plot(epochs, val_loss_values, "b", label="Validation loss")
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Plot training and validation accuracy per epoch.
# Reuses `history_dict` and `epochs` from the loss-plot cell.
acc = history_dict["accuracy"]
val_acc = history_dict["val_accuracy"]

plt.plot(epochs, acc, "bo", label="Training acc")
# Legend label spelling fixed ("Varidation" -> "Validation").
plt.plot(epochs, val_acc, "b", label="Validation acc")
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Reload the checkpoint written by ModelCheckpoint (the best epoch by
# val_loss) and evaluate it on the validation set.

test_model = keras.models.load_model("checkpoint_path.keras")
test_model.evaluate(X_val, y_val)

# Submission / 提出

In [None]:
# Build the test feature frame the same way as X: remove row_id and the
# target column (which is missing for every test row).
X_test = test_new.drop(columns=['row_id', 'target'])

In [None]:
# Display the test feature frame.
X_test

In [None]:
# Predict on the test features with the reloaded checkpoint model; the
# softmax output layer yields one score per class for each row.
predictions = test_model.predict(X_test)

In [None]:
# Display the raw prediction array.
predictions

In [None]:
# Pick the highest-scoring class index for every test row in one vectorized
# call (axis=1 takes the argmax within each row) instead of a Python loop.
max_predictions = np.argmax(predictions, axis=1)

In [None]:
# Map the predicted integer labels back to the original target names using
# the LabelEncoder fitted on the training targets.
bacteria = le.inverse_transform(max_predictions)

In [None]:
# Assemble the submission (row_id plus predicted target name) and write it
# to submission.csv without the DataFrame index.
submission = pd.DataFrame({"row_id": row_id, "target": bacteria})
submission.to_csv("submission.csv", index=False)
print("Your submission was successfully saved!")

In [None]:
# Display the submission frame as a final check.
submission