In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from lightgbm import LGBMClassifier
import seaborn as sns

In [1]:
# Root directory of the competition data; later cells append "/..." style relative paths to it.
train_path = "../input/data-science-spring-osaka-2021"

In [1]:
# Load the competition tables. The directory comes from `train_path` so the
# location is spelled out in exactly one place.
df_train = pd.read_csv(train_path + "/train.csv")
df_test = pd.read_csv(train_path + "/test.csv")
df_action = pd.read_csv(train_path + "/actions.csv")

In [1]:
# Distinct action labels in the action list and in the training table.
action_ = pd.unique(df_action["action_seq"])
train_action = pd.unique(df_train["action_seq"])
print(action_, train_action, sep="\n")

In [1]:
# Labels that appear in the action list but never in the training table.
# A set difference is used (not a symmetric difference) so that a label found
# only in the training table is not reported as "unseen".
unseen_label = set(action_) - set(train_action)
unseen_label

**方針**

jab-jab-bodyupperとjab-jab-upperの差分を取る

その差分を既存のjab-jab-hookに足してjab-jab-bodyhookの学習データを作成

おそらく、jab-jab-bodyupperとjab-jab-upperの差分を取った場合、前半は差分の少ないデータになるはず。

In [1]:
# File paths of the training recordings for the two reference classes.
is_bodyupper = df_train["action_seq"] == "jab-jab-bodyupper"
is_upper = df_train["action_seq"] == "jab-jab-upper"
j_j_bu = df_train.loc[is_bodyupper, "file_path"]
j_j_u = df_train.loc[is_upper, "file_path"]

In [1]:
# Helper used before taking per-class means: the function defined next concatenates
# the CSV files of one class into a single frame (it does not average by itself).
# NOTE(review): `scaler` is not referenced by any other cell visible in this notebook.
scaler = MinMaxScaler()
def join_data(file_paths, base_dir=None):
    """Read several CSV files and stack their rows into one DataFrame.

    Parameters
    ----------
    file_paths : iterable of str
        Paths relative to ``base_dir``; each one is appended to ``base_dir``
        by plain string concatenation, so it is expected to start with "/".
    base_dir : str, optional
        Directory prefix. When omitted, the notebook-level ``train_path``
        is used, which is what the original one-argument call did.

    Returns
    -------
    pandas.DataFrame
        All rows of all files in input order, with a fresh 0..n-1 index.
        An empty DataFrame is returned when ``file_paths`` is empty.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # pd.concat raises on an empty list; keep the "empty in, empty out" behaviour.
        return pd.DataFrame()
    root = train_path if base_dir is None else base_dir
    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv(root + file_path) for file_path in file_paths]
    return pd.concat(frames).reset_index(drop=True)

In [1]:
# Stack all recordings of each reference class into one frame per class.
df_jjbu, df_jju = [join_data(paths) for paths in (j_j_bu, j_j_u)]

In [1]:
# Plot the ELBOW_L channel across the stacked recordings; it looks roughly periodic.
elbow_left = df_jju["ELBOW_L"]
elbow_left.plot(figsize=(15, 6))

In [1]:
# Column-wise gap between the mean of the two reference classes.
mean_jjbu = df_jjbu.mean()
mean_jju = df_jju.mean()
diff = mean_jjbu - mean_jju
diff

In [1]:
# File paths of the "jab-jab-hook" training recordings, and how many there are.
is_hook = df_train["action_seq"] == "jab-jab-hook"
j_j_h = df_train.loc[is_hook, "file_path"]
len(j_j_h)

In [1]:
# Build the synthetic rows: for every "jab-jab-hook" recording take the
# per-column mean and shift it by `diff`.
# The rows are collected in a list and turned into a frame once; growing a
# frame with pd.concat inside the loop re-copies all previous rows each time
# and left every row with the same index label (0).
shifted_means = [
    pd.read_csv(train_path + file_path).mean() + diff
    for file_path in j_j_h
]
# One row per recording, labelled 0..n-1.
df = pd.DataFrame(shifted_means).reset_index(drop=True)

In [1]:
# Mark every averaged column with a '_mean' suffix, then attach the class label
# that these synthetic rows stand for.
df = df.add_suffix('_mean')
df["action_seq"] = "jab-jab-bodyhook"
df

In [1]:
def add_mean_as_feature(df, base_dir='../input/data-science-spring-osaka-2021'):
    """Append per-file column means as new feature columns.

    For every value in ``df["file_path"]`` the CSV at ``base_dir + path`` is
    read and reduced to its column means. The means become one row of new
    columns named ``<original column>_mean``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``file_path`` column; it is not modified.
    base_dir : str, optional
        Directory prefix that each path is appended to. Defaults to the
        directory the original implementation hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the ``*_mean`` columns added on the right, sharing
        ``df``'s index.
    """
    # Collect one Series of means per file and build the frame once, instead of
    # calling pd.concat inside the loop (which re-copies all earlier rows).
    per_file_means = [pd.read_csv(base_dir + path).mean() for path in df["file_path"]]
    df_means = pd.DataFrame(per_file_means)
    # Suffix makes it obvious downstream that these columns are averages.
    df_means = df_means.add_suffix('_mean')
    # Rows were produced in df order, so they can take over df's index labels directly.
    df_means.index = df.index
    return pd.concat([df, df_means], axis=1)

In [1]:
# Run the mean-feature step defined above on both the training and the test table.
df_train, df_test = [add_mean_as_feature(table) for table in (df_train, df_test)]

In [1]:
# Put the synthetic rows in front of the real training rows.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# the index labels of `df` and `df_train` collide, which makes any later
# label-based lookup ambiguous.
df_train = pd.concat([df, df_train], ignore_index=True)
df_train

In [1]:
# Distinct class labels now present in the training table.
pd.unique(df_train["action_seq"])

In [1]:
# Separate the features (explanatory variables) from the target (response variable).
y_train = df_train["action_seq"]
X_train = df_train.drop(columns=['file_path', 'action_seq'])
X_test = df_test.drop(columns=['file_path'])

In [1]:
# Encode the target labels as integers; `le` is kept so predictions can be decoded later.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

In [1]:
# Hold out part of the training data in turn to estimate accuracy.

# -----------------------------------
# Validation
# -----------------------------------
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

# Accuracy of each fold.
scores_accuracy = []
lgb_params = {
                "n_estimators": 50000,
                "learning_rate": 0.05,
                "num_leaves": 8,
              }

# Cross-validation: split the training data into 3 folds and use each fold
# once as the validation set.
# The split is stratified on the target so that every class is represented in
# each training fold; with a plain KFold a class can be missing from a fold's
# training part of this multiclass problem.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(X_train, y_train):
    # Positional split into this fold's training and validation parts.
    X_train_, X_val = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_train_, y_val = y_train[tr_idx], y_train[va_idx]

    # Fit with early stopping monitored on the validation fold.
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments are
    # version-dependent in LightGBM (newer releases expect callbacks instead) —
    # confirm against the installed version.
    model = LGBMClassifier(**lgb_params)
    model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], eval_metric='logloss', early_stopping_rounds=50,verbose=False)
    # Predict class labels (not probabilities) for the validation fold.
    y_pred = model.predict(X_val)

    # Score this fold.
    accuracy = accuracy_score(y_val, y_pred)
    print(f'accuracy: {accuracy:.4f}')
    # Keep the fold score for the final average.
    scores_accuracy.append(accuracy)

# Report the mean accuracy over all folds.
accuracy = np.mean(scores_accuracy)
print(f'CV mean accuracy: {accuracy:.4f}')

In [1]:
# Refit on the complete training set and predict the test set for the submission file.
# NOTE(review): this fit has no eval_set / early stopping, unlike the
# cross-validation loop, so the full n_estimators from lgb_params is requested —
# confirm that is intended.
model = LGBMClassifier(**lgb_params).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [1]:
# Load the submission template from the shared data directory in `train_path`
# rather than repeating the directory literal.
df_sub = pd.read_csv(train_path + '/sample_submission.csv')

In [1]:
# Decode the predicted integer classes back to label strings and store them in the template.
predicted_labels = le.inverse_transform(y_pred)
df_sub['action_seq'] = predicted_labels
df_sub

In [1]:
# Show how many test rows were predicted for each action_seq.
# The Series is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, so the positional form no longer plots the
# label counts there.
g = sns.countplot(x=df_sub['action_seq'])
# Rotate the class names so they do not overlap.
l = g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [1]:
# Write the submission file (without the index column) for upload.
submission_file = 'mysubmission.csv'
df_sub.to_csv(submission_file, index=False)