In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import warnings
import sys
print(sys.path)
sys.path.append('../modules/')

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

train_origin = pd.read_csv('../input/train.csv')
test_origin = pd.read_csv('../input/test.csv')

df_train = train_origin.copy()
df_test = test_origin.copy()

df_sample_sub = pd.read_csv('../input/sample_submit.csv', header=None)
df_sample_sub.columns = ['index', 'genre']
df_genre_labels = pd.read_csv('../input/genre_labels.csv')

['/Users/shugo/Desktop/SIGNATE/SIGNATE_StudentCup2021/shu421/notebooks', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python37.zip', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7', '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/lib-dynload', '', '/Users/shugo/Desktop/SIGNATE/SIGNATE_StudentCup2021/shu421/lib/python3.7/site-packages', '/Users/shugo/Desktop/SIGNATE/SIGNATE_StudentCup2021/shu421/lib/python3.7/site-packages/IPython/extensions', '/Users/shugo/.ipython']


In [2]:
N_CLASSES = 11

# testのジャンルを-100として結合
def merge_train_test(df_train, df_test):
    if 'genre' not in df_test.columns.tolist():
        df_test['genre'] = -100
    res = pd.concat([df_train, df_test])
    res.reset_index(inplace=True, drop = True)
    return res

def split_train_test(df):
    df_train = df[df['genre'] != -100]
    df_test = df[df['genre'] == -100]
    df_train.reset_index(inplace=True, drop=True)
    df_test.reset_index(inplace=True, drop =True)
    return df_train, df_test



warnings.simplefilter('ignore', pd.core.common.SettingWithCopyWarning)
warnings.simplefilter('ignore', UserWarning)


N_CLASSES = 11


INPUT = Path("../input")
df_train = pd.read_csv(INPUT / "train.csv")
df_test = pd.read_csv(INPUT / "test.csv")
df_sample_sub = pd.read_csv(INPUT / "sample_submit.csv", header=None)
df_sample_sub.columns = ["index", "genre"]
df_genre_labels = pd.read_csv(INPUT / "genre_labels.csv")




class GroupFeatureExtractor:  # 参考: https://signate.jp/competitions/449/discussions/lgbm-baseline-lb06240
    EX_TRANS_METHODS = ["deviation", "zscore"]
    def __init__(self, group_key, group_values, agg_methods):
        self.group_key = group_key
        self.group_values = group_values
        self.ex_trans_methods = [m for m in agg_methods if m in self.EX_TRANS_METHODS]
        self.agg_methods = [m for m in agg_methods if m not in self.ex_trans_methods]
        self.df_agg = None
    def fit(self, df_train, y=None):
        if not self.agg_methods:
            return
        dfs = []
        for agg_method in self.agg_methods:
            if callable(agg_method):
                agg_method_name = agg_method.__name__
            else:
                agg_method_name = agg_method
            df_agg = (df_train[[self.group_key] + self.group_values].groupby(self.group_key).agg(agg_method))
            df_agg.columns = self._get_column_names(agg_method_name)
            dfs.append(df_agg)
        self.df_agg = pd.concat(dfs, axis=1).reset_index()
    def transform(self, df_eval):
        key = self.group_key
        if self.agg_methods:
            df_features = pd.merge(df_eval[[self.group_key]], self.df_agg, on=self.group_key, how="left")
        else:
            df_features = df_eval[[self.group_key]].copy()
        if self.ex_trans_methods:
            if "deviation" in self.ex_trans_methods:
                df_features[self._get_agg_column_names("deviation")] = df_eval[self.group_values] - df_eval[[key]+self.group_values].groupby(key).transform("mean")
            if "zscore" in self.ex_trans_methods:
                df_features[self._get_column_names("zscore")] = (df_eval[self.group_values] - df_eval[[key]+self.group_values].groupby(key).transform("mean")) \
                                                                / (df_eval[[key]+self.group_values].groupby(key).transform("std") + 1e-8)
        df_features.drop(self.group_key, axis=1, inplace=True)
        return df_features
    def _get_column_names(self, method):
        return [f"agg_{method}_{col}_grpby_{self.group_key}" for col in self.group_values]
    def fit_transform(self, df_train, y=None):
        self.fit(df_train, y=y)
        return self.transform(df_train) 
    
    
class KNNFeatureExtractor:
    def __init__(self, n_neighbors=5):
        self.knn = KNeighborsClassifier(n_neighbors + 1)
    def fit(self, X, y):
        self.knn.fit(X, y)
        self.y = y if isinstance(y, np.ndarray) else np.array(y)
        return self
    def transform(self, X, is_train_data):
        distances, indexes = self.knn.kneighbors(X)
        distances = distances[:, 1:] if is_train_data else distances[:, :-1]
        indexes = indexes[:, 1:] if is_train_data else indexes[:, :-1]
        labels = self.y[indexes]
        score_columns = [f"knn_score_class{c:02d}" for c in range(N_CLASSES)]
        df_knn = pd.DataFrame(
            [np.bincount(labels_, distances_, N_CLASSES) for labels_, distances_ in zip(labels, 1.0 / distances)],
            columns=score_columns
        )
        # 最大スコア
        df_knn['max_knn_scores'] = df_knn.max(axis=1)

        # 最大スコアとの差。0は最大スコアを表す
        for col in score_columns:
            df_knn[f'sub_max_knn_scores_{col}'] = df_knn['max_knn_scores'] - df_knn[col]

        # 最大スコアとの比。1は最大スコアを表す
        for col in score_columns:
            df_knn[f'div_max_knn_scores_{col}'] = df_knn[col] / df_knn['max_knn_scores']

        # それぞれのスコア同士の差
        for i, col1 in enumerate(score_columns):
            for j, col2 in enumerate(score_columns[i+1:], i+1): # 全パターンを網羅できる
                df_knn[f'sub_{col1}_{col2}'] = df_knn[col1] - df_knn[col2]

        # knnスコアの合計
        df_knn['sum_knn_scores'] = df_knn.sum(axis=1)

        return df_knn
# parameters - knn feature weights

knn_features = [
   'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
   'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
   'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
   'region_S', 'region_T', 'region_unknown',
   'standardscaled_popularity', 'standardscaled_duration_ms',
   'standardscaled_acousticness', 'standardscaled_positiveness',
   'standardscaled_danceability', 'standardscaled_loudness',
   'standardscaled_energy', 'standardscaled_liveness',
   'standardscaled_speechiness', 'standardscaled_instrumentalness',
   'standardscaled_log_tempo', 'standardscaled_num_nans'
]

dict_feature_weights = {}

for col in [
    'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
    'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
    'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
    'region_S', 'region_T', 'region_unknown'
]:
    dict_feature_weights[col] = 100.0

for col in [
    'standardscaled_duration_ms',
    'standardscaled_acousticness', 'standardscaled_positiveness',
    'standardscaled_danceability', 'standardscaled_loudness',
    'standardscaled_energy', 'standardscaled_liveness',
    'standardscaled_speechiness', 'standardscaled_instrumentalness'
]:
    dict_feature_weights[col] = 1.0

dict_feature_weights["standardscaled_popularity"] = 8.0
dict_feature_weights["standardscaled_log_tempo"] = 0.001
dict_feature_weights["standardscaled_num_nans"] = 100.0

knn_feature_weights = np.array([dict_feature_weights[col] for col in knn_features])

# parameters

# def lgb_metric(preds, data):  
#     pred_labels = preds.reshape(N_CLASSES, -1).argmax(axis=0)
#     score = f1_score(data.get_label(), pred_labels, average="macro")
#     return "macro_f1", score, True

learning_rate = 0.01
lgb_params = {
    "objective": "multiclass",
    "num_class": N_CLASSES,
    #"metric": "None",
    "learning_rate": learning_rate,
    "num_leaves": 3,
    "min_data_in_leaf": 40,
    #"colsample_bytree": 1.0,
    #"feature_fraction": 1.0,
    #"bagging_freq": 0,
    #"bagging_fraction": 1.0,
    "verbosity": 0,
    "seed": 42,
    "force_col_wise":True
}

knn_n_neighbors = 6

df_main = merge_train_test(df_train, df_test)



In [3]:
from tqdm import tqdm

In [None]:
for pseudo_labeling_threshold in tqdm([0.95, 0.925, 0.9, 0.875, 0.85, -np.inf]):
    df = df_main.copy()
    
    
    # feature engineering
    df["genre_name"] = df["genre"].map(dict(df_genre_labels[["labels", "genre"]].values))
    df["tempo"] = df["tempo"].map(lambda x: sum(map(int, x.split("-"))) / 2)
    df = pd.concat([df, pd.get_dummies(df["region"]).rename(columns={"unknown": "region_unknown"})], axis=1)
    df["num_nans"] = 0
    for col in [
        "acousticness",
        "positiveness",
        "danceability",
        "energy",
        "liveness",
        "speechiness",
        "instrumentalness",
    ]:
        df["num_nans"] += df[col].isna()
    class CountEncoder:
        def fit(self, series):
            self.counts = series.groupby(series).count()
            return self
        def transform(self, series):
            return series.map(self.counts).fillna(0)
        def fit_transform(self, series):
            return self.fit(series).transform(series)
    columns_count_enc = ["region"]
    for col in columns_count_enc:
        df["countenc_" + col] = CountEncoder().fit_transform(df[col])
        df.loc[df[col].isna().values, "countenc_" + col] = np.nan
    columns_label_enc = ["region"]
    for col in columns_count_enc:
        df["labelenc_" + col] = LabelEncoder().fit_transform(df[col])
        df.loc[df[col].isna().values, "labelenc_" + col] = np.nan
  
    df["log_tempo"] = np.log(df["tempo"])
    gfe = GroupFeatureExtractor(
        "region", 
        ['popularity', 'duration_ms', 'acousticness', 'positiveness', 'danceability', 'loudness', 'energy', 'liveness', 'speechiness', 'instrumentalness', 'log_tempo'],
        ["zscore"]
    )
    df = pd.concat([df, gfe.fit_transform(df)], axis=1)

    # feature scaling
    df["log_tempo"] = np.log(df["tempo"])
    for col in [
        'popularity', 'duration_ms', 'acousticness',
        'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness', 'log_tempo', 'num_nans',
    ]:
        df["standardscaled_" + col] = StandardScaler().fit_transform(df[[col]])[:, 0]
    df_train, df_test = split_train_test(df)
    target = df_train["genre"]
    
    # train
    
    N_SPLITS = 15
    SEED_SKF = 42
    np.random.seed(42)
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
    oof = np.zeros((len(df_train), N_CLASSES))
    predictions = np.zeros((len(df_test), N_CLASSES))
    df_feature_importance = pd.DataFrame()
    features_numerical = [
        'popularity', 'duration_ms', 'acousticness',
        'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness', 'tempo',
        'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
        'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
        'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
        'region_S', 'region_T', 'region_unknown', 'countenc_region',
        'num_nans',
        'agg_zscore_popularity_grpby_region',
        'agg_zscore_duration_ms_grpby_region',
        'agg_zscore_acousticness_grpby_region',
        'agg_zscore_positiveness_grpby_region',
        'agg_zscore_danceability_grpby_region',
        'agg_zscore_loudness_grpby_region', 'agg_zscore_energy_grpby_region',
        'agg_zscore_liveness_grpby_region',
        'agg_zscore_speechiness_grpby_region',
        'agg_zscore_instrumentalness_grpby_region',
        'agg_zscore_log_tempo_grpby_region',
        'knn_score_class00', 'knn_score_class01',
        'knn_score_class02', 'knn_score_class03', 'knn_score_class04',
        'knn_score_class05', 'knn_score_class06', 'knn_score_class07',
        'knn_score_class08', 'knn_score_class09', 'knn_score_class10',
        'max_knn_scores',
        'sub_max_knn_scores_knn_score_class00',
        'sub_max_knn_scores_knn_score_class01',
        'sub_max_knn_scores_knn_score_class02',
        'sub_max_knn_scores_knn_score_class03',
        'sub_max_knn_scores_knn_score_class04',
        'sub_max_knn_scores_knn_score_class05',
        'sub_max_knn_scores_knn_score_class06',
        'sub_max_knn_scores_knn_score_class07',
        'sub_max_knn_scores_knn_score_class08',
        'sub_max_knn_scores_knn_score_class09',
        'sub_max_knn_scores_knn_score_class10',
        'sum_knn_scores'
    ]
    sub_knn_score_n_m = []
    score_columns = [f"knn_score_class{c:02d}" for c in range(N_CLASSES)]
    for i, col1 in enumerate(score_columns):
        for j, col2 in enumerate(score_columns[i+1:], i+1):
            sub_knn_score_n_m.append(f'sub_{col1}_{col2}')
            
    features_numerical += sub_knn_score_n_m        
    features_categorical = ["labelenc_region"]
    features = features_numerical + features_categorical
    for fold_, (indexes_trn, indexes_val) in enumerate(skf.split(df_train.values, target.values)):
        print(f"------------------------------ fold {fold_} ------------------------------")
        df_trn = df_train.loc[indexes_trn].reset_index(drop=True)
        df_val = df_train.loc[indexes_val].reset_index(drop=True)
        target_trn = target.loc[indexes_trn].reset_index(drop=True)
        target_val = target.loc[indexes_val].reset_index(drop=True)
        # make knn features
        X = df_trn[knn_features].fillna(0.0).values * knn_feature_weights
        knn_feature_extractor = KNNFeatureExtractor(knn_n_neighbors).fit(X, target_trn)
        df_trn = pd.concat([df_trn, knn_feature_extractor.transform(X, is_train_data=True)], axis=1)
        X = df_val[knn_features].fillna(0.0).values * knn_feature_weights
        df_val = pd.concat([df_val, knn_feature_extractor.transform(X, is_train_data=False)], axis=1)
        X = df_test[knn_features].fillna(0.0).values * knn_feature_weights
        df_test_knn_features = knn_feature_extractor.transform(X, is_train_data=False)
        for col in df_test_knn_features.columns:
            df_test[col] = df_test_knn_features[col]
        lgb_train = lgb.Dataset(
            df_trn.loc[:, features],
            label=target_trn,
            feature_name=features,
            categorical_feature=features_categorical
        )
        lgb_valid = lgb.Dataset(
            df_val.loc[:, features],
            label=target_val,
            feature_name=features,
            categorical_feature=features_categorical
        )
        lgb_params["learning_rate"] = learning_rate + np.random.random() * 0.001  # おまじない
        num_round = 999999999
        model = lgb.train(
            lgb_params,
            lgb_train, 
            num_round, 
            valid_sets=[lgb_train, lgb_valid], 
            verbose_eval=300,
            early_stopping_rounds=300 if num_round >= 1e8 else None,
            fobj=None,
            #feval=lgb_metric,
        )
        # cv
        prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round  # おまじない
        oof[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)
        # feature importance
        df_fold_importance = pd.DataFrame()
        df_fold_importance["feature"] = features
        df_fold_importance["importance"] = model.feature_importance()
        df_fold_importance["fold"] = fold_
        df_feature_importance = pd.concat([df_feature_importance, df_fold_importance], axis=0)
        # prediction for test data
        predictions += model.predict(df_test[features], num_iteration=prediction_round) / N_SPLITS
        print()
    
    score = f1_score(target, oof.argmax(1), average="macro")
    print("CV score (not reliable!)")
    print(f"  f1: {score:8.5f}")
    print()
    print(classification_report(target, oof.argmax(1)))
    
    df_test["prediction"] = predictions.argmax(1)
    df_test["confidence"] = predictions.max(1)
    df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)
    df = merge_train_test(df_train, df_test)
    df_main["genre"] = df_main["index"].map(dict(df[["index", "genre"]].values))
    print((df_test["confidence"] > pseudo_labeling_threshold).sum(), f"rows were filled. (confidence>{pseudo_labeling_threshold})")
    print("filled test labels:", np.bincount(df_test[df_test["genre"]!=-100]["genre"]))
    print("\n")

  0%|          | 0/6 [00:00<?, ?it/s]

------------------------------ fold 0 ------------------------------
Training until validation scores don't improve for 300 rounds
[300]	training's multi_logloss: 0.665694	valid_1's multi_logloss: 0.695353
[600]	training's multi_logloss: 0.590728	valid_1's multi_logloss: 0.676796
[900]	training's multi_logloss: 0.542854	valid_1's multi_logloss: 0.676719
Early stopping, best iteration is:
[711]	training's multi_logloss: 0.571438	valid_1's multi_logloss: 0.675461

------------------------------ fold 1 ------------------------------
Training until validation scores don't improve for 300 rounds
[300]	training's multi_logloss: 0.664605	valid_1's multi_logloss: 0.646881
[600]	training's multi_logloss: 0.588284	valid_1's multi_logloss: 0.62156
[900]	training's multi_logloss: 0.537728	valid_1's multi_logloss: 0.619386
[1200]	training's multi_logloss: 0.498112	valid_1's multi_logloss: 0.620138
Early stopping, best iteration is:
[968]	training's multi_logloss: 0.528	valid_1's multi_logloss: 0.61

 17%|█▋        | 1/6 [07:59<39:57, 479.52s/it]


CV score (not reliable!)
  f1:  0.68150

              precision    recall  f1-score   support

           0       0.82      0.74      0.78        42
           1       0.54      0.40      0.46       207
           2       0.74      0.60      0.66       193
           3       0.86      0.83      0.84       464
           4       0.67      0.58      0.62        45
           5       0.65      0.53      0.59       131
           6       0.58      0.36      0.44        50
           7       0.63      0.62      0.63       334
           8       0.72      0.79      0.76      1326
           9       0.91      0.88      0.90        85
          10       0.82      0.84      0.83      1656

    accuracy                           0.76      4533
   macro avg       0.72      0.65      0.68      4533
weighted avg       0.76      0.76      0.76      4533

184 rows were filled. (confidence>0.95)
filled test labels: [  1   0   2  24   0   3   0   0  15  13 126]


------------------------------ fold 0

 33%|███▎      | 2/6 [15:25<30:39, 459.84s/it]


CV score (not reliable!)
  f1:  0.68426

              precision    recall  f1-score   support

           0       0.79      0.72      0.76        43
           1       0.57      0.46      0.51       207
           2       0.73      0.59      0.65       195
           3       0.87      0.83      0.85       488
           4       0.68      0.51      0.58        45
           5       0.66      0.53      0.59       134
           6       0.53      0.40      0.45        50
           7       0.64      0.60      0.62       334
           8       0.72      0.80      0.76      1341
           9       0.91      0.91      0.91        98
          10       0.84      0.85      0.85      1782

    accuracy                           0.77      4717
   macro avg       0.72      0.66      0.68      4717
weighted avg       0.77      0.77      0.77      4717

341 rows were filled. (confidence>0.925)
filled test labels: [  4   3  12  46   1  11   0   0 138   0 126]


------------------------------ fold 

 50%|█████     | 3/6 [22:56<22:46, 455.64s/it]


CV score (not reliable!)
  f1:  0.70433

              precision    recall  f1-score   support

           0       0.83      0.74      0.79        47
           1       0.57      0.46      0.51       210
           2       0.73      0.59      0.65       207
           3       0.88      0.85      0.86       534
           4       0.76      0.63      0.69        46
           5       0.69      0.53      0.60       145
           6       0.71      0.34      0.46        50
           7       0.64      0.62      0.63       334
           8       0.75      0.82      0.78      1479
           9       0.93      0.91      0.92        98
          10       0.85      0.86      0.86      1908

    accuracy                           0.79      5058
   macro avg       0.76      0.67      0.70      5058
weighted avg       0.79      0.79      0.79      5058

345 rows were filled. (confidence>0.9)
filled test labels: [  4   2   8  47   2   3   1   3 180   1  94]


------------------------------ fold 0 

 67%|██████▋   | 4/6 [30:11<14:55, 447.61s/it]


CV score (not reliable!)
  f1:  0.70920

              precision    recall  f1-score   support

           0       0.89      0.80      0.85        51
           1       0.59      0.44      0.51       212
           2       0.75      0.61      0.67       215
           3       0.88      0.86      0.87       581
           4       0.72      0.60      0.66        48
           5       0.68      0.55      0.61       148
           6       0.59      0.33      0.42        51
           7       0.64      0.63      0.63       337
           8       0.78      0.83      0.81      1659
           9       0.92      0.90      0.91        99
          10       0.85      0.87      0.86      2002

    accuracy                           0.80      5403
   macro avg       0.75      0.68      0.71      5403
weighted avg       0.80      0.80      0.80      5403

315 rows were filled. (confidence>0.875)
filled test labels: [  0   5   6  30   5   6   0  16 167   3  77]


------------------------------ fold 

In [None]:
df_submission = df_sample_sub.copy()
df_submission["genre"] = df_submission["index"].map(dict(df_main[["index", "genre"]].values))
# genreのどこにもnanがなかったらという条件。あるならAssertion Error
assert not df_submission["genre"].isna().any()

print("genre counts")
display(df_submission["genre"].value_counts().sort_index())

print("\nfirst 10 test data")
display(df_submission.head(10))

# make submission file
df_submission.to_csv("../outputs/sub_05_06.csv", header=None, index=False)

In [None]:
def visualize_confusion_matrix(y_true,
                                                      pred_label,
                                                      height=.6,
                                                      labels=None):
    conf = confusion_matrix(y_true=y_true,
                                               y_pred=pred_label,
                                               normalize='true')
    
    n_labels = len(conf)
    size = n_labels * height
    fig , ax = plt.subplots(figsize=(size*2, size*1.5))
    sns.heatmap(conf, cmap='YlOrBr', ax=ax, annot=True, fmt='.2f')
    ax.set_ylabel('Label')
    ax.set_xlabel('Predict')
    
    if labels is not None:
        ax.set_yticklabels(labels)
        ax.set_xticklabels(labels)
        ax.tick_params('y', labelrotation=0)
        ax.tick_params('x', labelrotation=90)
        
    return fig

oof_label = pd.Series(np.argmax(oof, axis=1))
fig = visualize_confusion_matrix(y_true=target, pred_label=oof_label, labels=df_genre_labels['genre'])
fig.show()

In [None]:
# Adversal Validation
from Adversal_Validation import Lgbm

df_tn = df_train.copy()
df_tt = df_test.copy()

X = df_tn[knn_features].fillna(0.0).values * knn_feature_weights
    
knn_feature_extractor = KNNFeatureExtractor(knn_n_neighbors).fit(X, target)
df_tn = pd.concat([df_tn, knn_feature_extractor.transform(X, is_train_data=True)], axis=1)
    
X = df_test[knn_features].fillna(0.0).values * knn_feature_weights
df_test_knn_features = knn_feature_extractor.transform(X, is_train_data=False)

train_feat_df = df_tn.drop(['genre', 'index', 'region', 'genre_name'], axis=1)
test_feat_df = df_tt.drop(['genre', 'index', 'region', 'genre_name', 'prediction'], axis=1)


params = {'boosting_type': 'gbdt',
          'objective': 'cross_entropy',
          'verbosity':-1}

model_ad = Lgbm(params)
model_ad.adversal_validation(train_feat_df,test_feat_df)
model_ad.visualize_importance()

In [None]:
df_sample_sub['genre'] = predictions.argmax(axis=1)
display(df_sample_sub)
df_sample_sub.to_csv('../outputs/sub_3.csv', index=False, header=False)