In [None]:
import datetime
import logging

import requests
import os
import joblib

import numpy as np
import pandas as pd
pd.options.display.max_columns = 300

from pathlib import Path
import warnings
import sys
print(sys.path)
sys.path.append('../modules/')

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

In [None]:
# Project root. Defaults to the original author's local checkout so existing
# runs are unaffected; set the SIGNATE_ROOT environment variable to run the
# notebook from a different machine or directory.
ROOT = os.environ.get(
    'SIGNATE_ROOT',
    '/Users/shugo/Desktop/SIGNATE/SIGNATE_StudentCup2021/shu421',
)

# make config
INPUT = os.path.join(ROOT, 'input')  # the CSV files are read from here
OUTPUT = os.path.join(ROOT, 'output')
SUBMISSION = os.path.join(ROOT, 'submission')

# Everything produced by one experiment lives under OUTPUT/EXP_NAME.
EXP_NAME = 'main'
EXP = os.path.join(OUTPUT, EXP_NAME)
PREDS = os.path.join(EXP, 'preds')
TRAINED = os.path.join(EXP, 'trained')
FEATURE = os.path.join(EXP, 'feature')
REPORTS = os.path.join(EXP, 'reports')

# make experiments environment
dirs = [
        OUTPUT,
        SUBMISSION,
        FEATURE,
        EXP,
        PREDS,
        TRAINED,
        REPORTS
        ]

# Create every experiment directory that does not exist yet.
for v in dirs:
    if not os.path.isdir(v):
        print(f'making {v}')
        os.makedirs(v, exist_ok=True)

# Load data

In [None]:
# Read the competition tables from the INPUT directory.
train = pd.read_csv(f'{INPUT}/train.csv')
test = pd.read_csv(f'{INPUT}/test.csv')
sample_sub = pd.read_csv(f'{INPUT}/sample_submit.csv')
genre_labels = pd.read_csv(f'{INPUT}/genre_labels.csv')

In [None]:
# Helper class for saving and loading objects
class Util:
    """Thin joblib wrapper used to persist objects to disk and read them back."""

    @classmethod
    def dump(cls, value, path):
        """Write ``value`` to ``path``, creating the parent directory first.

        The file is written with joblib compression enabled.
        """
        parent_dir = os.path.dirname(path)
        os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        """Return the object stored at ``path``."""
        restored = joblib.load(path)
        return restored
    
# Logging helper class
class Logger:
    """Timestamped INFO logger that writes to the console and to a log file.

    The underlying ``logging`` logger is named after ``path`` and its file
    handler writes to ``<path>/Experiment.log``.
    """

    def __init__(self, path):
        """Attach handlers to the logger named ``path`` unless it already has some.

        Args:
            path: Existing directory that will contain ``Experiment.log``.
        """
        self.general_logger = logging.getLogger(path)
        # logging.getLogger returns the same object for the same name, so the
        # handlers must only be created the first time. Building them before
        # this check opened (and leaked) a new file handle on every
        # construction, e.g. each time the notebook cell was re-run.
        if len(self.general_logger.handlers) == 0:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
# Set up the notebook-wide logger
logger = Logger(REPORTS) # the log file is written inside the REPORTS folder

In [None]:
# Number of target classes: passed to LightGBM as `num_class` and used to size
# the per-class KNN score columns and the prediction arrays.
N_CLASSES = 11

# Concatenate train and test, marking test rows with genre == -100
def merge_train_test(train, test):
    """Stack ``train`` on top of ``test`` into one frame with a fresh index.

    If ``test`` has no 'genre' column, one filled with the sentinel -100 is
    added to a copy so the rows can be separated again later. The caller's
    ``test`` frame is left untouched (the previous version added the column
    to it in place).

    Returns:
        A new DataFrame with a 0..n-1 index.
    """
    if 'genre' not in test.columns:
        # assign() returns a new frame instead of mutating the argument.
        test = test.assign(genre=-100)
    return pd.concat([train, test]).reset_index(drop=True)

def split_train_test(input_df):
    """Split a merged frame back into (train, test) using the -100 genre sentinel.

    Rows whose 'genre' equals -100 form the test part; all other rows form
    the train part. Both parts are returned with a fresh 0..n-1 index.
    """
    is_test_row = input_df['genre'] == -100
    train_part = input_df[~is_test_row].reset_index(drop=True)
    test_part = input_df[is_test_row].reset_index(drop=True)
    return train_part, test_part

# Combined train + test frame; test rows carry genre == -100.
# NOTE(review): `df` is not referenced again in the visible cells — confirm whether it is still needed.
df = merge_train_test(train, test)

# Feature Engineering
## Classes

In [None]:
# Count encoding: replace each value by how often it occurs in the fitted series
class CountEncoder:
    """Encode a categorical series by the occurrence count of each value."""

    def fit(self, series):
        """Store the per-value occurrence counts of ``series`` and return self."""
        by_value = series.groupby(series)
        self.counts = by_value.count()
        return self

    def transform(self, series):
        """Map every value to its fitted count; values without a count become 0."""
        encoded = series.map(self.counts)
        return encoded.fillna(0)

    def fit_transform(self, series):
        """Fit on ``series`` and return its encoding."""
        fitted = self.fit(series)
        return fitted.transform(series)
    

# Group-wise aggregates plus per-row deviation / z-score relative to the group
class GroupFeatureExtractor:
    """Build features from ``group_values`` columns grouped by ``group_key``.

    ``agg_methods`` may mix two kinds of entries:

    * 'deviation' / 'zscore' (see ``EX_TRANS_METHODS``): computed per row in
      ``transform`` from the group mean / std of the frame being transformed.
    * anything else (a pandas aggregation name or a callable): aggregated per
      group in ``fit`` and joined back onto the rows in ``transform``.

    Output columns are named ``agg_<method>_<col>_grpby_<group_key>``.
    """
    EX_TRANS_METHODS = ['deviation', 'zscore']

    def __init__(self, group_key, group_values, agg_methods):
        self.group_key = group_key
        self.group_values = group_values

        self.ex_trans_methods = [m for m in agg_methods if m in self.EX_TRANS_METHODS]
        self.agg_methods = [m for m in agg_methods if m not in self.ex_trans_methods]
        self.df_agg = None

    def fit(self, df_train, y=None):
        """Compute the per-group aggregates on ``df_train`` and return self.

        Nothing is computed when only 'deviation' / 'zscore' were requested.
        ``y`` is accepted for API symmetry and ignored.
        """
        if not self.agg_methods:
            return self
        grouped = df_train[[self.group_key] + self.group_values].groupby(self.group_key)
        dfs = []
        for agg_method in self.agg_methods:
            # Callables are labelled by their function name, strings by themselves.
            # (The check must look at the single method, not the whole list.)
            agg_method_name = agg_method.__name__ if callable(agg_method) else agg_method
            df_agg = grouped.agg(agg_method)
            df_agg.columns = self._get_column_names(agg_method_name)
            dfs.append(df_agg)
        self.df_agg = pd.concat(dfs, axis=1).reset_index()
        return self

    def transform(self, df_eval):
        """Return the feature frame for ``df_eval`` (group key column removed).

        The result has one row per row of ``df_eval`` and shares its index.
        """
        key = self.group_key
        if self.agg_methods:
            df_features = pd.merge(df_eval[[key]], self.df_agg, on=key, how='left')
            # A left merge keeps row order but resets the index; restore the
            # caller's index so the column-wise concat below lines up.
            df_features.index = df_eval.index
        else:
            df_features = df_eval[[key]].copy()

        if self.ex_trans_methods:
            grouped = df_eval[[key] + self.group_values].groupby(key)
            centered = df_eval[self.group_values] - grouped.transform('mean')
            parts = [df_features]
            if 'deviation' in self.ex_trans_methods:
                deviation = centered.copy()
                deviation.columns = self._get_column_names('deviation')
                parts.append(deviation)
            if 'zscore' in self.ex_trans_methods:
                # 1e-8 keeps the division finite for groups with zero spread.
                zscore = centered / (grouped.transform('std') + 1e-8)
                zscore.columns = self._get_column_names('zscore')
                parts.append(zscore)
            df_features = pd.concat(parts, axis=1)

        # Always drop the key and return, also when only plain aggregates were
        # requested (previously that path fell through and returned None).
        return df_features.drop(columns=[key])

    def _get_column_names(self, method):
        """Output column names for ``method``, one per group value column."""
        return [f'agg_{method}_{col}_grpby_{self.group_key}' for col in self.group_values]

    def fit_transform(self, df_train, y=None):
        """Fit on ``df_train`` and return its transformed features."""
        self.fit(df_train, y=y)
        return self.transform(df_train)


# K-nearest-neighbour features
class KNNFeatureExtractor:
    """Per-class, inverse-distance-weighted neighbour scores and derived features.

    Args:
        n_neighbors: Number of neighbours that contribute to the scores.
        n_classes: Number of classes (labels must be integers in
            ``[0, n_classes)``). When None, the module-level ``N_CLASSES``
            is used, which matches the previous behaviour.
    """

    def __init__(self, n_neighbors=5, n_classes=None):
        # One extra neighbour is requested because transform() always discards
        # one column of the kneighbors() result.
        self.knn = KNeighborsClassifier(n_neighbors + 1)
        self.n_classes = n_classes

    def fit(self, X, y):
        """Fit the neighbour index on ``X`` and remember the labels as an ndarray."""
        self.knn.fit(X, y)
        self.y = y if isinstance(y, np.ndarray) else np.array(y)
        return self

    def transform(self, X, is_train_data):
        """Return a DataFrame of KNN features, one row per row of ``X``.

        Args:
            X: Points to describe.
            is_train_data: True when ``X`` is the data passed to ``fit``; the
                nearest neighbour (the point itself) is then discarded.
                Otherwise the farthest of the returned neighbours is dropped.
        """
        n_classes = getattr(self, 'n_classes', None)
        if n_classes is None:
            n_classes = N_CLASSES

        # Distances to, and indices of, the nearest fitted points.
        distances, indexes = self.knn.kneighbors(X)
        if is_train_data:
            distances, indexes = distances[:, 1:], indexes[:, 1:]
        else:
            distances, indexes = distances[:, :-1], indexes[:, :-1]
        labels = self.y[indexes]

        # Inverse-distance weights. Exact duplicates have distance 0, which
        # used to give inf weights and then NaN in the difference/ratio
        # features; clamp the distance instead.
        weights = 1.0 / np.maximum(distances, 1e-8)

        score_columns = [f"knn_score_class{c:02d}" for c in range(n_classes)]
        # bincount with weights sums the neighbour weights per class label.
        df_knn = pd.DataFrame(
            [np.bincount(labels_, weights_, n_classes) for labels_, weights_ in zip(labels, weights)],
            columns=score_columns,
        )

        # Derived columns are collected first and attached in one concat, which
        # avoids inserting dozens of columns one at a time.
        max_scores = df_knn.max(axis=1)
        derived = {'max_knn_scores': max_scores}

        # Gap to the best class score; 0 marks the best class.
        for col in score_columns:
            derived[f'sub_max_knn_scores_{col}'] = max_scores - df_knn[col]

        # Ratio to the best class score; 1 marks the best class.
        for col in score_columns:
            derived[f'div_max_knn_scores_{col}'] = df_knn[col] / max_scores

        # Difference between every unordered pair of class scores.
        for i, col1 in enumerate(score_columns):
            for col2 in score_columns[i + 1:]:
                derived[f'sub_{col1}_{col2}'] = df_knn[col1] - df_knn[col2]

        # Sum of the class scores only. The previous version summed every
        # column built so far (max, gaps, ratios and pairwise differences too).
        derived['sum_knn_scores'] = df_knn[score_columns].sum(axis=1)

        return pd.concat([df_knn, pd.DataFrame(derived)], axis=1)

In [None]:
# Mean acousticness per region, computed on the training data.
df_genre_region_grpby = train.groupby('region')['acousticness'].mean()
display(df_genre_region_grpby.sort_values(ascending=True))
#df_genre_region_grpby.plot.hist()
# NOTE(review): seaborn is already imported in the first cell; this re-import is redundant.
import seaborn as sns
# NOTE(review): this plots the index (the region labels), not the mean values —
# confirm whether a histogram of the per-region means was intended.
sns.histplot(x=df_genre_region_grpby.index)

## Functions

In [173]:
n_train = len(train)
def get_target(input_df):
    """Return the first ``n_train`` entries of the 'genre' column.

    ``n_train`` is the module-level training row count. Returns None when
    ``input_df`` has no 'genre' column.
    """
    if 'genre' not in input_df.columns:
        return None
    return input_df['genre'][:n_train]

def get_numerical_features(input_df):
    """Return a copy of the numeric columns that are used without any transformation."""
    raw_numeric_cols = (
        'popularity', 'duration_ms', 'acousticness', 'positiveness',
        'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness',
    )
    selected = input_df[list(raw_numeric_cols)]
    return selected.copy()

def get_genre_name(input_df):
    """Map the 'genre' column through ``genre_labels`` into a one-column frame.

    The lookup is built from the 'labels' (key) and 'genre' (value) columns of
    the module-level ``genre_labels`` table; the result column is 'genre_name'.
    """
    label_lookup = dict(genre_labels[['labels', 'genre']].values)
    genre_names = input_df['genre'].map(label_lookup)
    return pd.DataFrame({'genre_name': genre_names})
   
def get_tempo_features(input_df):
    """Derive numeric features from the 'tempo' column ("<low>-<high>" strings).

    Returns a frame with the columns tempo_low, tempo_high, mean_tempo,
    diff_tempo, var_tempo, sum_tempo and log_tempo (natural log of the mean).
    """
    # expand=True splits the whole column in one vectorised call instead of
    # building one pd.Series per row via apply(pd.Series).
    bounds = input_df['tempo'].str.split('-', expand=True).astype(float)
    bounds.columns = ['tempo_low', 'tempo_high']

    output_df = bounds.copy()
    output_df['mean_tempo'] = (bounds['tempo_high'] + bounds['tempo_low']) / 2
    output_df['diff_tempo'] = bounds['tempo_high'] - bounds['tempo_low']
    output_df['var_tempo'] = bounds.var(axis=1)  # variance of the two bounds
    output_df['sum_tempo'] = bounds.sum(axis=1)
    output_df['log_tempo'] = np.log(output_df['mean_tempo'])
    return output_df

def get_region_onehot(input_df):
    """One-hot encode 'region'; the 'unknown' indicator is named 'region_unknown'."""
    indicator_df = pd.get_dummies(input_df['region'])
    renamed_columns = {'unknown' : 'region_unknown'}
    return indicator_df.rename(columns=renamed_columns)

def get_num_nans(input_df):
    """Count, per row, how many of the audio feature columns are missing.

    Returns a one-column frame named 'num_nans' with the same index as
    ``input_df``. ``input_df`` itself is not modified.

    Fixes two defects of the earlier version: its ``return`` sat inside the
    loop, so only the first column ('acousticness') was ever counted, and it
    added a 'num_nans' column to the caller's frame as a side effect. Feature
    files cached from the earlier version hold the old values and need to be
    regenerated.
    """
    nan_checked_cols = [
        'acousticness',
        'positiveness',
        'danceability',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness'
    ]
    num_nans = input_df[nan_checked_cols].isna().sum(axis=1)
    return num_nans.to_frame('num_nans')

def get_ce_features(input_df):
    """Count-encode 'region' into 'countenc_region'; missing regions stay NaN."""
    output_df = pd.DataFrame()
    for col in ('region',):
        feature_name = 'countenc_' + col
        source = input_df[col]
        output_df[feature_name] = CountEncoder().fit_transform(source)
        # The encoder maps missing values to 0; turn them back into NaN.
        missing_rows = source.isna().values
        output_df.loc[missing_rows, feature_name] = np.nan
    return output_df
    
def get_le_features(input_df):
    """Label-encode 'region' into 'labelenc_region'; missing regions become NaN.

    The encoder is fitted on ``input_df`` itself, so the codes depend on the
    set of regions present in that frame.
    """
    output_df = pd.DataFrame()
    for col in ('region',):
        feature_name = 'labelenc_' + col
        source = input_df[col]
        output_df[feature_name] = LabelEncoder().fit_transform(source)
        missing_rows = source.isna().values
        output_df.loc[missing_rows, feature_name] = np.nan
    return output_df

def get_standardscaled_features(input_df):
    """Standard-scale selected numeric features, each with its own scaler.

    The scalers are fitted on ``input_df``. Output columns are named
    'standardscaled_<col>' and the returned frame has a default 0..n-1 index.
    """
    combined = pd.concat(
        [
            get_numerical_features(input_df),
            get_tempo_features(input_df),
            get_num_nans(input_df),
        ],
        axis=1,
    )
    cols_to_scale = [
        'popularity', 'duration_ms', 'acousticness',
        'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness', 'log_tempo', 'num_nans'
    ]
    # fit_transform returns an (n, 1) array; keep its single column.
    scaled = {
        'standardscaled_' + col: StandardScaler().fit_transform(combined[[col]])[:, 0]
        for col in cols_to_scale
    }
    return pd.DataFrame(scaled)

# Aggregate statistics keyed by region
def get_agg_region_features(input_df):
    """Per-region deviation and z-score of the numeric and tempo features.

    The group statistics are computed on ``input_df`` itself.
    """
    source_df = pd.concat(
        [
            get_numerical_features(input_df),
            get_tempo_features(input_df),
            input_df[['region']],
        ],
        axis=1,
    )
    value_columns = [
        'popularity', 'duration_ms', 'acousticness', 'positiveness',
        'danceability', 'loudness', 'energy', 'liveness', 'speechiness',
        'instrumentalness', 'tempo_low', 'tempo_high', 'log_tempo',
    ]
    extractor = GroupFeatureExtractor(
        group_key='region',
        group_values=value_columns,
        agg_methods=['deviation', 'zscore'],
    )
    return extractor.fit_transform(source_df)

def get_binning_features(input_df, target='acousticness', num_bins=3):
    """Cut ``target`` into ``num_bins`` equal-width bins.

    Returns a one-column frame 'binned_<target>' holding the integer bin index.
    """
    bin_index = pd.cut(input_df[target], bins=num_bins, labels=False)
    return bin_index.to_frame(name=f'binned_{target}')

In [174]:
# 上で作った関数を実行し、train, testそれぞれで処理を行う

def preprocess(input_df, funcs, task='train'):
    """Apply every feature function to ``input_df`` and join the results column-wise.

    Each result is cached as ``FEATURE/<task>_<function name>.pkl``. When that
    file already exists it is loaded and the function is not called, so a
    stale file is reused even after the function or its input has changed.
    """
    def _load_or_build(func):
        cache_path = os.path.join(FEATURE, f'{task}_{func.__name__}.pkl')
        if os.path.isfile(cache_path):
            return Util.load(cache_path)
        built = func(input_df)
        Util.dump(built, cache_path)
        return built

    feature_frames = [_load_or_build(func) for func in funcs]
    return pd.concat(feature_frames, axis=1)

def get_train_data(train, test):
    """Build the training feature matrix.

    Only ``train`` is used: every feature function runs on the training frame
    alone. ``test`` is accepted for a future "train + test" feature stage,
    which is currently disabled.
    """
    # Feature functions applied to the training frame on its own
    # (get_binning_features is currently left out).
    train_only_funcs = [
        get_numerical_features,
        get_tempo_features,
        get_num_nans,
        get_ce_features,
        get_le_features,
        get_standardscaled_features,
        get_agg_region_features,
        get_region_onehot,
    ]
    return preprocess(train, train_only_funcs, task='train')

def get_test_data(train, test):
    """Build the test feature matrix.

    Only ``test`` is used: every feature function runs on the test frame
    alone. ``train`` is accepted for a future "train + test" feature stage,
    which is currently disabled.
    """
    # Feature functions applied to the test frame on its own
    # (get_binning_features is currently left out).
    test_only_funcs = [
        get_numerical_features,
        get_tempo_features,
        get_num_nans,
        get_ce_features,
        get_le_features,
        get_standardscaled_features,
        get_agg_region_features,
        get_region_onehot,
    ]
    return preprocess(test, test_only_funcs, task='test')

In [175]:
# get features: run the preprocessing defined above
train_x = get_train_data(train, test)
test_x = get_test_data(train, test)
# Add an all-zero 'region_M' column to the test features; the feature lists
# used later reference this column.
# NOTE(review): presumably region M never occurs in the test data, so the
# one-hot encoding of test does not produce it — confirm.
# NOTE(review): the feature functions are fitted on train and test separately,
# so data-dependent encodings such as 'labelenc_region' may not assign the same
# code to the same region in both frames — verify.
test_x['region_M'] = 0
target = train['genre']
print('train_x', train_x.shape)
print('target', target.shape)
print('test_x', test_x.shape)

train_x (4046, 79)
target (4046,)
test_x (4046, 79)


# train

In [176]:
#!pip install optuna

In [195]:
import lightgbm as lgb

In [196]:
# Base learning rate for LightGBM
learning_rate = 0.01


# LightGBM parameters. Column / bagging sampling options are left at the
# library defaults.
lgb_params = dict(
    objective="multiclass",
    num_class=N_CLASSES,
    metric='multi_logloss',
    learning_rate=learning_rate,
    num_leaves=3,
    min_data_in_leaf=40,
    verbosity=0,
    seed=42,
)

knn_n_neighbors = 6


# parameters - knn feature weights

# One-hot region columns: region_A .. region_T plus region_unknown
_region_onehot_cols = [f'region_{letter}' for letter in 'ABCDEFGHIJKLMNOPQRST'] + ['region_unknown']

# Scaled features that enter the KNN distance with weight 1.0
_unit_weight_cols = [
    f'standardscaled_{name}'
    for name in (
        'duration_ms', 'acousticness', 'positiveness', 'danceability',
        'loudness', 'energy', 'liveness', 'speechiness', 'instrumentalness',
    )
]

# Column order of the KNN input matrix
knn_features = (
    _region_onehot_cols
    + ['standardscaled_popularity']
    + _unit_weight_cols
    + ['standardscaled_log_tempo', 'standardscaled_num_nans']
)


# Per-feature multiplier applied to the KNN inputs before fitting / querying
dict_feature_weights = {}
dict_feature_weights.update({name: 100.0 for name in _region_onehot_cols})
dict_feature_weights.update({name: 1.0 for name in _unit_weight_cols})
dict_feature_weights["standardscaled_popularity"] = 8.0
dict_feature_weights["standardscaled_log_tempo"] = 0.001
dict_feature_weights["standardscaled_num_nans"] = 100.0

# Weights aligned with the order of knn_features
knn_feature_weights = np.array([dict_feature_weights[name] for name in knn_features])

In [188]:
# Cross-validation setup: fold splitter, result accumulators and feature lists.
N_SPLITS = 15
SEED_SKF = 71
# Seeds NumPy's global RNG; the training loop draws from it for the learning-rate jitter.
np.random.seed(71)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
# Out-of-fold class probabilities for the training rows.
oof = np.zeros((len(train), N_CLASSES))
# Test class probabilities, accumulated as the average over folds.
predictions = np.zeros((len(test), N_CLASSES))
df_feature_importance = pd.DataFrame()

# Numeric model inputs: raw features, region indicators, encodings, per-region
# z-scores and the KNN features that are created inside each fold.
features_numerical = [
    'popularity', 'duration_ms', 'acousticness',
    'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
    'speechiness', 'instrumentalness',
    'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
    'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
    'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
    'region_S', 'region_T', 'region_unknown', 'countenc_region',
    'num_nans',
    'agg_zscore_popularity_grpby_region',
    'agg_zscore_duration_ms_grpby_region',
    'agg_zscore_acousticness_grpby_region',
    'agg_zscore_positiveness_grpby_region',
    'agg_zscore_danceability_grpby_region',
    'agg_zscore_loudness_grpby_region', 'agg_zscore_energy_grpby_region',
    'agg_zscore_liveness_grpby_region',
    'agg_zscore_speechiness_grpby_region',
    'agg_zscore_instrumentalness_grpby_region',
    'agg_zscore_log_tempo_grpby_region',
    'knn_score_class00', 'knn_score_class01',
    'knn_score_class02', 'knn_score_class03', 'knn_score_class04',
    'knn_score_class05', 'knn_score_class06', 'knn_score_class07',
    'knn_score_class08', 'knn_score_class09', 'knn_score_class10',
    'max_knn_scores',
    'div_max_knn_scores_knn_score_class00',
    'div_max_knn_scores_knn_score_class01',
    'div_max_knn_scores_knn_score_class02',
    'div_max_knn_scores_knn_score_class03',
    'div_max_knn_scores_knn_score_class04',
    'div_max_knn_scores_knn_score_class05',
    'div_max_knn_scores_knn_score_class06',
    'div_max_knn_scores_knn_score_class07',
    'div_max_knn_scores_knn_score_class08',
    'div_max_knn_scores_knn_score_class09',
    'div_max_knn_scores_knn_score_class10',
    'sub_max_knn_scores_knn_score_class00',
    'sub_max_knn_scores_knn_score_class01',
    'sub_max_knn_scores_knn_score_class02',
    'sub_max_knn_scores_knn_score_class03',
    'sub_max_knn_scores_knn_score_class04',
    'sub_max_knn_scores_knn_score_class05',
    'sub_max_knn_scores_knn_score_class06',
    'sub_max_knn_scores_knn_score_class07',
    'sub_max_knn_scores_knn_score_class08',
    'sub_max_knn_scores_knn_score_class09',
    'sub_max_knn_scores_knn_score_class10',
    'sum_knn_scores',
    #'binned_acousticness'
]
# Names of the pairwise class-score differences, generated with the same
# nested loop that KNNFeatureExtractor.transform uses to create them.
sub_knn_score_n_m = []
score_columns = [f"knn_score_class{c:02d}" for c in range(N_CLASSES)]
for i, col1 in enumerate(score_columns):
    for j, col2 in enumerate(score_columns[i+1:], i+1):
        sub_knn_score_n_m.append(f'sub_{col1}_{col2}')
features_numerical += sub_knn_score_n_m        


# The label-encoded region is the only feature passed to LightGBM as categorical.
features_categorical = ['labelenc_region']
features = features_numerical + features_categorical

# Stratified K-fold training loop: per fold, build KNN features from the fold's
# training part, train LightGBM with early stopping, store out-of-fold
# predictions and add this fold's share of the test predictions.
for fold_, (indexes_trn, indexes_val) in enumerate(skf.split(train_x.values, target.values)):
    print('-'*50, 'fold{}'.format(fold_), '-'*50)

    df_trn = train_x.loc[indexes_trn].reset_index(drop=True) # still a DataFrame
    df_val = train_x.loc[indexes_val].reset_index(drop=True)
    target_trn = target.loc[indexes_trn].reset_index(drop=True)
    target_val = target.loc[indexes_val].reset_index(drop=True)

    # make knn features
    # Missing values are replaced by 0 and each column is multiplied by its KNN weight.
    X = df_trn[knn_features].fillna(0.0).values * knn_feature_weights

    # The neighbour index is fitted on this fold's training part only.
    knn_feature_extractor = KNNFeatureExtractor(knn_n_neighbors).fit(X, target_trn)

    # is_train_data=True: X is the data the extractor was fitted on.
    df_trn = pd.concat([df_trn, knn_feature_extractor.transform(X, is_train_data=True)], axis=1)

    X = df_val[knn_features].fillna(0.0).values * knn_feature_weights
    df_val = pd.concat([df_val, knn_feature_extractor.transform(X, is_train_data=False)], axis=1)

    X = test_x[knn_features].fillna(0.0).values * knn_feature_weights
    df_test_knn_features = knn_feature_extractor.transform(X, is_train_data=False)

    # The KNN columns of test_x are overwritten in place on every fold, so
    # after the loop test_x holds the last fold's KNN features.
    for col in df_test_knn_features.columns:
        test_x[col] = df_test_knn_features[col]

    lgb_train = lgb.Dataset(
            df_trn.loc[:, features],
            label=target_trn,
            feature_name=features,
            categorical_feature=features_categorical
    )
    lgb_valid = lgb.Dataset(
            df_val.loc[:, features],
            label=target_val,
            feature_name=features,
            categorical_feature=features_categorical
    )

    # Mutates the shared lgb_params dict: base rate plus a random offset below 0.001.
    lgb_params['learning_rate'] = learning_rate + np.random.random() * 0.001 # author's "lucky charm" tweak
    # Effectively unbounded; training is ended by early stopping (see below).
    num_round = 999999999
    # NOTE(review): newer LightGBM releases replaced the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments with callbacks — confirm
    # against the installed version.
    model = lgb.train(
            lgb_params,
            lgb_train,
            num_round,
            valid_sets=[lgb_train, lgb_valid],
            verbose_eval=-1,
            early_stopping_rounds=300 if num_round >=1e8 else None,
            #fobj=None,
            #feval=lgb_metric,
    )

    # cv
    # Predicts with 150 iterations more than the best early-stopping iteration.
    prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round # author's "lucky charm" tweak
    oof[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)

    # feature importance
    df_fold_importance = pd.DataFrame()
    df_fold_importance['feature'] = features
    df_fold_importance['importance'] = model.feature_importance()
    df_fold_importance['fold'] = fold_
    df_feature_importance = pd.concat([df_feature_importance, df_fold_importance], axis=0)

    # prediction for test data
    # Each fold contributes 1/N_SPLITS, so the final array is the fold average.
    predictions += model.predict(test_x[features], num_iteration=prediction_round) /N_SPLITS
    print()

-------------------------------------------------- fold0 --------------------------------------------------


[32m[I 2021-05-09 17:12:03,383][0m A new study created in memory with name: no-name-74a10dfc-5937-4e3d-b969-0930ff6e308b[0m




  0%|          | 0/7 [00:00<?, ?it/s][A[A[A[A





You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 300 rounds


KeyboardInterrupt: 

In [None]:
# Predicted class for each test row = argmax of the fold-averaged probabilities.
test_x.loc[:, 'prediction'] = predictions.argmax(1)

# Cross-validation score from the out-of-fold predictions.
oof_labels = oof.argmax(1)
score = f1_score(target, oof_labels, average='macro')
print('CV score')
print('f1 : {:8.5f}'.format(score))
print()
print(classification_report(target, oof_labels))

# Feature importance across folds. df_feature_importance holds one row per
# (feature, fold), so seaborn aggregates the folds for each bar.
fig, ax = plt.subplots(figsize=(12, 16))
sns.barplot(
    x='importance',
    y='feature',
    data=df_feature_importance.sort_values(by='importance', ascending=False),
    ax=ax,  # draw on the axes created above instead of the implicit current axes
)
ax.set_title('feature importance')  # title previously read 'fature importance'
ax.grid()
plt.tight_layout()
plt.show()

# Submission

In [None]:
# Build the submission from the sample file loaded in the "Load data" cell.
# `df_sample_sub` was never defined, so this cell raised NameError on a fresh
# kernel; a copy keeps `sample_sub` itself unchanged.
# NOTE(review): `sample_sub` was read with the default header handling while the
# file is written with header=False — if sample_submit.csv has no header row,
# its first row was consumed as column names and the lengths will not match.
# Confirm and read it with header=None if so.
df_sample_sub = sample_sub.copy()
df_sample_sub['genre'] = predictions.argmax(axis=1)
display(df_sample_sub)
# Write into the SUBMISSION directory that the config cell creates; the former
# relative '../outputs' folder is not created anywhere in this notebook.
df_sample_sub.to_csv(os.path.join(SUBMISSION, 'sub_0501.csv'), index=False, header=False)