In [1]:
import numpy as np
import pandas as pd
import gc
import pickle
import os
import datetime as dt

#plot
import matplotlib.pyplot as plt

#LightGBM
import lightgbm as lgb

#sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

#表示桁数の指定
pd.options.display.float_format = '{:.4f}'.format

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
#train.csvファイルの読み込み
train = pd.read_csv("train.csv")
print(train.shape)
train.head()

(750000, 12)


Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.42
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.0124
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.9253
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.2782
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.6103


In [3]:
#メモリ削減の為の関数
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
train = reduce_mem_usage(train)
#trainの情報を確認
print(train.info())

Memory usage of dataframe is 68.66 MB
Memory usage after optimization is: 44.35 MB
Decreased by 35.4%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int32  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float16
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float16
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float16
 9   Number_of_Ads                749999 non-null  float16
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_

In [5]:
train.shape

(750000, 12)

In [6]:
#特徴量エンジニアリング1
train['Episode_Length_minutes'].fillna(train['Episode_Length_minutes'].median(), inplace=True)
train["Guest_Popularity_percentage"].fillna(train["Guest_Popularity_percentage"].median(), inplace=True)


In [7]:
#特徴量エンジニアリング2
#カテゴリ変数をcatgory型に変換
for col in train.columns:
    if train[col].dtype == "object":
        train[col] = train[col].astype("category")

In [8]:
#特徴量エンジニアリング3
#Publication_Dayを二つに分ける
# 目標が小さい曜日（0）と大きい曜日（1）に分ける
low_target_days = ['Sunday', 'Thursday', 'Friday', 'Saturday']
high_target_days = ['Wednesday', 'Monday', 'Tuesday']
    
train['Target_Day_Group'] = train['Publication_Day'].apply(
        lambda x: 0 if x in low_target_days else 1
    )


In [9]:
#特徴量エンジニアリング4
#Publivation_Timeについて
time_order = {'evening': 0, 'morning': 1, 'afternoon': 2, 'night': 3}
train['Publication_Time_Encoded'] = train['Publication_Time'].map(time_order)


In [10]:
#特徴量エンジニアリング5
#Episode_Sentimentについて
sentiment_order = {'Negative': 0, 'Neutoral': 1, 'Positive': 2}
train['Episode_Sentiment_Encoded'] = train['Episode_Sentiment'].map(sentiment_order)


In [11]:
#特徴量エンジニアリング6
#ホストとゲストの人気度の差を作る
train['Host_Guest_Popularity_Ratio'] = train['Host_Popularity_percentage'] / (train['Guest_Popularity_percentage'] + 1e-5)


In [12]:
#特徴量エンジニアリング7
#一分当たりの広告の数：EDAの結果として、広告が少ない方が音楽が聞かれやすいとわかっている
train['Ads_Per_Minute'] = train['Number_of_Ads'] / (train['Episode_Length_minutes'] + 1e-5)


In [13]:
#特徴量エンジニアリング8
#ホストとゲストが上位75%以上だと良く音楽が聞かれそう
train['Highly_Popular_Host'] = (train['Host_Popularity_percentage'] > 75).astype(int)
train['Highly_Popular_Guest'] = (train['Guest_Popularity_percentage'] > 75).astype(int)


In [15]:
#データセット作成
x_train = train.drop(["Listening_Time_minutes", "id",'Episode_Sentiment','Publication_Time'], axis=1)
y_train = train["Listening_Time_minutes"]
id_train = train[["id"]]

In [17]:
import os
import joblib
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def train_lgb_regression(input_x,
                         input_y,
                         input_id,
                         params,
                         list_nfold=[0,1,2,3,4],
                         n_splits=5,
                         save_dir="models"):
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    os.makedirs(save_dir, exist_ok=True)

    # クロスバリデーション（回帰はKFoldでOK）
    cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x))
    
    for nfold in list_nfold:
        print("-" * 20, f"Fold {nfold}", "-" * 20)
        
        idx_tr, idx_va = cv[nfold]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y[idx_tr], input_id.loc[idx_tr, :]
        x_va, y_va, id_va = input_x.loc[idx_va, :], input_y[idx_va], input_id.loc[idx_va, :]
        print(f"Train shape: {x_tr.shape}, Validation shape: {x_va.shape}")
        
        # モデル学習（LGBMRegressorを使用）
        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_tr, y_tr,
            eval_set=[(x_tr, y_tr), (x_va, y_va)],
            early_stopping_rounds=100,
            verbose=100
        )
        
        fname_lgb = os.path.join(save_dir, f"model_lgb_fold{nfold}.joblib")
        joblib.dump(model, fname_lgb)
        
        # 回帰なので predict() を使用
        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        
        # 評価指標はRMSE
        metric_tr = np.sqrt(mean_squared_error(y_tr, y_tr_pred))
        metric_va = np.sqrt(mean_squared_error(y_va, y_va_pred))
        metrics.append([nfold, metric_tr, metric_va])
        print(f"[RMSE] Train: {metric_tr:.4f}, Validation: {metric_va:.4f}")
        
        train_oof[idx_va] = y_va_pred
        
        _imp = pd.DataFrame({"col": input_x.columns, "imp": model.feature_importances_, "nfold": nfold})
        imp = pd.concat([imp, _imp])
      
    print("-" * 20, "Training Results", "-" * 20)
    
    metrics = np.array(metrics)
    print("[CV] Train RMSE: {:.4f}±{:.4f}, Validation RMSE: {:.4f}±{:.4f}".format(
        metrics[:, 1].mean(), metrics[:, 1].std(),
        metrics[:, 2].mean(), metrics[:, 2].std(),
    ))
    print("[OOF RMSE] {:.4f}".format(np.sqrt(mean_squared_error(input_y, train_oof))))
    
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred": train_oof})
    ], axis=1)
    
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index()
    imp.columns = ["col", "imp", "imp_std"]
    
    print("Training completed. Models saved to:", save_dir)
    
    return train_oof, imp, metrics


In [18]:
# ハイパーパラメータの設定
params = {
    'objective': 'regression',       # 回帰タスク
    'metric': 'rmse',                # 評価指標：RMSE（回帰タスクの標準的な指標）
    'boosting_type': 'gbdt',         # 勾配ブースティング（標準的な手法）
    'num_leaves': 31,                # 木の葉の数（過学習を防ぐために小さな値を試すことが推奨）
    'learning_rate': 0.05,           # 学習率（小さな値で過学習を防ぐ）
    'feature_fraction': 0.9,         # 特徴量のランダム選択の割合（過学習防止）
    'bagging_fraction': 0.8,         # データのランダムサンプリングの割合（過学習防止）
    'bagging_freq': 5,               # バギングの頻度（過学習防止）
    'max_depth': -1,                 # 木の深さ（-1は制限なし）
    'n_estimators': 1000,            # 決定木の数（大きな値にすると計算が重くなるので早期終了を活用）
    'early_stopping_rounds': 100,    # 早期終了のラウンド数（改善がない場合に学習を停止）
    'random_state': 42              # 再現性のためのランダムシード
}

# 学習の実行
train_oof, imp, metrics = train_lgb_regression(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- Fold 0 --------------------
Train shape: (600000, 15), Validation shape: (150000, 15)
[100]	training's rmse: 13.0088	valid_1's rmse: 13.05
[200]	training's rmse: 12.8848	valid_1's rmse: 13.0149
[300]	training's rmse: 12.8021	valid_1's rmse: 13.0009
[400]	training's rmse: 12.7236	valid_1's rmse: 12.9897
[500]	training's rmse: 12.6541	valid_1's rmse: 12.9795
[600]	training's rmse: 12.5899	valid_1's rmse: 12.9723
[700]	training's rmse: 12.5287	valid_1's rmse: 12.9641
[800]	training's rmse: 12.472	valid_1's rmse: 12.961
[900]	training's rmse: 12.4165	valid_1's rmse: 12.9565
[1000]	training's rmse: 12.3613	valid_1's rmse: 12.9538
[RMSE] Train: 12.3613, Validation: 12.9538
-------------------- Fold 1 --------------------
Train shape: (600000, 15), Validation shape: (150000, 15)
[100]	training's rmse: 12.9999	valid_1's rmse: 13.0631
[200]	training's rmse: 12.8698	valid_1's rmse: 13.0296
[300]	training's rmse: 12.7842	valid_1's rmse: 13.016
[400]	training's rmse: 12.7066	v

In [None]:
imp.sort_values("imp", ascending=False)[:10]
#特徴量の重要度を可視化