In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import datetime
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import os
for dirname, _, filenames in os.walk('../data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../data/sample_submit.csv
../data/submission_Logistic.csv
../data/test.tsv
../data/train.tsv
../data/.ipynb_checkpoints\sample_submit-checkpoint.csv
../data/.ipynb_checkpoints\submission_Logistic-checkpoint.csv
../data/.ipynb_checkpoints\test-checkpoint.tsv
../data/.ipynb_checkpoints\train-checkpoint.tsv


In [13]:
train_df = pd.read_table('../data/train.tsv', index_col='Date', parse_dates=True)
test_df = pd.read_table('../data/test.tsv', index_col='Date', parse_dates=True)
sample_sub = pd.read_csv('../data/sample_submit.csv')

# set type label
train_df['type'] = 'train'
test_df['type'] = 'test'

# all data
all_df = pd.concat([train_df, test_df], axis=0)

# OZONEが高い日の数
train_df[train_df["OZONE"]==1.0].sum


<bound method NDFrame._add_numeric_operations.<locals>.sum of               id  WSR0  WSR1  WSR2  WSR3  WSR4  WSR5  WSR6  WSR7  WSR8  ...  \
Date                                                                    ...   
1998-04-05    94   0.4   0.5   2.1   2.2   2.5   2.4   2.1   2.9   3.6  ...   
1998-04-11   100   0.0   0.6   0.4   0.3   0.1   0.3   0.2   1.4   2.6  ...   
1998-04-20   109   1.8   0.3   0.1   0.1   0.1   0.2   0.2   0.7   0.9  ...   
1998-04-23   112   0.5   0.1   0.1   0.1   0.1   0.2   0.3   0.8   1.2  ...   
1998-04-25   114   3.1   2.4   2.4   3.0   3.4   3.4   3.9   4.5   5.5  ...   
...          ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
2001-05-23  1225   0.8   0.3   0.2   0.4   0.3   1.4   0.9   1.5   1.9  ...   
2001-06-15  1248   2.2   1.7   0.8   3.8   4.0   4.1   2.0   1.9   1.8  ...   
2001-06-16  1249   0.4   0.4   0.1   0.1   0.0   0.4   0.5   0.6   1.3  ...   
2001-06-18  1251   0.4   0.7   0.5   0.6   0.8   1.0   1.6   1.5   2.

## 学習する特徴量を作成
#### __欠損値処理__
→全部平均値で補完

#### __特徴量の削除/追加__
ピアソン相関から、  
[削除]  
- 時間ごとの気温"T0"~"T23"を消す
- 時間ごとの風速"WSR0"~"WSR24"を消す
- 海面気圧の前日からの変化"SLP_"を消す  

[とりあえず追加]
- 風速の標準偏差を追加する
- 気温の標準偏差を追加する
  
計26個

#### __データ変換__
→とりあえず全部標準化

In [3]:
import re

def eda(all_df):
    # データの追加,気温・風速の標準偏差
    #1時間ごとの気温・風速を取得
    T_data = all_df[['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23']]
    WSR_data = all_df[['WSR0', 'WSR1', 'WSR2', 'WSR3', 'WSR4', 'WSR5', 'WSR6', 'WSR7', 'WSR8', 'WSR9', 'WSR10', 'WSR11', 'WSR12', 'WSR13', 'WSR14', 'WSR15', 'WSR16', 'WSR17', 'WSR18', 'WSR19', 'WSR20', 'WSR21', 'WSR22', 'WSR23']]
    # 行ごとの標準偏差を追加
    all_df['T_SD'] = T_data.std(axis=1)
    # all_df['WSR_SD'] = WSR_data.std(axis=1)
    """
    # データの削除, T0~T23
    all_df = all_df.drop(columns=['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21', 'T22', 'T23'])
    # データの削除, WSR0~WSR23
    all_df = all_df.drop(columns=['WSR0', 'WSR1', 'WSR2', 'WSR3', 'WSR4', 'WSR5', 'WSR6', 'WSR7', 'WSR8', 'WSR9', 'WSR10', 'WSR11', 'WSR12', 'WSR13', 'WSR14', 'WSR15', 'WSR16', 'WSR17', 'WSR18', 'WSR19', 'WSR20', 'WSR21', 'WSR22', 'WSR23'])
    """
    # データの削除, SLP_
    all_df = all_df.drop(columns=['SLP_'])
    
    return all_df


# 特徴量の削除/追加
all_df = eda(all_df)

# trainとtestに分けなおす
train_df = all_df[all_df['type'] == 'train']
test_df = all_df[all_df['type'] == 'test']
# train正解ラベル
y = train_df['OZONE']

# 学習に不要な特徴量を削除
train_df = train_df.drop(columns=['id', 'OZONE', 'type'])
test_df = test_df.drop(columns=['id', 'OZONE', 'type'])

# 欠損値を平均値で補完
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())


# データ標準化(rightGBMのときはいらない)
scaler = StandardScaler()
train_df = pd.DataFrame(scaler.fit_transform(train_df), index = train_df.index, columns = train_df.columns)
test_df = pd.DataFrame(scaler.transform(test_df), index = test_df.index, columns = test_df.columns)


print(f'train_df : \n{train_df}\n')
print(f'y : \n{y}\n')

train_df : 
                WSR0      WSR1      WSR2      WSR3      WSR4      WSR5  \
Date                                                                     
1998-01-01 -0.725161  0.072833  0.576977  0.375571  0.304383  0.382625   
1998-01-02  0.788770  1.128245  1.275173  0.853712  1.351472  1.290436   
1998-01-03  0.864467  0.826698  0.732132  0.375571  0.465474  0.712738   
1998-01-04  2.227005  1.580564  1.585482  1.730302  1.029291  1.207908   
1998-01-05  0.637377  0.298993 -0.043641 -0.182259 -0.581615 -0.112545   
...              ...       ...       ...       ...       ...       ...   
2001-06-29 -0.725161 -0.605646 -1.052146 -1.058849 -0.742705 -0.690242   
2001-06-30 -0.573768 -0.756419 -0.664259 -0.819779 -0.581615 -0.360129   
2001-07-01 -0.800857 -0.530259 -0.198796 -0.580709 -0.501069 -0.360129   
2001-07-02 -0.043892 -0.002553 -0.431527 -0.261949 -0.501069 -0.525186   
2001-07-03 -0.952251 -0.907192 -0.664259 -0.819779 -0.662160 -0.690242   

                WSR6     

In [4]:
"""
# 改めてピアソン相関
corr_matrix = train_df.corr()
y_corr = corr_matrix[y]
# 横棒グラフ
fig, ax = plt.subplots(figsize=(10, 10)) 
sns.barplot(x=y_corr, y=y_corr.index, ax=ax) 
#X,Y軸とグラフタイトル 
ax.set_xlabel("Correlation Coefficient") 
ax.set_ylabel("Features") 
ax.set_title(f"Correlation with {y}") 
#表示 
plt.show()
"""

'\n# 改めてピアソン相関\ncorr_matrix = train_df.corr()\ny_corr = corr_matrix[y]\n# 横棒グラフ\nfig, ax = plt.subplots(figsize=(10, 10)) \nsns.barplot(x=y_corr, y=y_corr.index, ax=ax) \n#X,Y軸とグラフタイトル \nax.set_xlabel("Correlation Coefficient") \nax.set_ylabel("Features") \nax.set_title(f"Correlation with {y}") \n#表示 \nplt.show()\n'

## 検証データ作成

In [5]:
from sklearn.model_selection import train_test_split
# 訓練データの一部を分割し検証データを作成
# 注意 :   
# shuffleをTrueにするとランダムに分割されます。
# この時、random_stateを定義していないとモデルの再現性が取れなくなるので、設定するよう心がけてください。
# test_size=0.2とすることで訓練データの２割を検証データにしている
X_train ,X_val ,y_train ,y_val = train_test_split(
    train_df, y, 
    test_size=0.2, shuffle=True,random_state=0
    )

## モデルの作成と評価
今回はロジェスティック回帰

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# モデルを定義し学習
model = LogisticRegression(max_iter=1500) 
model.fit(X_train, y_train)

# 訓練データに対しての予測を行い、正答率を算出
y_pred = model.predict(X_train)
print(accuracy_score(y_train, y_pred))

0.932872655478776


In [12]:
# テストデータを予測
test_pred = model.predict(test_df)

# 行数で繰り返し予測値を代入
for index, row in sample_sub.iterrows():
    sample_sub.iloc[index,1] = np.where(test_pred[index]>=0.5, 1, 0)

# 結果を保存
sample_sub.to_csv("../data/submission_Logistic.csv", index=False)

In [17]:
test_pred.sum()

22.0