In [1]:
from google.colab import files
files.upload() # kaggle.jsonをアップロード
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [7]:
import pandas as pd
import numpy as np
# 記録された時刻を時系列として扱うために必要
from datetime import datetime

# データの準備
def prepare():
    !kaggle datasets download -d \
    jsphyg/weather-dataset-rattle-package
    !unzip weather-dataset-rattle-package.zip
# Preprocessing (dropping rows with missing values, type conversions)
def preprocess(csv_path='weatherAUS.csv'):
    """Load the weather CSV, drop incomplete rows and build training arrays.

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read. Defaults to ``'weatherAUS.csv'``.

    Returns
    -------
    df : pandas.DataFrame
        Rows with no missing value in any column, with ``Date`` converted to
        datetime and ``RainTomorrow`` encoded as 0 (``'No'``) / 1 (``'Yes'``).
    X_train : numpy.ndarray
        Values of the ``features`` columns of ``df``.
    y_train : numpy.ndarray
        Values of ``RainTomorrow`` as a single-column 2-D array.
    features : list of str
        The column names used for ``X_train``.
    """
    df = pd.read_csv(csv_path)
    # Convert any literal 'NA' string into a real missing value. Replacing it
    # with the *string* 'NaN' would leave it invisible to dropna() below.
    df = df.replace('NA', np.nan)
    print(df)
    features = ['Date', 'MinTemp', 'MaxTemp', 'Rainfall']
    # Yes/No ==> 1/0; applied after dropna so every label is mappable before
    # the integer cast.
    label_codes = {'No': 0, 'Yes': 1}
    df = df.dropna(how='any').assign(
        # Parse the date strings into datetime values
        Date=lambda d: pd.to_datetime(d['Date'], format='%Y-%m-%d'),
        RainTomorrow=lambda d: d['RainTomorrow'].map(label_codes).astype(int),
    )
    X_train = df.loc[:, features].values
    y_train = df.loc[:, ['RainTomorrow']].values
    return df, X_train, y_train, features

# Restrict the data to a time window and compute the mean, minimum, maximum
# and standard deviation of one feature inside that window.
# The window is a tuple of datetime values: (start, end).
def time_window(df, target_feature, twin):
    """Summarise one column over the rows whose ``Date`` lies inside ``twin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column comparable to the window bounds.
    target_feature : str
        Name of the column to summarise.
    twin : tuple
        ``(start, end)``; both bounds are inclusive.

    Returns
    -------
    tuple
        ``(df1, mean, min, max, std)`` where ``df1`` is the filtered frame.
    """
    start, end = twin[0], twin[1]
    inside = df['Date'].between(start, end)
    df1 = df[inside]
    values = df1[target_feature]
    # np.std is kept on purpose: NumPy's default is ddof=0, which differs from
    # the default of pandas' own .std().
    return df1, values.mean(), values.min(), values.max(), np.std(values)

def main():
    """Fetch and preprocess the data, then print per-window summary statistics.

    For each of two fixed time windows, prints a banner, a header row and one
    row per feature (Rainfall, MaxTemp, MinTemp) with the average, minimum,
    maximum and standard deviation returned by ``time_window``.
    """
    prepare()
    # Only the cleaned frame is used below; the other return values are ignored.
    df, _X_train, _y_train, _features = preprocess()

    # Time windows to summarise, each a (start, end) pair of datetimes
    windows = [
        (datetime(2009, 1, 1), datetime(2009, 7, 31)),
        (datetime(2017, 1, 1), datetime(2017, 7, 31)),
    ]
    banner = '******* {:^14} ~ {:^14} *******'
    header = '{:^10}\t{:>6}\t{:>6}\t{:>6}\t{:>6}'
    row = '{:^10}\t{:>6.2f}\t{:>6.2f}\t{:>6.2f}\t{:>6.2f}'

    print('\n')
    # Compute and print the statistics window by window
    for win in windows:
        start, end = win
        print(banner.format(start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')))
        print(header.format('Feature', 'Avg', 'Min', 'Max', 'Std'))
        for target_feature in ['Rainfall', 'MaxTemp', 'MinTemp']:
            # The first element (the filtered frame) is not needed for printing.
            _, *stats = time_window(df, target_feature, win)
            print(row.format(target_feature, *stats))

# Entry point: run main() only when this code is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()


Downloading weather-dataset-rattle-package.zip to /content
  0% 0.00/3.83M [00:00<?, ?B/s]
100% 3.83M/3.83M [00:00<00:00, 129MB/s]
Archive:  weather-dataset-rattle-package.zip
  inflating: weatherAUS.csv          
              Date Location  MinTemp  ...  Temp3pm  RainToday  RainTomorrow
0       2008-12-01   Albury     13.4  ...     21.8         No            No
1       2008-12-02   Albury      7.4  ...     24.3         No            No
2       2008-12-03   Albury     12.9  ...     23.2         No            No
3       2008-12-04   Albury      9.2  ...     26.5         No            No
4       2008-12-05   Albury     17.5  ...     29.7         No            No
...            ...      ...      ...  ...      ...        ...           ...
145455  2017-06-21    Uluru      2.8  ...     22.4         No            No
145456  2017-06-22    Uluru      3.6  ...     24.5         No            No
145457  2017-06-23    Uluru      5.4  ...     26.1         No            No
145458  2017-06-24    Ul