## 準備

In [71]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _prediction_path(model_name):
    """Return a CSV path under ./prediction/ tagged with the model name and the current local time."""
    return f'./prediction/{model_name}_{time.strftime("%Y%m%d%H%M%S")}.csv'


# Input file locations plus a factory for timestamped prediction output paths.
PATHS = {
    'train': './data/train.csv',
    'test': './data/test.csv',
    'sample_submission': './data/sample_submission.csv',
    'prediction': _prediction_path,
}

## 読み込み

In [72]:
# Read the train and test CSVs, both indexed by the row identifier column.
trainData, predictionData = (
    pd.read_csv(PATHS[split], index_col='ID_LAT_LON_YEAR_WEEK')
    for split in ('train', 'test')
)

## データの確認

In [73]:
# Preview the first five rows of the training data
trainData.head(n=5)


Unnamed: 0_level_0,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,50.843559,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,39.137194,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,35.515587,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [74]:
# Helper for inspecting missing values
def check_missing(df):
    """Summarize missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df``, with the number of null entries in
        '欠損値数' and that number divided by ``len(df)`` in '欠損値割合'.
    """
    null_counts = df.isna().sum()
    return pd.DataFrame({
        '欠損値数': null_counts,
        '欠損値割合': null_counts.div(len(df)),
    })

In [75]:
# 1. Check missing values: for each column, show the missing count and its
#    share of all rows. The loaded training frame is named trainData.
check_missing(trainData)

Unnamed: 0,欠損値数,欠損値割合
latitude,0,0.0
longitude,0,0.0
year,0,0.0
week_no,0,0.0
SulphurDioxide_SO2_column_number_density,14609,0.18487
SulphurDioxide_SO2_column_number_density_amf,14609,0.18487
SulphurDioxide_SO2_slant_column_number_density,14609,0.18487
SulphurDioxide_cloud_fraction,14609,0.18487
SulphurDioxide_sensor_azimuth_angle,14609,0.18487
SulphurDioxide_sensor_zenith_angle,14609,0.18487


In [76]:
# Derive the modelling frame from trainData without modifying trainData itself:
# drop() and dropna() both return new frames.

# Columns whose names start with "UvAerosolLayerHeight" are removed
uv_aerosol_columns = trainData.filter(regex="^UvAerosolLayerHeight", axis=1).columns

# dropna() with default arguments then removes every row that still contains a missing value
df = trainData.drop(columns=uv_aerosol_columns).dropna()

In [77]:
# Print the number of columns
print(df.shape[1])

68


In [78]:
# 各カラムのデータ型を確認
pd.set_option('display.max_rows', 100)
trainData.dtypes

latitude                                                    float64
longitude                                                   float64
year                                                          int64
week_no                                                       int64
SulphurDioxide_SO2_column_number_density                    float64
SulphurDioxide_SO2_column_number_density_amf                float64
SulphurDioxide_SO2_slant_column_number_density              float64
SulphurDioxide_cloud_fraction                               float64
SulphurDioxide_sensor_azimuth_angle                         float64
SulphurDioxide_sensor_zenith_angle                          float64
SulphurDioxide_solar_azimuth_angle                          float64
SulphurDioxide_solar_zenith_angle                           float64
SulphurDioxide_SO2_column_number_density_15km               float64
CarbonMonoxide_CO_column_number_density                     float64
CarbonMonoxide_H2O_column_number_density        

## 単純な回帰モデルを作成する

In [79]:
# 回帰モデルをめちゃくちゃ試す
# いろんな回帰モデルをimportする
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Start with every available feature: the target is 'emission', the rest are inputs
y = df['emission']
X = df.drop(columns=['emission'])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors to fit and evaluate.
# Every entry below is currently commented out, so `models` is an empty list
# and any loop over it does nothing. Uncomment the entries to run.
# NOTE(review): the trailing "done" presumably marks models that have already
# been evaluated — confirm with the author.
models = [
    # LinearRegression(), done
    # Ridge(), done
    # Lasso(), done
    # ElasticNet(),
    # SVR(),
    # DecisionTreeRegressor(), done
    # RandomForestRegressor(), done
    # AdaBoostRegressor(),
    # MLPRegressor(), done
    # KNeighborsRegressor(),
    # GaussianProcessRegressor()
]

for model in models:
    # Fit on the training split
    model.fit(X_train, y_train)
    # Predict the hold-out split
    y_pred = model.predict(X_test)
    # Report the root mean squared error on the hold-out split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print('Model: {}, RMSE: {}'.format(type(model).__name__, rmse))

In [80]:
# Prepare the submission features once, before the model loop: this work does
# not depend on the model.
# Drop the columns whose names start with "UvAerosolLayerHeight".
uv_aerosol_columns = predictionData.filter(regex="^UvAerosolLayerHeight", axis=1).columns
predictionData = predictionData.drop(columns=uv_aerosol_columns)
# Fill missing values from the previous row, then from the next row for any
# gaps left at the top. DataFrame.ffill()/bfill() are used because
# fillna(method=...) is deprecated in recent pandas releases.
predictionData = predictionData.ffill().bfill()

for model in models:
    # Predict and write one timestamped submission file per model.
    prediction = model.predict(predictionData)
    output = pd.DataFrame({'ID_LAT_LON_YEAR_WEEK': predictionData.index, 'emission': prediction})
    path = PATHS['prediction'](model.__class__.__name__)
    # Create the output directory if needed so to_csv does not fail when it is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    output.to_csv(path, index=False)