In [1]:
# 必要なライブラリのインポート
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [2]:
# 人口構成のCSVファイルの読み込み
df = pd.read_csv("population_data/japan_population.csv")
df.head()

Unnamed: 0,集計年,都道府県コード,都道府県名,総人口
0,1960,1,北海道,5039206.0
1,1960,2,青森県,1426606.0
2,1960,3,岩手県,1448517.0
3,1960,4,宮城県,1743195.0
4,1960,5,秋田県,1335580.0


In [3]:
# 1960年から2015年までの関東地方各県の総人口データを機械学習にかける
# ある年の都道府県コードと総人口数を説明変数X, その翌年の総人口数を目的変数yに設定する
# Xは縦56x7行の横2列, yは縦56x7行の横1列でndarrayを作成する
X = np.zeros((56 * 7, 7), dtype=np.uint32)
y = np.zeros(56 * 7, dtype=np.uint32)

cnt = 0
for i in range(56 * 47):
    pref_id = df.iloc[i, 1]
    population = df.iloc[i, 3]
    next_population = df.iloc[i+47, 3]

    if pref_id >= 8 and pref_id <= 14:
        if pref_id < 14:
            X[cnt][pref_id - 8] = 1
        
        X[cnt][6] = population
        y[cnt] = next_population
        cnt += 1

# Xを確認
X[0:10]

array([[      1,       0,       0,       0,       0,       0, 2047024],
       [      0,       1,       0,       0,       0,       0, 1513624],
       [      0,       0,       1,       0,       0,       0, 1578476],
       [      0,       0,       0,       1,       0,       0, 2430871],
       [      0,       0,       0,       0,       1,       0, 2306010],
       [      0,       0,       0,       0,       0,       1, 9683802],
       [      0,       0,       0,       0,       0,       0, 3443176],
       [      1,       0,       0,       0,       0,       0, 2053000],
       [      0,       1,       0,       0,       0,       0, 1513000],
       [      0,       0,       1,       0,       0,       0, 1581000]],
      dtype=uint32)

In [4]:
# yを確認
y[0:10]

array([2053000, 1513000, 1581000, 2497000, 2356000, 9967000, 3606000,
       2056000, 1513000, 1584000], dtype=uint32)

In [5]:
# 1960年から2009年までを訓練データ
# 2010年以降のデータをテストデータとして分割する
X_train = X[:350]
X_test = X[350:]
y_train = y[:350]
y_test = y[350:]

In [22]:
# LinearRegrassionで回帰モデルを作成
model1 = LinearRegression(n_jobs=-1)
model1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [23]:
# LiearRegressionで予測実行
y_pred1 = model1.predict(X_test)
y_pred1 = y_pred1.astype(np.uint32)
y_pred1

array([ 2976504,  2011342,  2011854,  7234625,  6247455, 13182582,
        9095368,  2967043,  2003902,  2005009,  7248612,  6248144,
       13219973,  9106668,  2954454,  1996155,  1998231,  7255390,
        6231681, 13254834,  9116352,  2944770,  1990345,  1990484,
        7267011,  6232650, 13325526,  9129909,  2935086,  1984534,
        1983705,  7285410,  6240397, 13414617,  9148308,  2925379,
        1978971,  1978006,  7304327,  6253631, 13527211,  9170788],
      dtype=uint32)

In [24]:
# 正解の表示
y_test

array([ 2960000,  2000000,  2001000,  7209000,  6217000, 13198000,
        9060000,  2947000,  1992000,  1994000,  7216000,  6200000,
       13234000,  9070000,  2937000,  1986000,  1986000,  7228000,
        6201000, 13307000,  9084000,  2927000,  1980000,  1979000,
        7247000,  6209000, 13399000,  9103000,  2916976,  1974255,
        1973115,  7266534,  6222666, 13515271,  9126214,  2905000,
        1966000,  1967000,  7289000,  6236000, 13624000,  9145000],
      dtype=uint32)

In [13]:
# RandomForestRegressorで回帰モデルを作成
model2 = RandomForestRegressor(n_estimators=100)
model2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [15]:
# 予測実行と表示
y_pred2 = model2.predict(X_test)
y_pred2 = y_pred2.astype(np.uint32)
y_pred2

array([ 2970787,  2011287,  2012098,  7267193,  6277202, 13112857,
        9017121,  2968080,  2003827,  2004050,  7270453,  6277202,
       13112857,  9017121,  2959630,  1996470,  2002272,  7270453,
        6259782, 13112857,  9017121,  2954037,  1991111,  1990592,
        7278422,  6259782, 13112857,  9017121,  2946647,  1986501,
        1984223,  7320461,  6277202, 13112857,  9017121,  2942240,
        1983151,  1982962,  7320461,  6306236, 13112857,  9017121],
      dtype=uint32)

In [16]:
y_test

array([ 2960000,  2000000,  2001000,  7209000,  6217000, 13198000,
        9060000,  2947000,  1992000,  1994000,  7216000,  6200000,
       13234000,  9070000,  2937000,  1986000,  1986000,  7228000,
        6201000, 13307000,  9084000,  2927000,  1980000,  1979000,
        7247000,  6209000, 13399000,  9103000,  2916976,  1974255,
        1973115,  7266534,  6222666, 13515271,  9126214,  2905000,
        1966000,  1967000,  7289000,  6236000, 13624000,  9145000],
      dtype=uint32)

In [17]:
# SVRで回帰モデルを作成
model3 = SVR(gamma='scale')
model3.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [18]:
# 予測実行と表示
y_pred3 = model3.predict(X_test)
y_pred3 = y_pred3.astype(np.uint32)
y_pred3

array([3928213, 3928193, 3928193, 3928350, 3928321, 3928364, 3928382,
       3928213, 3928192, 3928192, 3928350, 3928321, 3928364, 3928383,
       3928213, 3928192, 3928192, 3928350, 3928320, 3928363, 3928383,
       3928212, 3928192, 3928192, 3928350, 3928320, 3928362, 3928383,
       3928212, 3928192, 3928192, 3928351, 3928321, 3928361, 3928383,
       3928212, 3928192, 3928192, 3928351, 3928321, 3928359, 3928383],
      dtype=uint32)