# 線形重回帰による数値予測

In [29]:
import numpy as np
import pandas as pd

In [30]:
#データの読み込み
from pyspark.sql import SparkSession

In [31]:
# Get (or create) a Spark session running with the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("linear_regression")
    .getOrCreate()
)

In [32]:
# Read the semicolon-separated CSV; the first row is the header and
# column types are inferred from the data.
data = spark.read.csv(
    "./data/bank/bank-full.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

In [33]:
# Preview the first rows of the loaded data.
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

## 全データを使ったモデリング（流れの確認）
- とりあえず特徴量には数値のカラムのみ使用する
- 標準化は行わない
- one-hot encodingは行わない
- 重複行などのデータチェックは行わない

### 手順
0. 簡単なデータチェック
1. データ作成
1. モデリング
1. 係数や精度指標の確認

In [34]:
# Quick data check: print summary statistics (count, mean, ...) for every column.
data.summary().show()

+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+-----------------+------------------+------------------+--------+-----+
|summary|               age|    job| marital|education|default|           balance|housing| loan| contact|              day|month|         duration|         campaign|             pdays|          previous|poutcome|    y|
+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+-----------------+------------------+------------------+--------+-----+
|  count|             45211|  45211|   45211|    45211|  45211|             45211|  45211|45211|   45211|            45211|45211|            45211|            45211|             45211|             45211|   45211|45211|
|   mean| 40.93621021432837|   null|    null|     null|   null|1362.2720576850766|   null| null|    null|15.80641879188693| 

In [35]:
#データ作成

In [36]:
# Label column and feature columns used by the model.
target = "balance"
features = ["age", "campaign"]

# Keep only the modelling columns (features + label), preserving the column
# order of the source data. The selection is derived from `features` and
# `target` so the three definitions cannot drift apart.
# NOTE: the former `train_df = data.select(features)` was dropped here: it
# lacked the label column and was never used before the train/test split
# rebinds `train_df`.
model_columns = set(features) | {target}
linear_df = data.select([c for c in data.columns if c in model_columns])

In [37]:
# Display the list of feature column names.
features

['age', 'campaign']

In [39]:
# Data-preparation stage: assemble the feature columns into one vector column.
from pyspark.ml.feature import VectorAssembler

assemble = VectorAssembler(
    inputCols=features,
    outputCol="features",
)

In [41]:
# Multiple linear regression modelling stage.
from pyspark.ml.regression import LinearRegression

# Take the label column from `target` instead of repeating the literal, so the
# estimator stays consistent with the rest of the notebook if the target changes.
clf = LinearRegression(featuresCol="features", labelCol=target)

In [42]:
# Pipeline setup: register the stages (assembler, then regressor) and fit
# the pipeline on the full data set.
from pyspark.ml.pipeline import Pipeline

pipeline = Pipeline(
    stages=[assemble, clf],
)
model = pipeline.fit(linear_df)

In [43]:
# Run the fitted pipeline on the data and display the result,
# which now includes the `features` and `prediction` columns.
df = model.transform(linear_df)
df.show()

+---+-------+--------+----------+------------------+
|age|balance|campaign|  features|        prediction|
+---+-------+--------+----------+------------------+
| 58|   2143|       1|[58.0,1.0]|1867.1309208276969|
| 44|     29|       1|[44.0,1.0]| 1474.315799038966|
| 33|      2|       1|[33.0,1.0]| 1165.675346204963|
| 47|   1506|       1|[47.0,1.0]| 1558.490467993694|
| 33|      1|       1|[33.0,1.0]| 1165.675346204963|
| 35|    231|       1|[35.0,1.0]|1221.7917921747817|
| 28|    447|       1|[28.0,1.0]|1025.3842312804163|
| 42|      2|       1|[42.0,1.0]|1418.1993530691473|
| 58|    121|       1|[58.0,1.0]|1867.1309208276969|
| 43|    593|       1|[43.0,1.0]|1446.2575760540565|
| 41|    270|       1|[41.0,1.0]|1390.1411300842378|
| 29|    390|       1|[29.0,1.0]|1053.4424542653255|
| 53|      6|       1|[53.0,1.0]|1726.8398059031501|
| 58|     71|       1|[58.0,1.0]|1867.1309208276969|
| 57|    162|       1|[57.0,1.0]|1839.0726978427874|
| 51|    229|       1|[51.0,1.0]|1670.72335993

In [44]:
# Inspect the fitted coefficients; stage 1 of the pipeline is the regression model.
model.stages[1].coefficients

DenseVector([28.0582, -14.7855])

In [45]:
# Intercept of the fitted regression model.
model.stages[1].intercept

254.53947540939342

## 線形重回帰のモデリングと予測
学習データとテストデータに分ける

In [46]:
# Split the modelling columns into training and test sets with weights
# 0.7 / 0.3; the fixed seed keeps the split repeatable.
train_df, test_df = (
    data
    .select(["age", "balance", "campaign"])
    .randomSplit([0.7, 0.3], seed=1)
)

In [47]:
# Preview the training split.
train_df.show()

+---+-------+--------+
|age|balance|campaign|
+---+-------+--------+
| 18|      3|       2|
| 18|      5|       2|
| 18|     35|       2|
| 18|    108|       1|
| 18|    156|       2|
| 18|    348|       4|
| 18|    438|       1|
| 18|    608|       1|
| 18|    608|       1|
| 18|   1944|       3|
| 19|      0|       3|
| 19|     60|       1|
| 19|     88|       1|
| 19|     96|       3|
| 19|    103|       2|
| 19|    103|       2|
| 19|    103|       2|
| 19|    108|       1|
| 19|    108|       2|
| 19|    134|       2|
+---+-------+--------+
only showing top 20 rows



In [48]:
# Number of rows in the training split.
train_df.count()

31676

学習データでモデリング

In [49]:
# Data-preparation stage: declare the label and feature columns, then build
# the assembler that packs the features into one vector column.
from pyspark.ml.feature import VectorAssembler

target, features = "balance", ["age", "campaign"]
assemble = VectorAssembler(
    inputCols=features,
    outputCol="features",
)

In [50]:
# Multiple linear regression stage.
from pyspark.ml.regression import LinearRegression

# Take the label column from `target` instead of repeating the literal, so the
# estimator stays consistent with the declared target column.
clf = LinearRegression(featuresCol="features", labelCol=target)

In [51]:
# Register the pipeline stages and fit on the training split only.
from pyspark.ml.pipeline import Pipeline

pipeline = Pipeline(
    stages=[assemble, clf],
)
model = pipeline.fit(train_df)

In [52]:
# Run the fitted pipeline on the training split and display the predictions.
pred_train = model.transform(train_df)
pred_train.show()

+---+-------+--------+----------+-----------------+
|age|balance|campaign|  features|       prediction|
+---+-------+--------+----------+-----------------+
| 18|      3|       2|[18.0,2.0]|710.8452572678644|
| 18|      5|       2|[18.0,2.0]|710.8452572678644|
| 18|     35|       2|[18.0,2.0]|710.8452572678644|
| 18|    108|       1|[18.0,1.0]|723.7064702721563|
| 18|    156|       2|[18.0,2.0]|710.8452572678644|
| 18|    348|       4|[18.0,4.0]|685.1228312592808|
| 18|    438|       1|[18.0,1.0]|723.7064702721563|
| 18|    608|       1|[18.0,1.0]|723.7064702721563|
| 18|    608|       1|[18.0,1.0]|723.7064702721563|
| 18|   1944|       3|[18.0,3.0]|697.9840442635726|
| 19|      0|       3|[19.0,3.0]|726.5301363665992|
| 19|     60|       1|[19.0,1.0]|752.2525623751828|
| 19|     88|       1|[19.0,1.0]|752.2525623751828|
| 19|     96|       3|[19.0,3.0]|726.5301363665992|
| 19|    103|       2|[19.0,2.0]| 739.391349370891|
| 19|    103|       2|[19.0,2.0]| 739.391349370891|
| 19|    103

In [53]:
# Training RMSE, computed with scikit-learn on a pandas copy of the predictions.
from sklearn.metrics import mean_squared_error

pred_train_pandas = pred_train.toPandas()
np.sqrt(
    mean_squared_error(
        y_true=pred_train_pandas["balance"],
        y_pred=pred_train_pandas["prediction"],
    )
)

2954.785661525267

In [54]:
# Coefficient table, one row per feature.
# The row labels are read from the fitted assembler (pipeline stage 0), i.e. the
# exact column order used to build the feature vector. Deriving them from
# `train_df.columns` would mislabel coefficients whenever the DataFrame's
# column order differs from the assembler's input order.
train_cols = list(model.stages[0].getInputCols())
pd.DataFrame(
    index=train_cols,
    # Convert the Spark vector to a NumPy array explicitly before handing it to pandas.
    data=model.stages[1].coefficients.toArray(),
    columns=["coefficients"],
)

Unnamed: 0,coefficients
age,28.546092
campaign,-12.861213


テストデータによる予測

In [55]:
# Preview the test split.
test_df.show()

+---+-------+--------+
|age|balance|campaign|
+---+-------+--------+
| 18|    108|       1|
| 18|    108|       1|
| 19|      0|       4|
| 19|      4|       1|
| 19|     27|      12|
| 19|     55|       2|
| 19|     56|       1|
| 19|    291|       5|
| 19|    329|       2|
| 19|    372|       3|
| 19|    424|       3|
| 19|    608|       1|
| 19|   1169|      18|
| 19|   1247|       1|
| 19|   1803|       1|
| 20|      0|       5|
| 20|     66|       2|
| 20|     88|       1|
| 20|    167|       1|
| 20|    215|       1|
+---+-------+--------+
only showing top 20 rows



In [56]:
# Number of rows in the test split.
test_df.count()

13535

In [58]:
# Apply the model fitted on the training split to the test split and display the predictions.
pred_test = model.transform(test_df)
pred_test.show()

+---+-------+--------+-----------+-----------------+
|age|balance|campaign|   features|       prediction|
+---+-------+--------+-----------+-----------------+
| 18|    108|       1| [18.0,1.0]|723.7064702721563|
| 18|    108|       1| [18.0,1.0]|723.7064702721563|
| 19|      0|       4| [19.0,4.0]|713.6689233623074|
| 19|      4|       1| [19.0,1.0]|752.2525623751828|
| 19|     27|      12|[19.0,12.0]| 610.779219327973|
| 19|     55|       2| [19.0,2.0]| 739.391349370891|
| 19|     56|       1| [19.0,1.0]|752.2525623751828|
| 19|    291|       5| [19.0,5.0]|700.8077103580156|
| 19|    329|       2| [19.0,2.0]| 739.391349370891|
| 19|    372|       3| [19.0,3.0]|726.5301363665992|
| 19|    424|       3| [19.0,3.0]|726.5301363665992|
| 19|    608|       1| [19.0,1.0]|752.2525623751828|
| 19|   1169|      18|[19.0,18.0]|533.6119413022221|
| 19|   1247|       1| [19.0,1.0]|752.2525623751828|
| 19|   1803|       1| [19.0,1.0]|752.2525623751828|
| 20|      0|       5| [20.0,5.0]|729.35380246

In [62]:
# Test RMSE, computed with scikit-learn on a pandas copy of the predictions.
from sklearn.metrics import mean_squared_error

pred_test_pandas = pred_test.toPandas()
np.sqrt(
    mean_squared_error(
        y_true=pred_test_pandas["balance"],
        y_pred=pred_test_pandas["prediction"],
    )
)

3198.525889052522