Wed, Jan 13, 2021

In [1]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SQLContext
# Global plot settings: select the 'Malgun Gothic' font family (presumably so
# Korean labels render correctly -- confirm the font is installed) and turn off
# the Unicode minus glyph for negative numbers on the axes.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [2]:
# Reuse the SparkContext that is already running when this cell is re-executed:
# constructing a second SparkContext in the same kernel raises a ValueError.
# The 'local' master is only applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))
# SQL entry point used below to read the CSV file into a DataFrame.
sqlctx = SQLContext(sc)

In [4]:
# Load the weight dataset. The first CSV row provides the column names and
# Spark infers each column's type from the data.
df = (
    sqlctx.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('../data/data1/weight.csv')
)
df.show()

+----+------+------+----------+------+---------+----------+
|year|height|weight|     grade|gender|gradecode|gendercode|
+----+------+------+----------+------+---------+----------+
|2017| 152.5|  47.9|elementary|   man|        2|         1|
|2017| 153.2|  46.6|elementary| woman|        2|         0|
|2017| 170.6|  63.8|    middle|   man|        0|         1|
|2017| 160.4|  54.2|    middle| woman|        0|         0|
|2017| 173.9|  72.3|      high|   man|        1|         1|
|2017| 160.9|  57.7|      high| woman|        1|         0|
+----+------+------+----------+------+---------+----------+



# 다중 선형 회귀
### 특성 데이터
- height, gradecode, gendercode

### 라벨 
- weight

In [13]:
# Combine the three predictor columns into one vector column named 'features'.
# NOTE(review): gradecode looks like an arbitrary category code
# (middle=0, high=1, elementary=2 in the table above); feeding it in as a plain
# number imposes an ordering on the grades -- consider one-hot encoding, confirm.
f = (
    VectorAssembler()
    .setInputCols(['height', 'gradecode', 'gendercode'])
    .setOutputCol('features')
)
v_df = f.transform(df)
v_df.show()

+----+------+------+----------+------+---------+----------+---------------+
|year|height|weight|     grade|gender|gradecode|gendercode|       features|
+----+------+------+----------+------+---------+----------+---------------+
|2017| 152.5|  47.9|elementary|   man|        2|         1|[152.5,2.0,1.0]|
|2017| 153.2|  46.6|elementary| woman|        2|         0|[153.2,2.0,0.0]|
|2017| 170.6|  63.8|    middle|   man|        0|         1|[170.6,0.0,1.0]|
|2017| 160.4|  54.2|    middle| woman|        0|         0|[160.4,0.0,0.0]|
|2017| 173.9|  72.3|      high|   man|        1|         1|[173.9,1.0,1.0]|
|2017| 160.9|  57.7|      high| woman|        1|         0|[160.9,1.0,0.0]|
+----+------+------+----------+------+---------+----------+---------------+



In [14]:
# Keep only what the regression needs: the label column and the feature vector.
v_df = v_df.select(['weight', 'features'])
v_df.show()

+------+---------------+
|weight|       features|
+------+---------------+
|  47.9|[152.5,2.0,1.0]|
|  46.6|[153.2,2.0,0.0]|
|  63.8|[170.6,0.0,1.0]|
|  54.2|[160.4,0.0,0.0]|
|  72.3|[173.9,1.0,1.0]|
|  57.7|[160.9,1.0,0.0]|
+------+---------------+



In [19]:
# Linear regression of the 'weight' label on the assembled 'features' vector.
lr = LinearRegression(featuresCol='features', labelCol='weight')
lr.setMaxIter(100)    # maximum number of optimizer iterations
lr.setRegParam(0.01)  # regularization strength (this is not a learning rate)
lr_model = lr.fit(v_df)  # train the model on the prepared DataFrame

# 예측 모델

In [25]:
# Display the fitted coefficients, one per entry of the 'features' vector
# (height, gradecode, gendercode -- the order given to the VectorAssembler).
lr_model.coefficients

DenseVector([1.2441, 1.992, -0.8302])

In [27]:
# Pull out the three fitted weights in feature order:
# w1 -> height, w2 -> gradecode, w3 -> gendercode.
w1, w2, w3 = (lr_model.coefficients[i] for i in range(3))

In [28]:
# Intercept (bias term) of the fitted regression model.
b = lr_model.intercept

In [32]:
# Evaluate the fitted equation by hand with every feature set to zero,
# which leaves only the intercept.
x1, x2, x3 = 0, 0, 0

# Terms are added in the same left-to-right order as the written-out formula.
y = sum((w1 * x1, w2 * x2, w3 * x3)) + b
print(y)

-145.94155709709023


## 키가 170이고 gradecode가 2(초등학생), gendercode가 1(남자)일 때의 예상 몸무게

In [36]:
# Manual prediction for one sample: height 170, gradecode 2, gendercode 1.
x1, x2, x3 = 170, 2, 1

# Weighted sum of the inputs (same term order as w1*x1 + w2*x2 + w3*x3) plus the intercept.
y = sum(w * x for w, x in zip((w1, w2, w3), (x1, x2, x3))) + b
print('예상 몸무게는:', y.round(2))

예상 몸무게는: 68.72
