## 과제
* Input data
    + Big five personality test records whose country is KR or US
    + Download data-final.csv file from https://www.kaggle.com/tunguz/big-five-personality-test
* Assignments
    + Do clustering with KMeans and GaussianMixture
    + For each clustering method, compute the Pearson correlation matrix between the country attribute  
    and the probability distribution (or cluster assignment) indicating whether a person belongs to Group 1 or 2
* Code submission
각 클러스터에 속할 확률과 응답자 국가 간의 Pearson correlation을 출력하는 jupyter notebook을 작성하고  
파일이름을 '학생이름.ipynb'으로 하여 업로드 하시오. 

In [1]:
import time                     # used to measure the notebook's total run time
start_time = time.time()

import findspark
findspark.init()

# Create the Spark session.
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Local master using all available cores, with 12g requested for both
# the driver and the executor memory settings.
conf = (
    SparkConf()
    .setAppName("quiz_9")
    .setMaster("local[*]")
    .set("spark.driver.memory", "12g")
    .set("spark.executor.memory", "12g")
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext



# Data loading & schema configuration

Field types
* EXT1 ~ OPN10: float
* country: string 

In [2]:
# Column names of data-final.csv, in file order:
#   1. 50 question columns: EXT1..EXT10, EST1..EST10, AGR1..AGR10, CSN1..CSN10, OPN1..OPN10
#   2. the same 50 names with an "_E" suffix
#   3. 10 trailing metadata columns (including 'country')
_TRAITS = ('EXT', 'EST', 'AGR', 'CSN', 'OPN')
_question_cols = [f'{trait}{number}' for trait in _TRAITS for number in range(1, 11)]
_suffixed_cols = [f'{name}_E' for name in _question_cols]
_meta_cols = [
    'dateload', 'screenw', 'screenh', 'introelapse', 'testelapse',
    'endelapse', 'IPC', 'country', 'lat_appx_lots_of_err', 'long_appx_lots_of_err',
]
cols = _question_cols + _suffixed_cols + _meta_cols

In [3]:
from pyspark.sql.types import *

# Explicit schema for data-final.csv: 'country' is read as a string, every
# other column as a float.
# Fields are declared nullable: the loaded DataFrame reports nullable = true
# for these columns regardless, and the Imputer step later in the notebook
# relies on missing answers being present as nulls.
schema = StructType([
    StructField(name, StringType() if name == 'country' else FloatType(), True)
    for name in cols
])

# The file is tab-delimited and has a header row.
raw_data = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", "\t")
    .schema(schema)
    .load("data-final.csv")
)

In [4]:
# Keep the first 50 columns (EXT1..OPN10) plus 'country'; drop everything else.
cols_to_use = [*cols[:50], 'country']
_keep = set(cols_to_use)
cols_to_drop = [name for name in cols if name not in _keep]
raw_data = raw_data.drop(*cols_to_drop)

In [5]:
# Preview the first two rows after column pruning.
raw_data.show(n=2)

+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
|EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN2|OPN3|OPN4|OPN5|OPN6|OPN7|OPN8|OPN9|OPN10|country|
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
| 4.0| 1.0| 5.0| 2.0| 5.0| 1.0| 5.0| 2.0| 4.0|  1.0| 1.0| 4.0| 4.0| 2.0| 2.0| 2.0| 2.0| 2.0| 3.0|  2.0| 2.0| 5.0| 2.0| 4.0| 2.0| 3.0| 2.0| 4.0| 3.0|  4.0| 3.0| 4.0| 3.0| 2.0| 2.0| 4.0| 4.0| 2.0| 4.0|  4.0|

In [6]:
# Print the column names, types and nullability of raw_data.
raw_data.printSchema()

root
 |-- EXT1: float (nullable = true)
 |-- EXT2: float (nullable = true)
 |-- EXT3: float (nullable = true)
 |-- EXT4: float (nullable = true)
 |-- EXT5: float (nullable = true)
 |-- EXT6: float (nullable = true)
 |-- EXT7: float (nullable = true)
 |-- EXT8: float (nullable = true)
 |-- EXT9: float (nullable = true)
 |-- EXT10: float (nullable = true)
 |-- EST1: float (nullable = true)
 |-- EST2: float (nullable = true)
 |-- EST3: float (nullable = true)
 |-- EST4: float (nullable = true)
 |-- EST5: float (nullable = true)
 |-- EST6: float (nullable = true)
 |-- EST7: float (nullable = true)
 |-- EST8: float (nullable = true)
 |-- EST9: float (nullable = true)
 |-- EST10: float (nullable = true)
 |-- AGR1: float (nullable = true)
 |-- AGR2: float (nullable = true)
 |-- AGR3: float (nullable = true)
 |-- AGR4: float (nullable = true)
 |-- AGR5: float (nullable = true)
 |-- AGR6: float (nullable = true)
 |-- AGR7: float (nullable = true)
 |-- AGR8: float (nullable = true)
 |-- AGR9: fl

In [7]:
# Keep only respondents whose country is KR or US.
raw_data = raw_data.filter(raw_data['country'].isin('KR', 'US'))

In [8]:
# Row count before and after trimming the data set.
# NOTE(review): limit() is applied without an ordering, so which rows are
# dropped is not specified; the reason for trimming to exactly 547990 rows
# is not visible in this notebook -- confirm it is still needed.
ROW_LIMIT = 547990

print(raw_data.count())
raw_data = raw_data.limit(ROW_LIMIT)
print(raw_data.count())
raw_data.show(n=3)

547996
547990
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
|EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN2|OPN3|OPN4|OPN5|OPN6|OPN7|OPN8|OPN9|OPN10|country|
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
| 4.0| 3.0| 4.0| 3.0| 3.0| 3.0| 5.0| 3.0| 4.0|  3.0| 2.0| 4.0| 4.0| 2.0| 4.0| 2.0| 2.0| 2.0| 4.0|  4.0| 1.0| 2.0| 1.0| 5.0| 3.0| 5.0| 3.0| 4.0| 4.0|  5.0| 3.0| 2.0| 4.0| 2.0| 1.0| 4.0| 4.0| 2

# Data processing steps

1. Imputer: replace <code>None</code> with average values
1. VectorAssembler: create a feature vector with all scores for the 50 questions
1. PCA: reduce the dimensionality to 10
1. GaussianMixture (or KMeans): clustering
1. StringIndexer: represent country attributes with integer IDs
1. OneHotEncoder: encode country IDs as one-hot vectors
1. VectorAssembler: concatenate cluster probability vector and one-hot encoded country ID vector
1. Correlation: compute correlation matrix

In [9]:
from pyspark.ml.feature import Imputer

# Fill missing answers in the 50 question columns in place (input and output
# column names are identical, so the imputed values overwrite the originals).
score_cols = raw_data.columns[:50]
imputer = Imputer(inputCols=score_cols, outputCols=score_cols)
imputer_model = imputer.fit(raw_data)
raw_data = imputer_model.transform(raw_data)

In [10]:
# Report the imputation strategy and the row count, then preview two rows.
print(imputer.getStrategy(), raw_data.count(), sep="\n")
raw_data.show(n=2)

mean
547990
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
|EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN2|OPN3|OPN4|OPN5|OPN6|OPN7|OPN8|OPN9|OPN10|country|
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+
| 4.0| 3.0| 4.0| 3.0| 3.0| 3.0| 5.0| 3.0| 4.0|  3.0| 2.0| 4.0| 4.0| 2.0| 4.0| 2.0| 2.0| 2.0| 4.0|  4.0| 1.0| 2.0| 1.0| 5.0| 3.0| 5.0| 3.0| 4.0| 4.0|  5.0| 3.0| 2.0| 4.0| 2.0| 1.0| 4.0| 4.0| 2.0

In [11]:
from pyspark.ml.feature import VectorAssembler

# Names of the first 50 columns of raw_data (everything before 'country').
input_cols = raw_data.columns[:50]

In [12]:
# Pack the 50 question columns into a single vector column "raw_features".
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="raw_features",
)
df1 = assembler.transform(raw_data)

In [13]:
# Preview the assembled feature vectors.
df1.select("raw_features").show(n=3)

+--------------------+
|        raw_features|
+--------------------+
|[4.0,3.0,4.0,3.0,...|
|[3.0,2.0,2.0,4.0,...|
|[1.0,2.0,3.0,4.0,...|
+--------------------+
only showing top 3 rows



In [14]:
from pyspark.ml.feature import PCA

# Project the 50-dimensional answer vector onto 10 principal components;
# the result is stored in the "features" column used by the clustering steps.
pca = PCA(k=10, inputCol="raw_features", outputCol="features")
pca_model = pca.fit(df1)
df2 = pca_model.transform(df1)
df2.show(n=3)

+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+--------------------+--------------------+
|EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN2|OPN3|OPN4|OPN5|OPN6|OPN7|OPN8|OPN9|OPN10|country|        raw_features|            features|
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+--------------------+--------------------+
| 4.0| 3.0| 4.0| 3.0| 3.0| 3.0| 5.0| 3.0| 4.0|  3.0| 2.0| 4.0| 4.0| 2.0| 4.0| 2

In [15]:
# Preview the PCA output (same view as at the end of the PCA cell).
df2.show(n=3)

+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+--------------------+--------------------+
|EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN2|OPN3|OPN4|OPN5|OPN6|OPN7|OPN8|OPN9|OPN10|country|        raw_features|            features|
+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+-------+--------------------+--------------------+
| 4.0| 3.0| 4.0| 3.0| 3.0| 3.0| 5.0| 3.0| 4.0|  3.0| 2.0| 4.0| 4.0| 2.0| 4.0| 2

## GaussianMixture

In [16]:
from pyspark.ml.clustering import GaussianMixture
# Two-component Gaussian mixture on the PCA features.
# tol is the convergence tolerance (early-stopping condition); seed fixes the
# random initialisation.
gm = GaussianMixture(featuresCol="features", k=2, tol=0.0001, seed=10)
gm_model = gm.fit(df2)
# transform() adds the "prediction" and "probability" columns.
df3 = gm_model.transform(df2)

In [17]:
# Probability of belonging to each cluster, next to the assigned cluster.
df3.select("prediction", "probability").show(n=3, truncate=False)

+----------+-----------------------------------------+
|prediction|probability                              |
+----------+-----------------------------------------+
|0         |[0.9962795654227283,0.003720434577271679]|
|1         |[0.4722966849601867,0.5277033150398134]  |
|0         |[0.9430267462702172,0.05697325372978272] |
+----------+-----------------------------------------+
only showing top 3 rows



In [18]:
# StringIndexer: map every distinct string in the "country" column to a
# numeric index (here US -> 0.0, KR -> 1.0).
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="country", outputCol="country_int")
indexer_model = indexer.fit(df3)
df4 = indexer_model.transform(df3)

In [19]:
# Check the country -> index mapping.
df4.select("country", "country_int").show(n=3)

+-------+-----------+
|country|country_int|
+-------+-----------+
|     US|        0.0|
|     US|        0.0|
|     US|        0.0|
+-------+-----------+
only showing top 3 rows



In [20]:
from pyspark.ml.feature import OneHotEncoder
# One-hot encode the country index. dropLast=False keeps one slot per
# country, so the resulting vector has two entries.
encoder = OneHotEncoder(inputCol="country_int", outputCol="country_onehot", dropLast=False)
# OneHotEncoder is a plain Transformer in Spark 2.x but an Estimator in
# Spark 3.x, where it has to be fitted before it can transform. Fit when the
# object supports it so the cell runs on both versions.
_onehot_stage = encoder.fit(df4) if hasattr(encoder, "fit") else encoder
df5 = _onehot_stage.transform(df4)

In [21]:
# Show the encoding for the first rows and, separately, for KR respondents.
# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so emitted a stray "None" line.
df5.select("country", "country_int", "country_onehot").show(3)
df5.filter(df5["country"] == "KR").select("country", "country_int", "country_onehot").show(3)

+-------+-----------+--------------+
|country|country_int|country_onehot|
+-------+-----------+--------------+
|     US|        0.0| (2,[0],[1.0])|
|     US|        0.0| (2,[0],[1.0])|
|     US|        0.0| (2,[0],[1.0])|
+-------+-----------+--------------+
only showing top 3 rows

None
+-------+-----------+--------------+
|country|country_int|country_onehot|
+-------+-----------+--------------+
|     KR|        1.0| (2,[1],[1.0])|
|     KR|        1.0| (2,[1],[1.0])|
|     KR|        1.0| (2,[1],[1.0])|
+-------+-----------+--------------+
only showing top 3 rows



In [22]:
# Concatenate the cluster probability vector and the one-hot country vector
# into one vector column for the correlation computation.
assembler = VectorAssembler(
    inputCols=["probability", "country_onehot"],
    outputCol="corr_features",
)
df6 = assembler.transform(df5)

In [23]:
# Preview the combined [cluster probabilities, country one-hot] vectors.
df6.select("corr_features").show(n=3, truncate=False)

+-------------------------------------------------+
|corr_features                                    |
+-------------------------------------------------+
|[0.9962795654227283,0.003720434577271679,1.0,0.0]|
|[0.4722966849601867,0.5277033150398134,1.0,0.0]  |
|[0.9430267462702172,0.05697325372978272,1.0,0.0] |
+-------------------------------------------------+
only showing top 3 rows



In [24]:
# Pearson correlation between cluster probabilities and respondent country.
# Rows/columns follow corr_features: the two cluster probabilities, then the
# two one-hot country slots.
from pyspark.ml.stat import Correlation
Correlation.corr(df6, column='corr_features', method='pearson').first()

Row(pearson(corr_features)=DenseMatrix(4, 4, [1.0, -1.0, 0.0116, -0.0116, -1.0, 1.0, -0.0116, 0.0116, 0.0116, -0.0116, 1.0, -1.0, -0.0116, 0.0116, -1.0, 1.0], False))

## KMeans

In [25]:
from pyspark.ml.clustering import KMeans
# Two-cluster KMeans on the PCA features.
kmeans = KMeans(featuresCol="features", k=2, tol=0.0001, seed=10)
kmeans_model = kmeans.fit(df2)
# transform() adds "prediction" only; no probability column is provided.
df7 = kmeans_model.transform(df2)

In [26]:
# Cluster assignment for each respondent (KMeans gives a hard label, not a probability).
df7.select("prediction").show(n=3)

+----------+
|prediction|
+----------+
|         1|
|         0|
|         0|
+----------+
only showing top 3 rows



In [27]:
# StringIndexer: map every distinct string in the "country" column to a
# numeric index (here US -> 0.0, KR -> 1.0).
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="country", outputCol="country_int")
indexer_model = indexer.fit(df7)
df8 = indexer_model.transform(df7)

In [28]:
# Check the country -> index mapping.
df8.select("country", "country_int").show(n=3)

+-------+-----------+
|country|country_int|
+-------+-----------+
|     US|        0.0|
|     US|        0.0|
|     US|        0.0|
+-------+-----------+
only showing top 3 rows



In [29]:
from pyspark.ml.feature import OneHotEncoder
# One-hot encode the country index. dropLast=False keeps one slot per
# country, so the resulting vector has two entries.
encoder = OneHotEncoder(inputCol="country_int", outputCol="country_onehot", dropLast=False)
# OneHotEncoder is a plain Transformer in Spark 2.x but an Estimator in
# Spark 3.x, where it has to be fitted before it can transform. Fit when the
# object supports it so the cell runs on both versions.
_onehot_stage = encoder.fit(df8) if hasattr(encoder, "fit") else encoder
df9 = _onehot_stage.transform(df8)

In [30]:
# Concatenate the cluster label and the one-hot country vector into one
# vector column for the correlation computation.
assembler = VectorAssembler(
    inputCols=["prediction", "country_onehot"],
    outputCol="corr_features",
)
df10 = assembler.transform(df9)

In [31]:
# Preview the combined [cluster label, country one-hot] vectors.
df10.select("corr_features").show(n=3, truncate=False)

+-------------+
|corr_features|
+-------------+
|[1.0,1.0,0.0]|
|[0.0,1.0,0.0]|
|[0.0,1.0,0.0]|
+-------------+
only showing top 3 rows



In [32]:
# Pearson correlation between cluster assignment and respondent country.
# Rows/columns follow corr_features: the cluster label, then the two one-hot
# country slots.
from pyspark.ml.stat import Correlation
Correlation.corr(df10, column='corr_features', method='pearson').first()

Row(pearson(corr_features)=DenseMatrix(3, 3, [1.0, 0.0102, -0.0102, 0.0102, 1.0, -1.0, -0.0102, -1.0, 1.0], False))

In [33]:
# Total wall-clock time since the first cell, reported as minutes and seconds.
spend_time = time.time() - start_time
minutes, seconds = divmod(spend_time, 60)
print(f'수행시간(20core, memory 24G) : {int(minutes)}분 {seconds:.2f}초')

수행시간(20core, memory 24G) : 18분 11.06초
