In [1]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [2]:
df = sqlContext.read.csv('hdfs://master:9000/user/hadoop/data/population/PEOPLE_DONG_*.csv', encoding='utf-8', header=True)
df.show()

+--------+--------+------------------+------------------+
|    Date|    Dong|            Korean|         Foreigner|
+--------+--------+------------------+------------------+
|20181001|11650540|       844996.1385| 35918.26980000001|
|20181001|11500560|       323294.1939| 9496.722299999998|
|20181001|11500550|       273868.1758|        14069.8514|
|20181001|11500540|1011135.1059999999| 55369.08840000001|
|20181001|11500535| 682224.7902000003|         17591.523|
|20181001|11500530|       320394.0067| 8381.150699999998|
|20181001|11500520| 418434.2847000001|        10502.9019|
|20181001|11500510|       639723.6102|12931.091199999999|
|20181001|11470680|       455347.2532| 8172.707399999999|
|20181001|11470670| 342475.7120000001|         6246.6514|
|20181001|11470650|       683034.5879| 37323.77840000001|
|20181001|11470640| 693075.5993000001|        22115.5823|
|20181001|11470630|       264390.8291|         8286.0503|
|20181001|11470620|       310167.9084|10245.055199999999|
|20181001|1147

In [3]:
from pyspark.sql.types import IntegerType, DoubleType

In [4]:
df_needs = df[df['Date'] <= 20180331]
df_needs = df_needs.withColumn('Korean', df_needs['Korean'].cast(DoubleType()))
df_needs = df_needs.withColumn('Foreigner', df_needs['Foreigner'].cast(DoubleType()))
df_needs.show()

+--------+--------+------------------+------------------+
|    Date|    Dong|            Korean|         Foreigner|
+--------+--------+------------------+------------------+
|20180101|11110515|319425.17959999986|10974.346700000002|
|20180101|11110530| 494192.9028999999|33892.802800000005|
|20180101|11110540|       113893.2746| 9420.881100000002|
|20180101|11110550|296429.03169999993|16366.695899999999|
|20180101|11110560| 454602.6883000001|        15273.9189|
|20180101|11110570|       144968.5139| 4615.905100000001|
|20180101|11110580|       171402.9139| 8410.867799999998|
|20180101|11110600|       116639.5573|         7563.7665|
|20180101|11110615|      1037569.4191| 96341.90960000001|
|20180101|11110630|249625.16680000004|50755.090399999994|
|20180101|11110640|       380193.6807|        35893.0767|
|20180101|11110650|       647593.3987|179461.02030000003|
|20180101|11110670|       209789.6862| 68824.49090000002|
|20180101|11110680|       233995.1517|        93274.5834|
|20180101|1111

In [5]:
df_sum = df_needs.groupBy('Dong').sum('Korean', 'Foreigner') \
    .withColumnRenamed('sum(Korean)', 'Korean') \
    .withColumnRenamed('sum(Foreigner)', 'Foreigner')

df_sum = df_sum.withColumn('Korean', df_sum['Korean'].cast(IntegerType()))
df_sum = df_sum.withColumn('Foreigner', df_sum['Foreigner'].cast(IntegerType()))

df_sum.show()

+--------+---------+---------+
|    Dong|   Korean|Foreigner|
+--------+---------+---------+
|11410660|139116312|  4891592|
|11530770|177618139| 16484677|
|11590520|207978813| 21610133|
|11215820|243033354| 38141462|
|11110640|249337625| 23261675|
|11140605|196659655| 12810401|
|11320690|288049365|  5788424|
|11545510|614279089|107534825|
|11545670|291529789| 40697511|
|11560550|230125204| 14146248|
|11620545| 80823491|  2831788|
|11230570|154904180|  4964563|
|11320680|157372252|  3272477|
|11440555|258255498| 15521211|
|11470630|123663368|  3493622|
|11350580|263438094|  4781984|
|11350710|178551176|  1338910|
|11440720|184951514|  8974997|
|11680521|469435018| 28429211|
|11410585|672597429|172744020|
+--------+---------+---------+
only showing top 20 rows



In [6]:
df_total = df_sum.withColumn('Total', df_sum['Korean'] + df_sum['Foreigner'])
df_total.show()

+--------+---------+---------+---------+
|    Dong|   Korean|Foreigner|    Total|
+--------+---------+---------+---------+
|11410660|139116312|  4891592|144007904|
|11530770|177618139| 16484677|194102816|
|11590520|207978813| 21610133|229588946|
|11215820|243033354| 38141462|281174816|
|11110640|249337625| 23261675|272599300|
|11140605|196659655| 12810401|209470056|
|11320690|288049365|  5788424|293837789|
|11545510|614279089|107534825|721813914|
|11545670|291529789| 40697511|332227300|
|11560550|230125204| 14146248|244271452|
|11620545| 80823491|  2831788| 83655279|
|11230570|154904180|  4964563|159868743|
|11320680|157372252|  3272477|160644729|
|11440555|258255498| 15521211|273776709|
|11470630|123663368|  3493622|127156990|
|11350580|263438094|  4781984|268220078|
|11350710|178551176|  1338910|179890086|
|11440720|184951514|  8974997|193926511|
|11680521|469435018| 28429211|497864229|
|11410585|672597429|172744020|845341449|
+--------+---------+---------+---------+
only showing top

In [11]:
maxTotal = df_total.agg({'Total': 'max'}).collect()[0][0]
maxForeigner = df_total.agg({'Foreigner': 'max'}).collect()[0][0]

In [12]:
df_norm = df_total.withColumn('Total_Norm', df_total['Total'] / maxTotal)
df_norm = df_total.withColumn('Foreigner_Norm', df_total['Foreigner'] / maxForeigner)
df_norm.show()

+--------+---------+---------+---------+--------------------+
|    Dong|   Korean|Foreigner|    Total|      Foreigner_Norm|
+--------+---------+---------+---------+--------------------+
|11410660|139116312|  4891592|144007904|0.028316997601422034|
|11530770|177618139| 16484677|194102816|  0.0954283511521846|
|11590520|207978813| 21610133|229588946|  0.1250991669639273|
|11215820|243033354| 38141462|281174816| 0.22079758245755773|
|11110640|249337625| 23261675|272599300|  0.1346597989325477|
|11140605|196659655| 12810401|209470056| 0.07415828924208201|
|11320690|288049365|  5788424|293837789| 0.03350867949003387|
|11545510|614279089|107534825|721813914|  0.6225096822454403|
|11545670|291529789| 40697511|332227300| 0.23559432621748644|
|11560550|230125204| 14146248|244271452| 0.08189139050949491|
|11620545| 80823491|  2831788| 83655279|0.016392972677143902|
|11230570|154904180|  4964563|159868743|0.028739420328414263|
|11320680|157372252|  3272477|160644729|0.018944082695308354|
|1144055

In [13]:
df_norm[['Dong', 'Foreigner', 'Foreigner_Norm']].repartition(1).write.csv('hdfs://master:9000/user/hadoop/foreigner_0614')