# 0.数据加载

初始化 Spark

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession # SparkSession 是Spark 2.0版本的新入口
spark = SparkSession.builder.master('local').getOrCreate()

### 格式化读取

定义schema

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

schema= StructType([
    StructField("review_id",StringType(),True),
    StructField("longitude",FloatType(),True),
    StructField("latitude",FloatType(),True),
    StructField("altitude",FloatType(),True),
    StructField("review_date",StringType(),True),
    StructField("temperature",StringType(),True),
    StructField("rating",FloatType(),True),
    StructField("user_id",StringType(),True),
    StructField("user_birthday",StringType(),True),
    StructField("user_nationality",StringType(),True),
    StructField("user_career",StringType(),True),
    StructField("user_income",FloatType(),True),
])

从hdfs加载数据，获得dataframe

注意设置 nullValue 标志

In [3]:
data = spark.read.csv(path="hdfs://localhost:9000/user/bdlab/lab1/data_all.txt.gz",header=None,nullValue='?',schema=schema,sep='|')
data.count()

4783614

In [4]:
data.take(2)

[Row(review_id='144552912', longitude=9.349848747253418, latitude=56.740875244140625, altitude=17.052772521972656, review_date='2011/06/27', temperature='18.5℃', rating=83.91000366210938, user_id='38267', user_birthday='1974-06-08', user_nationality='Switzerland', user_career='programmer', user_income=5042.0),
 Row(review_id='144552912', longitude=9.350188255310059, latitude=56.74068069458008, altitude=17.614839553833008, review_date='2016-10-08', temperature='37.8℉', rating=78.80000305175781, user_id='1205', user_birthday='1991-04-14', user_nationality='Italy', user_career='teacher', user_income=1705.0)]

In [58]:
data.groupBy('user_career').count().show()

+-----------+------+
|user_career| count|
+-----------+------+
|    teacher|607541|
|     writer|606584|
| programmer|600149|
|     farmer|600468|
| accountant|587367|
|     artist|590183|
|    Manager|595507|
|     doctor|595815|
+-----------+------+



In [14]:
607541+606584+600149+600468+587367+590183+595507+595815

4783614

### 转换RDD

In [5]:
data_rdd = data.rdd

In [6]:
data_rdd.take(2)

[Row(review_id='144552912', longitude=9.349848747253418, latitude=56.740875244140625, altitude=17.052772521972656, review_date='2011/06/27', temperature='18.5℃', rating=83.91000366210938, user_id='38267', user_birthday='1974-06-08', user_nationality='Switzerland', user_career='programmer', user_income=5042.0),
 Row(review_id='144552912', longitude=9.350188255310059, latitude=56.74068069458008, altitude=17.614839553833008, review_date='2016-10-08', temperature='37.8℉', rating=78.80000305175781, user_id='1205', user_birthday='1991-04-14', user_nationality='Italy', user_career='teacher', user_income=1705.0)]

# 1.分层抽样

映射为 pair(career,rows) 

In [7]:
# 按career分层
sample = data_rdd.map(lambda x : (x['user_career'],[x]))

In [8]:
sample.take(2)

[('programmer',
  [Row(review_id='144552912', longitude=9.349848747253418, latitude=56.740875244140625, altitude=17.052772521972656, review_date='2011/06/27', temperature='18.5℃', rating=83.91000366210938, user_id='38267', user_birthday='1974-06-08', user_nationality='Switzerland', user_career='programmer', user_income=5042.0)]),
 ('teacher',
  [Row(review_id='144552912', longitude=9.350188255310059, latitude=56.74068069458008, altitude=17.614839553833008, review_date='2016-10-08', temperature='37.8℉', rating=78.80000305175781, user_id='1205', user_birthday='1991-04-14', user_nationality='Italy', user_career='teacher', user_income=1705.0)])]

In [115]:
# dubug
# sampledf = spark.createDataFrame(sample.values()[0],schema=schema)

TypeError: 'PipelinedRDD' object does not support indexing

In [107]:
# sampledf.groupBy('user_career').count().show()

+-----------+------+
|user_career| count|
+-----------+------+
|    teacher|607541|
|     writer|606584|
| programmer|600149|
|     farmer|600468|
| accountant|587367|
|     artist|590183|
|    Manager|595507|
|     doctor|595815|
+-----------+------+



分层抽样 1%

In [9]:
# 分层函数
n = 1
def layer(x,y):
    """
    Row x , y  with same key.
    """
    global n
    n += 1 
    if n%100 is 1:
        return x+y
    return x      

In [10]:
sample_layer = sample.reduceByKey(layer)

In [12]:
sample_layer_result = sample_layer.collect()

In [13]:
for item in sample_layer_result:
    print(item[0],len(item[1]))

programmer 5906
teacher 6118
farmer 6039
doctor 5975
Manager 5911
accountant 5874
artist 5816
writer 6205


In [16]:
sample_layer_result[0][1][:3]

[Row(review_id='144552912', longitude=9.349848747253418, latitude=56.740875244140625, altitude=17.052772521972656, review_date='2011/06/27', temperature='18.5℃', rating=83.91000366210938, user_id='38267', user_birthday='1974-06-08', user_nationality='Switzerland', user_career='programmer', user_income=5042.0),
 Row(review_id='125830646', longitude=9.975799560546875, latitude=56.607295989990234, altitude=54.140926361083984, review_date='2011-08-04', temperature='52.3℉', rating=88.77999877929688, user_id='14661', user_birthday='1980-11-20', user_nationality='Italy', user_career='programmer', user_income=2128.0),
 Row(review_id='26218810', longitude=9.98774242401123, latitude=56.99723434448242, altitude=10.679211616516113, review_date='2010-02-27', temperature='-2.1℃', rating=81.68000030517578, user_id='9308', user_birthday='1986/11/09', user_nationality='Austria', user_career='programmer', user_income=2681.0)]

#### flat映射： 划分value至多行，格式化str

In [82]:
sample_layer = sample_layer.flatMap(lambda x:x[1][:])

In [77]:
def str_form(item):
    return "|".join([str(x) for x in item[:]])

In [78]:
# 格式化用于输出
sample_result = sample_layer.map(str_form)

# 抽样样本保存到 hdfs
sample_result.saveAsTextFile("hdfs://localhost:9000/user/bdlab/lab1/data_sample.txt")

# 2.数据过滤

In [83]:
sample_layer.take(2)

[Row(review_id='144552912', longitude=9.349848747253418, latitude=56.740875244140625, altitude=17.052772521972656, review_date='2011/06/27', temperature='18.5℃', rating=83.91000366210938, user_id='38267', user_birthday='1974-06-08', user_nationality='Switzerland', user_career='programmer', user_income=5042.0),
 Row(review_id='125830646', longitude=9.975799560546875, latitude=56.607295989990234, altitude=54.140926361083984, review_date='2011-08-04', temperature='52.3℉', rating=88.77999877929688, user_id='14661', user_birthday='1980-11-20', user_nationality='Italy', user_career='programmer', user_income=2128.0)]

#### 定义上下界

获得rating上下界

In [86]:
rate_sample = sample_layer.map(lambda x:x['rating']).collect()

In [94]:
rate_sample = list(filter(lambda x: isinstance(x,float),rate_sample))

In [95]:
rate_sample.sort()

In [111]:
rate_size = len(rate_sample)
rate_min = rate_sample[ int(rate_size*0.01) ]
rate_max = rate_sample[ int(rate_size*0.99)]
print(rate_min,rate_max)

59.63999938964844 95.94999694824219


longitude,latitude 界限

In [114]:
longitude_min = 8.1461259
longitude_max = 11.1993265
latitude_min = 56.5824856
latitude_max = 57.750511

### 过滤总体数据

In [119]:
def llr_filter(item):
    """
    filter item based value of longitude, latitude and rating
    """
    if not isinstance(item['longitude'],float):
        return False
    if item['longitude']<longitude_min or item['longitude']>longitude_max:
        return False
    
    if not isinstance(item['latitude'],float):
        return False
    if item['latitude']<latitude_min or item['latitude']>latitude_max:
        return False
    
    if not isinstance(item['rating'],float):
        return False
    if item['rating']<rate_min or item['rating']>rate_max:
        return False
    
    return True

In [120]:
data_filtered = data_rdd.filter(llr_filter)

In [121]:
data_filtered.count()

4681976

In [122]:
# 格式化用于输出
data_filtered_result = data_filtered.map(str_form)

# 过滤结果保存到 hdfs
data_filtered_result.saveAsTextFile("hdfs://localhost:9000/user/bdlab/lab1/data_filtered.txt")

# 3.数据标准化和归一化