In [1]:
import json
import codecs
from pyspark.sql import SparkSession

In [2]:
# Maps the short feature names found in the first column of the "best_model"
# file to the feature names written into the exported model definition.
# Keys/values containing "_" are split on it by main() and handled as a pair
# of component features.
feature = {
    # single features
    "type": "uType",
    "cid": "categoryId",
    "pid": "parentCategoryId",
    "ctr": "reItemCtr",
    "collect1": "collectInterval",
    "add1": "addcartInterval",
    "tag": "kind",
    "uid": "hashUid",
    # paired features (two components joined by "_")
    "province_city": "uCity_province",
    "tag_city": "kind_province",
    "tag_grade": "kind_categoryGrade",
    "tag_uid": "kind_hashUid",
    # single features
    "province": "uCity",
    "sale": "sellcntInterval",
    # paired feature
    "ctr_sale": "reItemCtr_sellcntInterval",
    # single features
    "iftag": "preferenceKind",
    "ifcid": "preferenceCategory",
}

def _load_city_index(spark):
    """Return a dict mapping every distinct ``new_city`` to its row number.

    The row number is assigned by Spark SQL over the distinct ``new_city``
    values of ``songwt.city_normalize``, ordered by ``new_city``.
    """
    rows = spark.sql(
        "SELECT new_city, row_number() OVER(ORDER BY new_city) as num from (SELECT DISTINCT new_city from songwt.city_normalize) t"
    ).collect()
    return {row.new_city: row.num for row in rows}


def _parse_model_line(raw_line, city_index):
    """Convert one raw (bytes) line of "best_model" into a definition entry.

    A line is ``name<TAB>value<TAB>weight``. Returns ``None`` for blank lines,
    zero-weight features and the "(INTERCEPT)" row; otherwise returns a dict
    with keys "name", "term" and "weight".

    Raises:
        KeyError: if ``name`` is not a key of ``feature``, or a city value is
            not present in ``city_index``.
        IndexError / ValueError: if the line is malformed (too few fields,
            non-numeric weight, paired value without "_").
    """
    text = raw_line.strip().decode("utf8")
    if not text:
        # Blank lines carry no feature; skip them instead of failing below.
        return None

    fields = text.split("\t")
    name = fields[0]
    value = fields[1]
    weight = float(fields[2])

    if weight == 0.0 or name == "(INTERCEPT)":
        return None

    if len(name.split("_")) == 2:
        # Paired feature: both the mapped name and the value are split on "_".
        new_name = feature[name]
        name_parts = new_name.split("_")
        value_parts = value.split("_")
        lr_name = [name_parts[0], name_parts[1]]

        second_term = value_parts[1]
        if "province" in new_name and second_term != "null":
            # Replace the city string by its numeric index from the city table.
            second_term = city_index[second_term]
        lr_term = [value_parts[0], second_term]
    else:
        lr_name = [feature[name]]
        lr_term = [value]

    return {"name": lr_name, "term": lr_term, "weight": weight}


def main():
    """Export the model stored in "best_model" as "lr_model.json".

    Reads the tab-separated "best_model" file, maps each non-zero-weight
    feature through ``feature``, substitutes city strings with their index
    from ``songwt.city_normalize`` and writes the resulting definition list
    as JSON. Both files are opened in ``with`` blocks so they are closed (and
    the JSON flushed to disk) even when the kernel keeps running afterwards.
    """
    spark = SparkSession.builder.appName("lr_sample_train").getOrCreate()
    city_index = _load_city_index(spark)

    definition = []
    with codecs.open("best_model", "rb") as model_file:
        for raw_line in model_file:
            entry = _parse_model_line(raw_line, city_index)
            if entry is not None:
                definition.append(entry)

    _finally = {"model":{"name":"index_lr_model","model":{"type":"model/logistic", "definition":definition}}}
    with open("lr_model.json", "w") as wf:
        json.dump(_finally, wf)
    print("finished")
    
def test():
    """Rebuild the ``songwt.city_normalize`` table from "craftsman.csv".

    Each non-blank line of the CSV is split on "," and contributes one
    ``(old_city, new_city)`` row taken from columns 0 and 3. The table is
    written as parquet in overwrite mode.

    Raises:
        IndexError: if a non-blank line has fewer than four columns.
    """
    spark = SparkSession.builder.appName("lr_sample_train").getOrCreate()
    city_pairs = []
    # The context manager guarantees the CSV handle is released even if a
    # malformed row raises part-way through.
    with codecs.open("craftsman.csv", "rb") as csv_file:
        for raw_line in csv_file:
            text = raw_line.strip().decode("utf8")
            if not text:
                # Skip blank lines rather than failing on the column lookup.
                continue
            columns = text.split(",")
            city_pairs.append((columns[0], columns[3]))
    df = spark.createDataFrame(city_pairs, ['old_city', 'new_city'])
    df.write.format("parquet").mode("overwrite").saveAsTable("songwt.city_normalize")
    print("finished")

if __name__ == "__main__":
    # Export the model from "best_model" to "lr_model.json".
    main()
    # test()  # one-off: rebuilds songwt.city_normalize (read by main) from craftsman.csv
        

{'': 1, '上海': 2, '东南': 3, '临安': 4, '临沧': 5, '丽水': 6, '丽江': 7, '义乌': 8, '云南': 9, '佛山': 10, '保定': 11, '内蒙古': 12, '凉山': 13, '北京': 14, '南京': 15, '南昌': 16, '南通': 17, '南阳': 18, '厦门': 19, '台州': 20, '台湾': 21, '吉林': 22, '嘉兴': 23, '四川': 24, '四日': 25, '国外': 26, '大理': 27, '天津': 28, '宁夏': 29, '宁波': 30, '安徽': 31, '宜兴': 32, '宣城': 33, '山东': 34, '山西': 35, '广东': 36, '广州': 37, '广西': 38, '成都': 39, '扬州': 40, '新疆': 41, '新馀': 42, '无锡': 43, '日本': 44, '景德镇': 45, '杭州': 46, '武夷山': 47, '武汉': 48, '江苏': 49, '江西': 50, '江门': 51, '河北': 52, '河南': 53, '泉州': 54, '浙江': 55, '海南': 56, '深圳': 57, '温州': 58, '湖北': 59, '湖南': 60, '湖州': 61, '甘肃': 62, '眉山': 63, '石家庄': 64, '福州': 65, '福建': 66, '绍兴': 67, '苏州': 68, '茶道': 69, '莆田': 70, '衢州': 71, '西双版纳': 72, '西安': 73, '西藏': 74, '贵州': 75, '辽宁': 76, '重庆': 77, '金华': 78, '陕西': 79, '雅安': 80, '青岛': 81, '青海': 82, '香港': 83, '黄山': 84, '黑龙江': 85, '龙泉': 86}
finished
