In [1]:
import pandas as pd
import pymysql
from pyhive import hive
import pandas as pd
from pyspark import SparkContext,SQLContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import datetime


In [2]:
spark=SparkSession \
        .builder \
        .config("spark.eventLog.enabled", "false") \
        .config("spark.executor.memory", "4g")\
        .config("spark.driver.memory", "8g")\
        .config("spark.cores.max", "10")\
        .config("spark.task.maxFailures", "1000")\
        .config("spark.default.parallelism", "500")\
        .config("spark.sql.shuffle.partitions",50)\
        .appName('renhang_etl') \
        .master('yarn')\
        .getOrCreate()

In [5]:
class hiveParse(object):
    """
    该方法用于读取hive 数据，并转化成pandas dataframe
    """
    def __init__(self, host='', port=10000, user='', passwd='', dbName=''):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName

    def conn_hive(self):
        self.connection = hive.Connection(host=self.host,port=self.port,username=self.user)

    def hive_connect(self,sql_select):
        cur = self.connection.cursor()
        cur.execute(sql_select)
        columns = [col[0] for col in cur.description]
        data = cur.fetchall()
        query_df = pd.DataFrame(data)
        query_df.columns = columns
        return query_df

class mysqldbParse(object):
    """
    该方法用于连接mysql，并执行相应操作
    """

    def __init__(self, host='', port=3306, user='', passwd='', dbName='', charset='utf8', connect_timeout=31536000):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName
        self.charset = charset
        self.connect_timeout = connect_timeout

    def conn_mysql(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.passwd,
                                    db=self.dbName,
                                    charset=self.charset,
                                    connect_timeout=self.connect_timeout,
                                    cursorclass=pymysql.cursors.DictCursor)

    def insert_sql(self, insertSql):
        with self.conn.cursor() as cursor:
            cursor.execute(insertSql)
            self.conn.commit()

    def select_Sql(self, selectSql):
        df = pd.read_sql(selectSql, self.conn)
        return df

    def close(self):
        self.conn.close()

In [6]:
def hive_df(sql):
    host = "58.59.18.61"
    port = 10000
    user = "hdfs"
    dbName = "ods"
    hive_parse = hiveParse(host,port,user,dbName)
    hive_parse.conn_hive()
    hive_pd = hive_parse.hive_connect(sql)
    return hive_pd

def mysql_df(selectSql):
    host = "58.59.11.86"
    port = 3306
    user = "tangdinghai"
    passwd = "Tangdinghai2019!"
    dbName = "reportpublic"
    mysql_parse = mysqldbParse(host,port,user,passwd,dbName)
    mysql_parse.conn_mysql()
    mysql_pd = mysql_parse.select_Sql(selectSql)
    mysql_parse.close()
    return mysql_pd



In [68]:
#### 头条分发模型：有授信额度相关客户级和订单级汇总监控报表---日报

In [26]:
base_dict = {'score_xgb':['>=0.093300','[0.068954,0.093300)','[0.055221,0.068954)','[0.045881,0.055221)','[0.038707,0.045881)','[0.032669,0.038707)','[0.027251,0.032669)','[0.022232,0.027251)','[0.016447,0.022232)','<0.016447','汇总'],\
             'toutiao_develop':['10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','520']}
dairy_report = pd.DataFrame(base_dict)


In [27]:
table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_credit_credit_info") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_credit_credit_info")

table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_credit_credit_apply") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_credit_credit_apply")

table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_loan_invoice") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_loan_invoice")

In [28]:
hive_sql = """


select
	score_xgb,
	count(distinct(user_id)) as apply_uid_%s,
    count(distinct(session_id)) as apply_sessionId_%s
from
(
	select
		user_id,
        session_id,
		case
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.093300 then ">==0.093300"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.068954 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.093300 then "[0.068954,0.093300)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.055221 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.068954 then "[0.055221,0.068954)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.045881 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.055221 then "[0.045881,0.055221)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.038707 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.045881 then "[0.038707,0.045881)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.032669 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.038707 then "[0.032669,0.038707)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.027251 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.032669 then "[0.027251,0.032669)"			
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.022232 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.027251 then "[0.022232,0.027251)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.016447 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.022232 then "[0.016447,0.022232)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.016447 then "<0.016447"
		end as score_xgb
	from 
			(
				SELECT
     				a.session_id as session_id,
     				a.user_id as user_id,
     				a.t02_apply_time as apply_time,
     				b.out_credit_score as out_credit_score,
                    a.T06_JRTT_XGB_SCORE as T06_JRTT_XGB_SCORE
				FROM 
				(
					SELECT 
						session_id,
       					user_id,
       					t02_apply_time,
                        T06_JRTT_XGB_SCORE
					FROM ods.ods_toutiao_shouxin_input 
				) AS a

                left JOIN
                (
                    SELECT 
                        session_id,
                        user_id,
                        out_credit_score,
                        OUT_FRONT_RISK_DEGREE
                    FROM ods.ods_toutiao_shouxin_output
                ) AS b ON a.session_id = b.session_id
                

                
        ) as t  where from_unixtime(unix_timestamp(substr(t.apply_time,0,8),'yyyymmdd'),'yyyy-mm-dd') 
            between %s and %s  
) as a group by a.score_xgb



"""
# toutiao = spark.sql(hive_sql).toPandas()

In [29]:
def dairly_monitor(sql,flag,daydiff1=0,daydiff2=0):
    
    if(flag == 'today'):
        time1 = """date_add(CURRENT_DATE,-1)"""
        time2 = """date_add(CURRENT_DATE,-1)"""
        flag = 'today'
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)
    if(flag == 'week'):
        time1 = """date_add(CURRENT_DATE,-%d)"""%daydiff1
        time2 = """date_add(CURRENT_DATE,-%d)"""%daydiff2
        excute_sql = sql%(flag,flag,time1,time2)
        
    if(flag == 'month'):
        time1 = """date_add(add_months(CURRENT_DATE, -%d ),-%d)"""%(daydiff1,daydiff2)
        time2 = """date_add(CURRENT_DATE,-%d)"""%daydiff2
        excute_sql = sql%(flag,flag,time1,time2)
    #获取每个月的时间
    credit_apply = spark.sql(excute_sql).toPandas()
    return credit_apply

def monthly_monitor(sql,flag,monthdiff):
    
    if(flag == 'online'):
        time1 = """date_add(last_day(add_months('2019-01-14', -1 )),1)"""
        time2 = """last_day('2019-01-14')"""
        flag = 'online'
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)
    else:
        time1 = """date_add(last_day(add_months(CURRENT_DATE, -%d )),1)"""%(monthdiff+1)    #当月第一天：上月最后一天+1天
        time2 = """last_day(add_months(CURRENT_DATE,-%d))"""%monthdiff                  #当月最后一天
        
        date_sql = """select last_day(add_months(CURRENT_DATE,-%d)) as dt"""%monthdiff
        date =  hive_df(date_sql)['dt'].astype('str')
        flag = date[0][0:7].replace('-','_')
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)

    #获取每个月的时间
    credit_apply = spark.sql(excute_sql).toPandas()
    return credit_apply

In [30]:
dairly_apply_day = dairly_monitor(hive_sql,'today',0,0)
dairly_apply_week = dairly_monitor(hive_sql,'week',8,2)
dairly_apply_month = dairly_monitor(hive_sql,'month',1,2)


join_key = 'score_xgb'

apply_monitor_dairly = dairy_report.merge(dairly_apply_month,on = join_key ,how='left')\
                                  .merge(dairly_apply_week,on = join_key, how='left')\
                                  .merge(dairly_apply_day,on = join_key, how='left').fillna(0)

col_keep = list(apply_monitor_dairly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(apply_monitor_dairly[col])
    idx = len(dairy_report['toutiao_develop']) - 1
    apply_monitor_dairly[col][idx] = col_total
    apply_monitor_dairly[col+"_percent"] = apply_monitor_dairly[col].map(lambda x: x/(col_total+0.000001))
    apply_monitor_dairly[col+"_percent"][idx] = col_total 

    
apply_monitor_dairly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_xgb,toutiao_develop,apply_uid_month,apply_sessionId_month,apply_uid_week,apply_sessionId_week,apply_uid_today,apply_sessionId_today,apply_uid_month_percent,apply_sessionId_month_percent,apply_uid_week_percent,apply_sessionId_week_percent,apply_uid_today_percent,apply_sessionId_today_percent
0,>=0.093300,10.00%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[0.068954,0.093300)",10.00%,180.0,181.0,122.0,123.0,0.0,0.0,0.127479,0.128096,0.122244,0.123123,0.0,0.0
2,"[0.055221,0.068954)",10.00%,228.0,228.0,161.0,161.0,0.0,0.0,0.161473,0.161359,0.161323,0.161161,0.0,0.0
3,"[0.045881,0.055221)",10.00%,305.0,305.0,213.0,213.0,0.0,0.0,0.216006,0.215853,0.213427,0.213213,0.0,0.0
4,"[0.038707,0.045881)",10.00%,214.0,214.0,149.0,149.0,0.0,0.0,0.151558,0.151451,0.149299,0.149149,0.0,0.0
5,"[0.032669,0.038707)",10.00%,308.0,308.0,228.0,228.0,0.0,0.0,0.21813,0.217976,0.228457,0.228228,0.0,0.0
6,"[0.027251,0.032669)",10.00%,136.0,136.0,99.0,99.0,0.0,0.0,0.096317,0.096249,0.099198,0.099099,0.0,0.0
7,"[0.022232,0.027251)",10.00%,23.0,23.0,14.0,14.0,0.0,0.0,0.016289,0.016277,0.014028,0.014014,0.0,0.0
8,"[0.016447,0.022232)",10.00%,18.0,18.0,12.0,12.0,0.0,0.0,0.012748,0.012739,0.012024,0.012012,0.0,0.0
9,<0.016447,10.00%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
#上线月
apply_month_online = monthly_monitor(hive_sql,'online',-1)
#当月
apply_month_now = monthly_monitor(hive_sql,'not_online',0)
#上月
apply_month_last1 = monthly_monitor(hive_sql,'not_online',1)
#上两月
apply_month_last2 = monthly_monitor(hive_sql,'not_online',2)
#上3月
apply_month_last3 = monthly_monitor(hive_sql,'not_online',3)
#上4月
apply_month_last4 = monthly_monitor(hive_sql,'not_online',4)
#上5月
apply_month_last5 = monthly_monitor(hive_sql,'not_online',5)
#上6月
apply_month_last6 = monthly_monitor(hive_sql,'not_online',6)



join_key = 'score_xgb'

apply_monitor_monthly = dairy_report.merge(apply_month_online,on = join_key,how='left')\
                                  .merge(apply_month_last6,on = join_key,how='left')\
                                  .merge(apply_month_last5,on = join_key,how='left')\
                                  .merge(apply_month_last4,on = join_key,how='left')\
                                  .merge(apply_month_last3,on = join_key,how='left')\
                                  .merge(apply_month_last2,on = join_key,how='left')\
                                  .merge(apply_month_last1,on = join_key,how='left')\
                                  .merge(apply_month_now,on = join_key,how='left').fillna(0)

col_keep = list(apply_monitor_monthly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(apply_monitor_monthly[col])
    idx = len(dairy_report['toutiao_develop']) - 1    
    apply_monitor_monthly[col][idx] = col_total
    apply_monitor_monthly[col+"_percent"] = apply_monitor_monthly[col].map(lambda x: x/(col_total+0.000001))
    apply_monitor_monthly[col+"_percent"][idx] = col_total 

    
apply_monitor_monthly

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_xgb,toutiao_develop,apply_uid_online,apply_sessionId_online,apply_uid_2018_07,apply_sessionId_2018_07,apply_uid_2018_08,apply_sessionId_2018_08,apply_uid_2018_09,apply_sessionId_2018_09,...,apply_uid_2018_09_percent,apply_sessionId_2018_09_percent,apply_uid_2018_10_percent,apply_sessionId_2018_10_percent,apply_uid_2018_11_percent,apply_sessionId_2018_11_percent,apply_uid_2018_12_percent,apply_sessionId_2018_12_percent,apply_uid_2019_01_percent,apply_sessionId_2019_01_percent
0,>=0.093300,10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[0.068954,0.093300)",10.00%,180.0,181.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127479,0.128096
2,"[0.055221,0.068954)",10.00%,228.0,228.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161473,0.161359
3,"[0.045881,0.055221)",10.00%,305.0,305.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216006,0.215853
4,"[0.038707,0.045881)",10.00%,214.0,214.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151558,0.151451
5,"[0.032669,0.038707)",10.00%,308.0,308.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21813,0.217976
6,"[0.027251,0.032669)",10.00%,136.0,136.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096317,0.096249
7,"[0.022232,0.027251)",10.00%,23.0,23.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016289,0.016277
8,"[0.016447,0.022232)",10.00%,18.0,18.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012748,0.012739
9,<0.016447,10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
##### 授信相关日报和月报

In [33]:
shouxin_hive_sql = """


select
	score_xgb,
	count(distinct(user_id)) as apply_uid_%s,
    count(distinct(session_id)) as apply_sessionId_%s
from
(
	select
		user_id,
        session_id,
		case
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.093300 then ">==0.093300"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.068954 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.093300 then "[0.068954,0.093300)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.055221 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.068954 then "[0.055221,0.068954)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.045881 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.055221 then "[0.045881,0.055221)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.038707 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.045881 then "[0.038707,0.045881)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.032669 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.038707 then "[0.032669,0.038707)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.027251 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.032669 then "[0.027251,0.032669)"			
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.022232 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.027251 then "[0.022232,0.027251)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.016447 and cast(t.T06_JRTT_XGB_SCORE as double) < 0.022232 then "[0.016447,0.022232)"
			when cast(t.T06_JRTT_XGB_SCORE as double) >= 0.016447 then "<0.016447"
		end as score_xgb
	from 
			(
				SELECT
     				a.session_id as session_id,
     				a.user_id as user_id,
     				a.t02_apply_time as apply_time,
     				b.out_credit_score as out_credit_score,
                    a.T06_JRTT_XGB_SCORE as T06_JRTT_XGB_SCORE
				FROM 
				(
					SELECT 
						session_id,
       					user_id,
       					t02_apply_time,
                        T06_JRTT_XGB_SCORE
					FROM ods.ods_toutiao_shouxin_input 
				) AS a

                left JOIN
                (
                    SELECT 
                        session_id,
                        user_id,
                        out_credit_score,
                        OUT_FRONT_RISK_DEGREE
                    FROM ods.ods_toutiao_shouxin_output
                ) AS b ON a.session_id = b.session_id

                
                right join 
                (
                
                    select 
                        user_id
                    from ods_credit_credit_info
                
                ) as c on a.user_id = c.user_id
                
        ) as t  where from_unixtime(unix_timestamp(substr(t.apply_time,0,8),'yyyymmdd'),'yyyy-mm-dd') 
            between %s and %s   
) as a group by a.score_xgb



"""

# dairly_shouxin_day = dairly_monitor(shouxin_hive_sql,'today',0,0)


In [34]:
dairly_shouxin_day = dairly_monitor(shouxin_hive_sql,'today',0,0)
dairly_shouxin_week = dairly_monitor(shouxin_hive_sql,'week',8,2)
dairly_shouxin_month = dairly_monitor(shouxin_hive_sql,'month',1,2)


join_key = 'score_xgb'

shouxin_monitor_dairly = dairy_report.merge(dairly_shouxin_month,on = join_key ,how='left')\
                                  .merge(dairly_shouxin_week,on = join_key, how='left')\
                                  .merge(dairly_shouxin_day,on = join_key, how='left').fillna(0)

col_keep = list(shouxin_monitor_dairly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(shouxin_monitor_dairly[col])
    idx = len(dairy_report['toutiao_develop']) - 1
    shouxin_monitor_dairly[col][idx] = col_total
    shouxin_monitor_dairly[col+"_percent"] = shouxin_monitor_dairly[col].map(lambda x: x/(col_total+0.000001))
    shouxin_monitor_dairly[col+"_percent"][idx] = col_total 

    
shouxin_monitor_dairly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_xgb,toutiao_develop,apply_uid_month,apply_sessionId_month,apply_uid_week,apply_sessionId_week,apply_uid_today,apply_sessionId_today,apply_uid_month_percent,apply_sessionId_month_percent,apply_uid_week_percent,apply_sessionId_week_percent,apply_uid_today_percent,apply_sessionId_today_percent
0,>=0.093300,10.00%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[0.068954,0.093300)",10.00%,28.0,28.0,22.0,22.0,0.0,0.0,0.132075,0.132075,0.140127,0.140127,0.0,0.0
2,"[0.055221,0.068954)",10.00%,30.0,30.0,24.0,24.0,0.0,0.0,0.141509,0.141509,0.152866,0.152866,0.0,0.0
3,"[0.045881,0.055221)",10.00%,46.0,46.0,33.0,33.0,0.0,0.0,0.216981,0.216981,0.210191,0.210191,0.0,0.0
4,"[0.038707,0.045881)",10.00%,39.0,39.0,27.0,27.0,0.0,0.0,0.183962,0.183962,0.171975,0.171975,0.0,0.0
5,"[0.032669,0.038707)",10.00%,38.0,38.0,30.0,30.0,0.0,0.0,0.179245,0.179245,0.191083,0.191083,0.0,0.0
6,"[0.027251,0.032669)",10.00%,22.0,22.0,16.0,16.0,0.0,0.0,0.103774,0.103774,0.101911,0.101911,0.0,0.0
7,"[0.022232,0.027251)",10.00%,7.0,7.0,5.0,5.0,0.0,0.0,0.033019,0.033019,0.031847,0.031847,0.0,0.0
8,"[0.016447,0.022232)",10.00%,2.0,2.0,0.0,0.0,0.0,0.0,0.009434,0.009434,0.0,0.0,0.0,0.0
9,<0.016447,10.00%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
#上线月
shouxin_month_online = monthly_monitor(hive_sql,'online',-1)
#当月
shouxin_month_now = monthly_monitor(hive_sql,'not_online',0)
#上月
shouxin_month_last1 = monthly_monitor(hive_sql,'not_online',1)
#上两月
shouxin_month_last2 = monthly_monitor(hive_sql,'not_online',2)
#上3月
shouxin_month_last3 = monthly_monitor(hive_sql,'not_online',3)
#上4月
shouxin_month_last4 = monthly_monitor(hive_sql,'not_online',4)
#上5月
shouxin_month_last5 = monthly_monitor(hive_sql,'not_online',5)
#上6月
shouxin_month_last6 = monthly_monitor(hive_sql,'not_online',6)



join_key = 'score_xgb'

shouxin_monitor_monthly = dairy_report.merge(shouxin_month_online,on = join_key,how='left')\
                                  .merge(shouxin_month_last6,on = join_key,how='left')\
                                  .merge(shouxin_month_last5,on = join_key,how='left')\
                                  .merge(shouxin_month_last4,on = join_key,how='left')\
                                  .merge(shouxin_month_last3,on = join_key,how='left')\
                                  .merge(shouxin_month_last2,on = join_key,how='left')\
                                  .merge(shouxin_month_last1,on = join_key,how='left')\
                                  .merge(shouxin_month_now,on = join_key,how='left').fillna(0)

col_keep = list(shouxin_monitor_monthly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(shouxin_monitor_monthly[col])
    idx = len(dairy_report['toutiao_develop']) - 1
    shouxin_monitor_monthly[col][idx] = col_total
    shouxin_monitor_monthly[col+"_percent"] = shouxin_monitor_monthly[col].map(lambda x: x/(col_total+0.000001))
    shouxin_monitor_monthly[col+"_percent"][idx] = col_total 

    
shouxin_monitor_monthly

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_xgb,toutiao_develop,apply_uid_online,apply_sessionId_online,apply_uid_2018_07,apply_sessionId_2018_07,apply_uid_2018_08,apply_sessionId_2018_08,apply_uid_2018_09,apply_sessionId_2018_09,...,apply_uid_2018_09_percent,apply_sessionId_2018_09_percent,apply_uid_2018_10_percent,apply_sessionId_2018_10_percent,apply_uid_2018_11_percent,apply_sessionId_2018_11_percent,apply_uid_2018_12_percent,apply_sessionId_2018_12_percent,apply_uid_2019_01_percent,apply_sessionId_2019_01_percent
0,>=0.093300,10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[0.068954,0.093300)",10.00%,180.0,181.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127479,0.128096
2,"[0.055221,0.068954)",10.00%,228.0,228.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161473,0.161359
3,"[0.045881,0.055221)",10.00%,305.0,305.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216006,0.215853
4,"[0.038707,0.045881)",10.00%,214.0,214.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151558,0.151451
5,"[0.032669,0.038707)",10.00%,308.0,308.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21813,0.217976
6,"[0.027251,0.032669)",10.00%,136.0,136.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096317,0.096249
7,"[0.022232,0.027251)",10.00%,23.0,23.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016289,0.016277
8,"[0.016447,0.022232)",10.00%,18.0,18.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012748,0.012739
9,<0.016447,10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
###########################################
#### 有授信额度相关客户级和订单级汇总监控报表---月报

In [37]:
#### 申请表通过率

In [40]:
accept_rate_dairly = shouxin_monitor_dairly[['score_xgb']]
idx = len(dairy_report['toutiao_develop']) - 1

for i in range(len(shouxin_monitor_dairly.columns)):
    if(i>=2 and i <=7):
        shouxin_col = shouxin_monitor_dairly.columns[i]
        apply_col = apply_monitor_dairly.columns[i]
#         print(shouxin_col,apply_col)

        accept_col = shouxin_col.split("_")[1]+"_"+shouxin_col.split("_")[2]
        accept_rate_dairly[accept_col+"_rate"] = shouxin_monitor_dairly[shouxin_col]/(apply_monitor_dairly[apply_col])
        accept_rate_dairly[accept_col+"_rate"][idx] = apply_monitor_dairly[apply_col][idx]

accept_rate_dairly = accept_rate_dairly.fillna(0)
accept_rate_dairly

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,score_xgb,uid_month_rate,sessionId_month_rate,uid_week_rate,sessionId_week_rate,uid_today_rate,sessionId_today_rate
0,>=0.093300,0.0,0.0,0.0,0.0,0.0,0.0
1,"[0.068954,0.093300)",0.155556,0.154696,0.180328,0.178862,0.0,0.0
2,"[0.055221,0.068954)",0.131579,0.131579,0.149068,0.149068,0.0,0.0
3,"[0.045881,0.055221)",0.15082,0.15082,0.15493,0.15493,0.0,0.0
4,"[0.038707,0.045881)",0.182243,0.182243,0.181208,0.181208,0.0,0.0
5,"[0.032669,0.038707)",0.123377,0.123377,0.131579,0.131579,0.0,0.0
6,"[0.027251,0.032669)",0.161765,0.161765,0.161616,0.161616,0.0,0.0
7,"[0.022232,0.027251)",0.304348,0.304348,0.357143,0.357143,0.0,0.0
8,"[0.016447,0.022232)",0.111111,0.111111,0.0,0.0,0.0,0.0
9,<0.016447,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
accept_rate_monthly = shouxin_monitor_monthly[['score_xgb']]
idx = len(dairy_report['toutiao_develop']) - 1

for i in range(len(shouxin_monitor_monthly.columns)):
    if(i>=2 and i <=16):
        shouxin_col = shouxin_monitor_monthly.columns[i]
        apply_col = apply_monitor_monthly.columns[i]
#       print(shouxin_col,apply_col)

        accept_col = shouxin_col[6:]
        accept_rate_monthly[accept_col+"_rate"] = shouxin_monitor_monthly[shouxin_col]/(apply_monitor_monthly[apply_col])
        accept_rate_monthly[accept_col+"_rate"][idx] = apply_monitor_monthly[apply_col][idx]

accept_rate_monthly = accept_rate_monthly.fillna("0")
accept_rate_monthly


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,score_xgb,uid_online_rate,sessionId_online_rate,uid_2018_07_rate,sessionId_2018_07_rate,uid_2018_08_rate,sessionId_2018_08_rate,uid_2018_09_rate,sessionId_2018_09_rate,uid_2018_10_rate,sessionId_2018_10_rate,uid_2018_11_rate,sessionId_2018_11_rate,uid_2018_12_rate,sessionId_2018_12_rate,uid_2019_01_rate
0,>=0.093300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"[0.068954,0.093300)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
2,"[0.055221,0.068954)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,"[0.045881,0.055221)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,"[0.038707,0.045881)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
5,"[0.032669,0.038707)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
6,"[0.027251,0.032669)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
7,"[0.022232,0.027251)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
8,"[0.016447,0.022232)",1,1,0,0,0,0,0,0,0,0,0,0,0,0,1
9,<0.016447,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
