In [36]:
import pandas as pd
import pymysql
from pyhive import hive
import pandas as pd
from pyspark import SparkContext,SQLContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import datetime



In [37]:
spark=SparkSession \
        .builder \
        .config("spark.eventLog.enabled", "false") \
        .config("spark.executor.memory", "4g")\
        .config("spark.driver.memory", "8g")\
        .config("spark.cores.max", "10")\
        .config("spark.task.maxFailures", "1000")\
        .config("spark.default.parallelism", "500")\
        .config("spark.sql.shuffle.partitions",50)\
        .appName('renhang_etl') \
        .master('yarn')\
        .getOrCreate()

In [38]:
class hiveParse(object):
    """
    该方法用于读取hive 数据，并转化成pandas dataframe
    """
    def __init__(self, host='', port=10000, user='', passwd='', dbName=''):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName

    def conn_hive(self):
        self.connection = hive.Connection(host=self.host,port=self.port,username=self.user)

    def hive_connect(self,sql_select):
        cur = self.connection.cursor()
        cur.execute(sql_select)
        columns = [col[0] for col in cur.description]
        data = cur.fetchall()
        query_df = pd.DataFrame(data)
        query_df.columns = columns
        return query_df

class mysqldbParse(object):
    """
    该方法用于连接mysql，并执行相应操作
    """

    def __init__(self, host='', port=3306, user='', passwd='', dbName='', charset='utf8', connect_timeout=31536000):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName
        self.charset = charset
        self.connect_timeout = connect_timeout

    def conn_mysql(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.passwd,
                                    db=self.dbName,
                                    charset=self.charset,
                                    connect_timeout=self.connect_timeout,
                                    cursorclass=pymysql.cursors.DictCursor)

    def insert_sql(self, insertSql):
        with self.conn.cursor() as cursor:
            cursor.execute(insertSql)
            self.conn.commit()

    def select_Sql(self, selectSql):
        df = pd.read_sql(selectSql, self.conn)
        return df

    def close(self):
        self.conn.close()

In [39]:
def hive_df(sql):
    host = "58.59.18.61"
    port = 10000
    user = "hdfs"
    dbName = "ods"
    hive_parse = hiveParse(host,port,user,dbName)
    hive_parse.conn_hive()
    hive_pd = hive_parse.hive_connect(sql)
    return hive_pd

def mysql_df(selectSql):
    host = "58.59.11.86"
    port = 3306
    user = "tangdinghai"
    passwd = "Tangdinghai2019!"
    dbName = "reportpublic"
    mysql_parse = mysqldbParse(host,port,user,passwd,dbName)
    mysql_parse.conn_mysql()
    mysql_pd = mysql_parse.select_Sql(selectSql)
    mysql_parse.close()
    return mysql_pd



In [40]:
#### 头条分发模型：有授信额度相关客户级和订单级汇总监控报表---日报

In [41]:
base_dict = {'score_logit':['<=593','(593,604]','(604,611]','(611,617]','(617,623]','(623,626]','(626,631]','(631,633]','(633,639]','(>639]','汇总'],\
             'toutiao_develop':['10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','10.00%','520']}
dairy_report = pd.DataFrame(base_dict)
# dairy_report['product_no'] = "PN00000053"
# dairy_report['product_name'] = '今日头条放心借'

# dairy_report = dairy_report[['product_name','product_no','score_logit','toutiao_develop']]
# dairy_report


In [42]:
table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_credit_credit_info") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_credit_credit_info")

table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_credit_credit_apply") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_credit_credit_apply")

table1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:mysql://58.59.11.86:3306") \
    .option("dbtable", "reportpublic.ods_loan_invoice") \
    .option("user", "tangdinghai") \
    .option("password", "Tangdinghai2019!") \
    .load()
table1.createOrReplaceTempView("ods_loan_invoice")

In [43]:
hive_sql = """


select
	score_logit,
	count(distinct(user_id)) as apply_uid_%s,
    count(distinct(session_id)) as apply_sessionId_%s
from
(
	select
		user_id,
        session_id,
		case
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 593 then "<=593"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 593 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 604 then "(593,604]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 604 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 611 then "(604,611]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 611 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 617 then "(611,617]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 617 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 623 then "(617,623]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 623 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 626 then "(623,626]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 626 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 631 then "(626,631]"			
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 631 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 633 then "(631,633]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 633 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 639 then "(633,639]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 639 then ">639"
		end as score_logit
	from 
			(
				SELECT
     				a.session_id as session_id,
     				a.user_id as user_id,
     				a.t02_apply_time as apply_time,
     				b.out_credit_score as out_credit_score,
                    a.T06_JRTT_LOGISTIC_REGRESSION_SCORE as T06_JRTT_LOGISTIC_REGRESSION_SCORE
				FROM 
				(
					SELECT 
						session_id,
       					user_id,
       					t02_apply_time,
                        T06_JRTT_LOGISTIC_REGRESSION_SCORE
					FROM ods.ods_toutiao_shouxin_input 
				) AS a

                left JOIN
                (
                    SELECT 
                        session_id,
                        user_id,
                        out_credit_score,
                        OUT_FRONT_RISK_DEGREE
                    FROM ods.ods_toutiao_shouxin_output
                ) AS b ON a.session_id = b.session_id
                

                
        ) as t  where from_unixtime(unix_timestamp(substr(t.apply_time,0,8),'yyyymmdd'),'yyyy-mm-dd') 
            between %s and %s  
) as a group by a.score_logit



"""
# toutiao = spark.sql(hive_sql).toPandas()

In [44]:
def dairly_monitor(sql,flag,daydiff1=0,daydiff2=0):
    
    if(flag == 'today'):
        time1 = """date_add(CURRENT_DATE,-1)"""
        time2 = """date_add(CURRENT_DATE,-1)"""
        flag = 'today'
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)
    if(flag == 'week'):
        time1 = """date_add(CURRENT_DATE,-%d)"""%daydiff1
        time2 = """date_add(CURRENT_DATE,-%d)"""%daydiff2
        excute_sql = sql%(flag,flag,time1,time2)
        
    if(flag == 'month'):
        time1 = """date_add(add_months(CURRENT_DATE, -%d ),-%d)"""%(daydiff1,daydiff2)
        time2 = """date_add(CURRENT_DATE,-%d)"""%daydiff2
        excute_sql = sql%(flag,flag,time1,time2)
    #获取每个月的时间
    credit_apply = spark.sql(excute_sql).toPandas()
    return credit_apply

def monthly_monitor(sql,flag,monthdiff):
    
    if(flag == 'online'):
        time1 = """date_add(last_day(add_months('2019-01-14', -1 )),1)"""
        time2 = """last_day('2019-01-14')"""
        flag = 'online'
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)
    else:
        time1 = """date_add(last_day(add_months(CURRENT_DATE, -%d )),1)"""%(monthdiff+1)    #当月第一天：上月最后一天+1天
        time2 = """last_day(add_months(CURRENT_DATE,-%d))"""%monthdiff                  #当月最后一天
        
        date_sql = """select last_day(add_months(CURRENT_DATE,-%d)) as dt"""%monthdiff
        date =  hive_df(date_sql)['dt'].astype('str')
        flag = date[0][0:7].replace('-','_')
        excute_sql = sql%(flag,flag,time1,time2)
#         print(excute_sql)

    #获取每个月的时间
    credit_apply = spark.sql(excute_sql).toPandas()
    return credit_apply

In [45]:
dairly_apply_day = dairly_monitor(hive_sql,'today',0,0)
dairly_apply_week = dairly_monitor(hive_sql,'week',8,2)
dairly_apply_month = dairly_monitor(hive_sql,'month',1,2)


join_key = 'score_logit'

apply_monitor_dairly = dairy_report.merge(dairly_apply_month,on = join_key ,how='left')\
                                  .merge(dairly_apply_week,on = join_key, how='left')\
                                  .merge(dairly_apply_day,on = join_key, how='left').fillna(0)

col_keep = list(apply_monitor_dairly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(apply_monitor_dairly[col])
    idx = len(dairy_report['toutiao_develop']) - 1
    apply_monitor_dairly[col][idx] = col_total
    apply_monitor_dairly[col+"_percent"] = apply_monitor_dairly[col].map(lambda x: x/(col_total+0.000001))
    apply_monitor_dairly[col+"_percent"][idx] = col_total 

    
apply_monitor_dairly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_logit,toutiao_develop,apply_uid_month,apply_sessionId_month,apply_uid_week,apply_sessionId_week,apply_uid_today,apply_sessionId_today,apply_uid_month_percent,apply_sessionId_month_percent,apply_uid_week_percent,apply_sessionId_week_percent,apply_uid_today_percent,apply_sessionId_today_percent
0,<=593,10.00%,113.0,113.0,89.0,89.0,13.0,13.0,0.094324,0.094245,0.119624,0.119624,1.0,1.0
1,"(593,604]",10.00%,96.0,96.0,62.0,62.0,0.0,0.0,0.080134,0.080067,0.083333,0.083333,0.0,0.0
2,"(604,611]",10.00%,94.0,94.0,57.0,57.0,0.0,0.0,0.078464,0.078399,0.076613,0.076613,0.0,0.0
3,"(611,617]",10.00%,123.0,123.0,71.0,71.0,0.0,0.0,0.102671,0.102585,0.09543,0.09543,0.0,0.0
4,"(617,623]",10.00%,118.0,118.0,66.0,66.0,0.0,0.0,0.098497,0.098415,0.08871,0.08871,0.0,0.0
5,"(623,626]",10.00%,184.0,184.0,119.0,119.0,0.0,0.0,0.153589,0.153461,0.159946,0.159946,0.0,0.0
6,"(626,631]",10.00%,151.0,151.0,78.0,78.0,0.0,0.0,0.126043,0.125938,0.104839,0.104839,0.0,0.0
7,"(631,633]",10.00%,50.0,50.0,27.0,27.0,0.0,0.0,0.041736,0.041701,0.03629,0.03629,0.0,0.0
8,"(633,639]",10.00%,269.0,270.0,175.0,175.0,0.0,0.0,0.224541,0.225188,0.235215,0.235215,0.0,0.0
9,(>639],10.00%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
#上线月
apply_month_online = monthly_monitor(hive_sql,'online',-1)
#当月
apply_month_now = monthly_monitor(hive_sql,'not_online',0)
#上月
apply_month_last1 = monthly_monitor(hive_sql,'not_online',1)
#上两月
apply_month_last2 = monthly_monitor(hive_sql,'not_online',2)
#上3月
apply_month_last3 = monthly_monitor(hive_sql,'not_online',3)
#上4月
apply_month_last4 = monthly_monitor(hive_sql,'not_online',4)
#上5月
apply_month_last5 = monthly_monitor(hive_sql,'not_online',5)
#上6月
apply_month_last6 = monthly_monitor(hive_sql,'not_online',6)



join_key = 'score_logit'

apply_monitor_monthly = dairy_report.merge(apply_month_online,on = join_key,how='left')\
                                  .merge(apply_month_last6,on = join_key,how='left')\
                                  .merge(apply_month_last5,on = join_key,how='left')\
                                  .merge(apply_month_last4,on = join_key,how='left')\
                                  .merge(apply_month_last3,on = join_key,how='left')\
                                  .merge(apply_month_last2,on = join_key,how='left')\
                                  .merge(apply_month_last1,on = join_key,how='left')\
                                  .merge(apply_month_now,on = join_key,how='left').fillna(0)

col_keep = list(apply_monitor_monthly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(apply_monitor_monthly[col])
    apply_monitor_monthly[col][6] = col_total
    apply_monitor_monthly[col+"_percent"] = apply_monitor_monthly[col].map(lambda x: x/(col_total+0.000001))
    apply_monitor_monthly[col+"_percent"][6] = col_total 

    
apply_monitor_monthly

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_logit,toutiao_develop,apply_uid_online,apply_sessionId_online,apply_uid_2018_07,apply_sessionId_2018_07,apply_uid_2018_08,apply_sessionId_2018_08,apply_uid_2018_09,apply_sessionId_2018_09,...,apply_uid_2018_09_percent,apply_sessionId_2018_09_percent,apply_uid_2018_10_percent,apply_sessionId_2018_10_percent,apply_uid_2018_11_percent,apply_sessionId_2018_11_percent,apply_uid_2018_12_percent,apply_sessionId_2018_12_percent,apply_uid_2019_01_percent,apply_sessionId_2019_01_percent
0,<=593,10.00%,126.0,126.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104046,0.10396
1,"(593,604]",10.00%,96.0,96.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079273,0.079208
2,"(604,611]",10.00%,94.0,94.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077622,0.077558
3,"(611,617]",10.00%,123.0,123.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101569,0.101485
4,"(617,623]",10.00%,118.0,118.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09744,0.09736
5,"(623,626]",10.00%,184.0,184.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151941,0.151815
6,"(626,631]",10.00%,1211.0,1212.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1211.0,1212.0
7,"(631,633]",10.00%,50.0,50.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041288,0.041254
8,"(633,639]",10.00%,269.0,270.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22213,0.222772
9,(>639],10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
##### 授信相关日报和月报

In [48]:
shouxin_hive_sql = """


select
	score_logit,
	count(distinct(user_id)) as apply_uid_%s,
    count(distinct(session_id)) as apply_sessionId_%s
from
(
	select
		user_id,
        session_id,
		case
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 593 then "<=593"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 593 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 604 then "(593,604]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 604 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 611 then "(604,611]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 611 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 617 then "(611,617]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 617 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 623 then "(617,623]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 623 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 626 then "(623,626]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 626 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 631 then "(626,631]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 631 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 633 then "(631,633]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 633 and cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) <= 639 then "(633,639]"
			when cast(t.T06_JRTT_LOGISTIC_REGRESSION_SCORE as int) > 639 then ">639"
		end as score_logit
	from 
			(
				SELECT
     				a.session_id as session_id,
     				a.user_id as user_id,
     				a.t02_apply_time as apply_time,
     				b.out_credit_score as out_credit_score,
                    a.T06_JRTT_LOGISTIC_REGRESSION_SCORE as T06_JRTT_LOGISTIC_REGRESSION_SCORE
				FROM 
				(
					SELECT 
						session_id,
       					user_id,
       					t02_apply_time,
                        T06_JRTT_LOGISTIC_REGRESSION_SCORE
					FROM ods.ods_toutiao_shouxin_input 
				) AS a

                left JOIN
                (
                    SELECT 
                        session_id,
                        user_id,
                        out_credit_score,
                        OUT_FRONT_RISK_DEGREE
                    FROM ods.ods_toutiao_shouxin_output
                ) AS b ON a.session_id = b.session_id

                
                right join 
                (
                
                    select 
                        user_id,
                        APPLY_STATUS
                    from ods_credit_credit_apply
                    where product_no like 'PN00000053%%' and APPLY_STATUS = 12
                
                ) as c on a.user_id = c.user_id
                
        ) as t  where from_unixtime(unix_timestamp(substr(t.apply_time,0,8),'yyyymmdd'),'yyyy-mm-dd') 
            between %s and %s   
) as a group by a.score_logit



"""

# dairly_shouxin_day = dairly_monitor(shouxin_hive_sql,'today',0,0)


In [49]:
dairly_shouxin_day = dairly_monitor(shouxin_hive_sql,'today',0,0)
dairly_shouxin_week = dairly_monitor(shouxin_hive_sql,'week',8,2)
dairly_shouxin_month = dairly_monitor(shouxin_hive_sql,'month',1,2)


join_key = 'score_logit'

shouxin_monitor_dairly = dairy_report.merge(dairly_shouxin_month,on = join_key ,how='left')\
                                  .merge(dairly_shouxin_week,on = join_key, how='left')\
                                  .merge(dairly_shouxin_day,on = join_key, how='left').fillna(0)

col_keep = list(shouxin_monitor_dairly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(shouxin_monitor_dairly[col])
    idx = len(dairy_report['toutiao_develop']) - 1

    shouxin_monitor_dairly[col][idx] = col_total
    shouxin_monitor_dairly[col+"_percent"] = shouxin_monitor_dairly[col].map(lambda x: x/(col_total+0.000001))
    shouxin_monitor_dairly[col+"_percent"][idx] = col_total 

    
shouxin_monitor_dairly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_logit,toutiao_develop,apply_uid_month,apply_sessionId_month,apply_uid_week,apply_sessionId_week,apply_uid_today,apply_sessionId_today,apply_uid_month_percent,apply_sessionId_month_percent,apply_uid_week_percent,apply_sessionId_week_percent,apply_uid_today_percent,apply_sessionId_today_percent
0,<=593,10.00%,4.0,4.0,3.0,3.0,0,0,0.036364,0.036364,0.041096,0.041096,0.0,0.0
1,"(593,604]",10.00%,8.0,8.0,6.0,6.0,0,0,0.072727,0.072727,0.082192,0.082192,0.0,0.0
2,"(604,611]",10.00%,9.0,9.0,6.0,6.0,0,0,0.081818,0.081818,0.082192,0.082192,0.0,0.0
3,"(611,617]",10.00%,6.0,6.0,4.0,4.0,0,0,0.054545,0.054545,0.054795,0.054795,0.0,0.0
4,"(617,623]",10.00%,12.0,12.0,8.0,8.0,0,0,0.109091,0.109091,0.109589,0.109589,0.0,0.0
5,"(623,626]",10.00%,15.0,15.0,7.0,7.0,0,0,0.136364,0.136364,0.09589,0.09589,0.0,0.0
6,"(626,631]",10.00%,17.0,17.0,11.0,11.0,0,0,0.154545,0.154545,0.150685,0.150685,0.0,0.0
7,"(631,633]",10.00%,4.0,4.0,3.0,3.0,0,0,0.036364,0.036364,0.041096,0.041096,0.0,0.0
8,"(633,639]",10.00%,35.0,35.0,25.0,25.0,0,0,0.318182,0.318182,0.342466,0.342466,0.0,0.0
9,(>639],10.00%,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
#上线月
shouxin_month_online = monthly_monitor(hive_sql,'online',-1)
#当月
shouxin_month_now = monthly_monitor(hive_sql,'not_online',0)
#上月
shouxin_month_last1 = monthly_monitor(hive_sql,'not_online',1)
#上两月
shouxin_month_last2 = monthly_monitor(hive_sql,'not_online',2)
#上3月
shouxin_month_last3 = monthly_monitor(hive_sql,'not_online',3)
#上4月
shouxin_month_last4 = monthly_monitor(hive_sql,'not_online',4)
#上5月
shouxin_month_last5 = monthly_monitor(hive_sql,'not_online',5)
#上6月
shouxin_month_last6 = monthly_monitor(hive_sql,'not_online',6)



join_key = 'score_logit'

shouxin_monitor_monthly = dairy_report.merge(shouxin_month_online,on = join_key,how='left')\
                                  .merge(shouxin_month_last6,on = join_key,how='left')\
                                  .merge(shouxin_month_last5,on = join_key,how='left')\
                                  .merge(shouxin_month_last4,on = join_key,how='left')\
                                  .merge(shouxin_month_last3,on = join_key,how='left')\
                                  .merge(shouxin_month_last2,on = join_key,how='left')\
                                  .merge(shouxin_month_last1,on = join_key,how='left')\
                                  .merge(shouxin_month_now,on = join_key,how='left').fillna(0)

col_keep = list(shouxin_monitor_monthly.columns)
col_keep.remove(join_key)
col_keep.remove('toutiao_develop')

for col in col_keep:
    col_total = sum(shouxin_monitor_monthly[col])
    idx = len(dairy_report['toutiao_develop']) - 1
    shouxin_monitor_monthly[col][idx] = col_total
    shouxin_monitor_monthly[col+"_percent"] = shouxin_monitor_monthly[col].map(lambda x: x/(col_total+0.000001))
    shouxin_monitor_monthly[col+"_percent"][idx] = col_total 

    
shouxin_monitor_monthly

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,score_logit,toutiao_develop,apply_uid_online,apply_sessionId_online,apply_uid_2018_07,apply_sessionId_2018_07,apply_uid_2018_08,apply_sessionId_2018_08,apply_uid_2018_09,apply_sessionId_2018_09,...,apply_uid_2018_09_percent,apply_sessionId_2018_09_percent,apply_uid_2018_10_percent,apply_sessionId_2018_10_percent,apply_uid_2018_11_percent,apply_sessionId_2018_11_percent,apply_uid_2018_12_percent,apply_sessionId_2018_12_percent,apply_uid_2019_01_percent,apply_sessionId_2019_01_percent
0,<=593,10.00%,126.0,126.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104046,0.10396
1,"(593,604]",10.00%,96.0,96.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079273,0.079208
2,"(604,611]",10.00%,94.0,94.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077622,0.077558
3,"(611,617]",10.00%,123.0,123.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101569,0.101485
4,"(617,623]",10.00%,118.0,118.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09744,0.09736
5,"(623,626]",10.00%,184.0,184.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151941,0.151815
6,"(626,631]",10.00%,151.0,151.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12469,0.124587
7,"(631,633]",10.00%,50.0,50.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041288,0.041254
8,"(633,639]",10.00%,269.0,270.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22213,0.222772
9,(>639],10.00%,0.0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
###########################################
#### 有授信额度相关客户级和订单级汇总监控报表---月报

In [52]:
#### 申请表通过率

In [53]:
accept_rate_dairly = shouxin_monitor_dairly[['score_logit']]
idx = len(dairy_report['toutiao_develop']) - 1

for i in range(len(shouxin_monitor_dairly.columns)):
    if(i>=2 and i <=7):
        shouxin_col = shouxin_monitor_dairly.columns[i]
        apply_col = apply_monitor_dairly.columns[i]
#         print(shouxin_col,apply_col)

        accept_col = shouxin_col.split("_")[1]+"_"+shouxin_col.split("_")[2]
        accept_rate_dairly[accept_col+"_rate"] = shouxin_monitor_dairly[shouxin_col]/(apply_monitor_dairly[apply_col])
        accept_rate_dairly[accept_col+"_rate"][idx] = apply_monitor_dairly[apply_col][idx]

accept_rate_dairly = accept_rate_dairly.fillna(0)
accept_rate_dairly

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,score_logit,uid_month_rate,sessionId_month_rate,uid_week_rate,sessionId_week_rate,uid_today_rate,sessionId_today_rate
0,<=593,0.035398,0.035398,0.033708,0.033708,0.0,0.0
1,"(593,604]",0.083333,0.083333,0.096774,0.096774,0.0,0.0
2,"(604,611]",0.095745,0.095745,0.105263,0.105263,0.0,0.0
3,"(611,617]",0.04878,0.04878,0.056338,0.056338,0.0,0.0
4,"(617,623]",0.101695,0.101695,0.121212,0.121212,0.0,0.0
5,"(623,626]",0.081522,0.081522,0.058824,0.058824,0.0,0.0
6,"(626,631]",0.112583,0.112583,0.141026,0.141026,0.0,0.0
7,"(631,633]",0.08,0.08,0.111111,0.111111,0.0,0.0
8,"(633,639]",0.130112,0.12963,0.142857,0.142857,0.0,0.0
9,(>639],0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
accept_rate_monthly = shouxin_monitor_monthly[['score_logit']]
idx = len(dairy_report['toutiao_develop']) - 1


for i in range(len(shouxin_monitor_monthly.columns)):
    if(i>=2 and i <=16):
        shouxin_col = shouxin_monitor_monthly.columns[i]
        apply_col = apply_monitor_monthly.columns[i]
#       print(shouxin_col,apply_col)

        accept_col = shouxin_col[6:]
        accept_rate_monthly[accept_col+"_rate"] = shouxin_monitor_monthly[shouxin_col]/(apply_monitor_monthly[apply_col])
        accept_rate_monthly[accept_col+"_rate"][idx] = apply_monitor_monthly[apply_col][idx]

accept_rate_monthly = accept_rate_monthly.fillna("0")
accept_rate_monthly


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,score_logit,uid_online_rate,sessionId_online_rate,uid_2018_07_rate,sessionId_2018_07_rate,uid_2018_08_rate,sessionId_2018_08_rate,uid_2018_09_rate,sessionId_2018_09_rate,uid_2018_10_rate,sessionId_2018_10_rate,uid_2018_11_rate,sessionId_2018_11_rate,uid_2018_12_rate,sessionId_2018_12_rate,uid_2019_01_rate
0,<=593,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
1,"(593,604]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
2,"(604,611]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
3,"(611,617]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
4,"(617,623]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
5,"(623,626]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
6,"(626,631]",0.12469,0.124587,0,0,0,0,0,0,0,0,0,0,0,0,0.12469
7,"(631,633]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
8,"(633,639]",1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,1.0
9,(>639],0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
