In [276]:
import pandas as pd
import pymysql
from pyhive import hive
import pandas as pd
from pyspark import SparkContext,SQLContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
import datetime

In [277]:
spark=SparkSession \
        .builder \
        .config("spark.eventLog.enabled", "false") \
        .config("spark.executor.memory", "4g")\
        .config("spark.driver.memory", "8g")\
        .config("spark.cores.max", "10")\
        .config("spark.task.maxFailures", "1000")\
        .config("spark.default.parallelism", "500")\
        .config("spark.sql.shuffle.partitions",50)\
        .appName('renhang_etl') \
        .master('yarn')\
        .getOrCreate()

In [278]:
class hiveParse(object):
    """
    该方法用于读取hive 数据，并转化成pandas dataframe
    """
    def __init__(self, host='', port=10000, user='', passwd='', dbName=''):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName

    def conn_hive(self):
        self.connection = hive.Connection(host=self.host,port=self.port,username=self.user)

    def hive_connect(self,sql_select):
        cur = self.connection.cursor()
        cur.execute(sql_select)
        columns = [col[0] for col in cur.description]
        data = cur.fetchall()
        query_df = pd.DataFrame(data)
        query_df.columns = columns
        return query_df

class mysqldbParse(object):
    """
    该方法用于连接mysql，并执行相应操作
    """

    def __init__(self, host='', port=3306, user='', passwd='', dbName='', charset='utf8', connect_timeout=31536000):
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.dbName = dbName
        self.charset = charset
        self.connect_timeout = connect_timeout

    def conn_mysql(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.passwd,
                                    db=self.dbName,
                                    charset=self.charset,
                                    connect_timeout=self.connect_timeout,
                                    cursorclass=pymysql.cursors.DictCursor)

    def insert_sql(self, insertSql):
        with self.conn.cursor() as cursor:
            cursor.execute(insertSql)
            self.conn.commit()

    def select_Sql(self, selectSql):
        df = pd.read_sql(selectSql, self.conn)
        return df

    def close(self):
        self.conn.close()

In [279]:
def hive_df(sql):
    host = "58.59.18.61"
    port = 10000
    user = "hdfs"
    dbName = "ods"
    hive_parse = hiveParse(host,port,user,dbName)
    hive_parse.conn_hive()
    hive_pd = hive_parse.hive_connect(sql)
    return hive_pd

def mysql_df(selectSql):
    host = "58.59.11.86"
    port = 3306
    user = "dt"
    passwd = "Usd&212%wePO2"
    dbName = "reportpublic"
    mysql_parse = mysqldbParse(host,port,user,passwd,dbName)
    mysql_parse.conn_mysql()
    mysql_pd = mysql_parse.select_Sql(selectSql)
    mysql_parse.close()
    return mysql_pd



In [280]:
sql_day =  """

select
	score_flag as score_interval,
	count(distinct(user_id)) as apply_num_day,
    date(ADDDATE(now(),-1) ) as applytime
from
(
	select
		user_id,
        create_time,
		case
			when (t.score ) > 680 then ">680"
			when (t.score ) > 650 and (t.score ) <= 680 then "(650,680]"
			when (t.score ) > 630 and (t.score ) <= 650 then "(630,650]"
			when (t.score ) > 610 and (t.score ) <= 630 then "(610,630]"
			when (t.score ) > 595 and (t.score ) <= 610 then "(595,610]"
			when (t.score ) <= 595 then "<=595"
		end as score_flag
	from 
    
             (
                select 
                        b.USER_ID as user_id,
                        b.APPLY_STATUS as APPLY_STATUS,
                        b.create_time as create_time,
                        a.SCORE as score
                from
                (
                    select 
                            USER_ID,
                            CREDIT_ID,
                            CREDIT_APPLY_ID,
                            CREDIT_AMOUNT,
                            SCORE,
                            PRODUCT_NO
                    from ods_credit_credit_info where product_no like 'PN00000053%%'
                    ) as a 

                    right join
                    (
                    select 
                            TENANT_ID,
                            USER_ID,
                            CREDIT_APPLY_ID,
                            APPLY_STATUS,
                            create_time
                    from ods_credit_credit_apply
                    where product_no like 'PN00000053%%'
                    ) as b
                    on a.USER_ID = b.USER_ID
                ) as t
                  where date(t.create_time) = date(ADDDATE(now(),-1) )
) as k group by k.score_flag


"""
table_day = mysql_df(sql_day)

In [281]:
sql_week =  """

select
	score_flag as score_interval,
	count(distinct(user_id)) as apply_num_week,
    date(ADDDATE(now(),-8) ) as applytime1,
    date(ADDDATE(now(),-2) ) as applytime2
from
(
	select
		user_id,
        create_time,
		case
			when (t.score ) > 680 then ">680"
			when (t.score ) > 650 and (t.score ) <= 680 then "(650,680]"
			when (t.score ) > 630 and (t.score ) <= 650 then "(630,650]"
			when (t.score ) > 610 and (t.score ) <= 630 then "(610,630]"
			when (t.score ) > 595 and (t.score ) <= 610 then "(595,610]"
			when (t.score ) <= 595 then "<=595"
		end as score_flag
	from 
    
             (
                select 
                        b.USER_ID as user_id,
                        b.APPLY_STATUS as APPLY_STATUS,
                        b.create_time as create_time,
                        a.SCORE as score
                from
                (
                    select 
                            USER_ID,
                            CREDIT_ID,
                            CREDIT_APPLY_ID,
                            CREDIT_AMOUNT,
                            SCORE,
                            PRODUCT_NO
                    from ods_credit_credit_info where product_no like 'PN00000053%%'
                    ) as a 

                    right join
                    (
                    select 
                            TENANT_ID,
                            USER_ID,
                            CREDIT_APPLY_ID,
                            APPLY_STATUS,
                            create_time
                    from ods_credit_credit_apply
                    where product_no like 'PN00000053%%'
                    ) as b
                    on a.USER_ID = b.USER_ID
                ) as t
                  where date(t.create_time) between date(ADDDATE(now(),-8))  and  date(ADDDATE(now(),-2))
) as k group by k.score_flag


"""
table_week = mysql_df(sql_week)

In [282]:
sql_month =  """

select
	score_flag as score_interval,
	count(distinct(user_id)) as apply_num_month,
    date(ADDDATE(ADDDATE(now(),INTERVAL -1 month),-2)) as applytime1,
    date(ADDDATE(now(),-2)) as applytime2
from
(
	select
		user_id,
        create_time,
		case
			when (t.score ) > 680 then ">680"
			when (t.score ) > 650 and (t.score ) <= 680 then "(650,680]"
			when (t.score ) > 630 and (t.score ) <= 650 then "(630,650]"
			when (t.score ) > 610 and (t.score ) <= 630 then "(610,630]"
			when (t.score ) > 595 and (t.score ) <= 610 then "(595,610]"
			when (t.score ) <= 595 then "<=595"
		end as score_flag
	from 
    
             (
                select 
                        b.USER_ID as user_id,
                        b.APPLY_STATUS as APPLY_STATUS,
                        b.create_time as create_time,
                        a.SCORE as score
                from
                (
                    select 
                            USER_ID,
                            CREDIT_ID,
                            CREDIT_APPLY_ID,
                            CREDIT_AMOUNT,
                            SCORE,
                            PRODUCT_NO
                    from ods_credit_credit_info where product_no like 'PN00000053%%'
                    ) as a 

                    right join
                    (
                    select 
                            TENANT_ID,
                            USER_ID,
                            CREDIT_APPLY_ID,
                            APPLY_STATUS,
                            create_time
                    from ods_credit_credit_apply
                    where product_no like 'PN00000053%%'
                    ) as b
                    on a.USER_ID = b.USER_ID
                ) as t
                  where date(t.create_time) between date(adddate(ADDDATE(now(), INTERVAL -1 month),-2))  and  date(ADDDATE(now(),-2))
) as k group by k.score_flag


"""
table_month = mysql_df(sql_month)
#between date(adddate(ADDDATE(now(), INTERVAL -1 month)), -1)  and  date(ADDDATE(now(),-1))

In [283]:
base_dict = {'score_interval':['>680','(650,680]','(630,650]','(610,630]','(595,610]','<=595','汇总'],\
             'toutiao_develop':['24.89%','20.10%','17.00%','14.87%','14.29%','10.85%','21590']}
dairy_report = pd.DataFrame(base_dict)

In [284]:
application_monitor = dairy_report.merge(table_day[['score_interval','apply_num_day']],on = 'score_interval',how='left')\
                                  .merge(table_week[['score_interval','apply_num_week']],on = 'score_interval',how='left')\
                                  .merge(table_month[['score_interval','apply_num_month']],on = 'score_interval',how='left').fillna(0)

total_num = sum(application_monitor.apply_num_day)
application_monitor['apply_num_day'][6] = total_num
application_monitor['apply_day_percent'] = application_monitor['apply_num_day'].map(lambda x: x/(total_num+0.000001))
application_monitor['apply_day_percent'][6] = total_num

total_num_week = sum(application_monitor.apply_num_week)
application_monitor['apply_num_week'][6] = total_num_week
application_monitor['apply_week_percent'] = application_monitor['apply_num_week'].map(lambda x: x/(total_num_week+0.000001))
application_monitor['apply_week_percent'][6] = total_num_week


total_num_month = sum(application_monitor.apply_num_month)
application_monitor['apply_num_month'][6] = total_num_month
application_monitor['apply_month_percent'] = application_monitor['apply_num_month'].map(lambda x: x/(total_num_week+0.000001))
application_monitor['apply_month_percent'][6] = total_num_month

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html

In [285]:
application_monitor

Unnamed: 0,score_interval,toutiao_develop,apply_num_day,apply_num_week,apply_num_month,apply_day_percent,apply_week_percent,apply_month_percent
0,>680,24.89%,0.0,3.0,3.0,0.0,0.044776,0.044776
1,"(650,680]",20.10%,6.0,24.0,24.0,0.272727,0.358209,0.358209
2,"(630,650]",17.00%,6.0,12.0,12.0,0.272727,0.179104,0.179104
3,"(610,630]",14.87%,6.0,23.0,23.0,0.272727,0.343284,0.343284
4,"(595,610]",14.29%,4.0,4.0,4.0,0.181818,0.059701,0.059701
5,<=595,10.85%,0.0,1.0,1.0,0.0,0.014925,0.014925
6,汇总,21590,22.0,67.0,67.0,22.0,67.0,67.0


In [286]:
spark_df = spark.createDataFrame(application_monitor).createOrReplaceTempView("application_monitor")



In [287]:
import datetime
now_time = (datetime.datetime.now()+datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
now_time


sql =  """

insert overwrite table renhang_user_profile.apply_monitor_user_day partition(dt='%s') 
select 
    score_interval,
    toutiao_develop,
    apply_day_percent,
    apply_week_percent,
    apply_month_percent
from application_monitor

""" %now_time

spark.sql(sql)


DataFrame[]

In [288]:
########################################
#额度表：1. 申请通过有得分，得分高于一定点才有额度

In [289]:
sql =  """

select
	score_flag as score_interval,
	count(distinct(user_id)) as apply_num_%s,
    %s as applytime1,
    %s as applytime2
from
(
	select
		user_id,
        create_time,
		case
			when (t.score ) > 680 then ">680"
			when (t.score ) > 650 and (t.score ) <= 680 then "(650,680]"
			when (t.score ) > 630 and (t.score ) <= 650 then "(630,650]"
			when (t.score ) > 610 and (t.score ) <= 630 then "(610,630]"
			when (t.score ) > 595 and (t.score ) <= 610 then "(595,610]"
			when (t.score ) <= 595 then "<=595"
		end as score_flag
	from 
    
             (
                select 
                        b.USER_ID as user_id,
                        b.APPLY_STATUS as APPLY_STATUS,
                        b.create_time as create_time,
                        a.SCORE as score
                from
                (
                    select 
                            USER_ID,
                            CREDIT_ID,
                            CREDIT_APPLY_ID,
                            CREDIT_AMOUNT,
                            SCORE,
                            PRODUCT_NO
                    from ods_credit_credit_info where product_no like 'PN00000053%%'
                    ) as a 

                    right join
                    (
                    select 
                            TENANT_ID,
                            USER_ID,
                            CREDIT_APPLY_ID,
                            APPLY_STATUS,
                            create_time
                    from ods_credit_credit_apply
                    where product_no like 'PN00000053%%'
                    ) as b
                    on a.USER_ID = b.USER_ID
                ) as t
                  where date(t.create_time) between 
                    %s and %s

) as k group by k.score_flag


"""
# credit_apply = mysql_df(sql)

In [290]:
def get_monitor_df(sql,flag,monthdiff):
    
    if(flag == 'online'):
        time1 = """ADDDATE(DATE_ADD(STR_TO_DATE('2019-01-14',"%Y-%m-%d"),interval -day(STR_TO_DATE('2019-01-14',"%Y-%m-%d"))+1 day),  interval -0 month)"""
        time2 = """last_day(STR_TO_DATE('2019-01-14',"%Y-%m-%d"))"""
        flag = 'online'
        excute_sql = sql%(flag,time1,time2,time1,time2)

    else:
        time1 = """ADDDATE(DATE_ADD(curdate(),interval -day(curdate())+1 day),  interval -%d month)"""%monthdiff
        time2 = """ADDDATE(last_day(curdate()),interval -%d month)"""%monthdiff
        date_sql = """select ADDDATE(last_day(curdate()),interval -%d month) as date"""%monthdiff
        date =  mysql_df(date_sql)['date'].astype('str')
        flag = date[0][0:7].replace('-','_')
        excute_sql = sql%(flag,time1,time2,time1,time2)

    #获取每个月的时间
    credit_apply = mysql_df(excute_sql)
    return credit_apply

In [291]:
date_sql = """select ADDDATE(last_day(curdate()),interval -%d month) as date"""%1
res = mysql_df(date_sql)['date'].astype('str')
res[0][0:7]

'2018-12'

In [292]:
#上线月
apply_month_online = get_monitor_df(sql,'online',-1)
#当月
apply_month_now = get_monitor_df(sql,'not_online',0)
#上月
apply_month_last1 = get_monitor_df(sql,'not_online',1)
#上两月
apply_month_last2 = get_monitor_df(sql,'not_online',2)
#上3月
apply_month_last3 = get_monitor_df(sql,'not_online',3)
#上4月
apply_month_last4 = get_monitor_df(sql,'not_online',4)
#上5月
apply_month_last5 = get_monitor_df(sql,'not_online',5)
#上6月
apply_month_last6 = get_monitor_df(sql,'not_online',6)


In [293]:
####申请相关的表


In [299]:
hive_sql = """


select
	score_interval,
	count(distinct(user_id)) as user_num
from
(
	select
		user_id,
		case
			when cast(t.out_credit_score as int) > 680 then ">680"
			when cast(t.out_credit_score as int) > 650 and cast(t.out_credit_score as int) <= 680 then "(650,680]"
			when cast(t.out_credit_score as int) > 630 and cast(t.out_credit_score as int) <= 650 then "(630,650]"
			when cast(t.out_credit_score as int) > 610 and cast(t.out_credit_score as int) <= 630 then "(610,630]"
			when cast(t.out_credit_score as int) > 595 and cast(t.out_credit_score as int) <= 610 then "(595,610]"
			when cast(t.out_credit_score as int) <= 595 then "<=595"
		end as score_interval
	from 
			(
				SELECT
     				a.session_id as session_id,
     				a.user_id as user_id,
     				a.t02_apply_time as apply_time,
     				b.out_credit_score as out_credit_score
				FROM 
				(
					SELECT 
						session_id,
       					user_id,
       					t02_apply_time
					FROM ods.ods_toutiao_shouxin_input 
				) AS a

			left JOIN
  			(
  				SELECT 
  					session_id,
  					user_id,
          			out_credit_score
   				FROM ods.ods_toutiao_shouxin_output
  			) AS b ON a.session_id = b.session_id
		) as t
) as a group by a.score_interval

"""
toutiao = hive_df(hive_sql)

In [295]:
toutiao
#940条

Unnamed: 0,score_interval,user_num
0,,272
1,"(595,610]",103
2,"(610,630]",96
3,"(630,650]",55
4,"(650,680]",46
5,<=595,362
6,>680,6


In [300]:
toutiao

Unnamed: 0,score_interval,user_num
0,,272
1,"(595,610]",121
2,"(610,630]",114
3,"(630,650]",62
4,"(650,680]",53
5,<=595,483
6,>680,8
