## 设置

In [9]:
#pyspark package 
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql import Window

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import *

spark = SparkSession.builder.getOrCreate()

#data processing 
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import average_precision_score
import torch.nn as nn

#可视化
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

#### 3. 基于 customer_id 下的做 merchant_id 的 embedding，你会怎么做? (实现代码会加分，不知道写可以描述思想)

例子： 通过计算每个merchant_id下所有用户的消费行为，比如所有用户在这个产品上平均消费金额，购买产品总数，用户平均年龄等数值型特征均可作为每款产品的embedding   
下面的数据我简单挑选了两个feature: 所有用户在这个产品上平均消费金额，购买产品总数,当拥有更多数据的时候可以增加更多feature, 目前根据这两个值产生metrics便是每个产品与用户的联系（embedding 数据库）

* 每个商家的流量（客户数量，消费金额）

In [None]:
# df_q3 = spark.read.csv("data_train.csv",header=True)

# for col in df_q3.columns:
#     df_q3 = df_q3.withColumn( col, regexp_replace(col , "'", ""))
    
# #给数据分类
# str_cols = ['customer_id','age_group','gender','merchant_id','type']
# int_cols = ['id','time','zipcode_customer','zipcode_merchant','fraud']
# double_cols = [ 'amount']

# for col in df_q3.columns:
#     if col in int_cols:
#         df_q3 = df_q3.withColumn(col,f.col(col).cast("integer"))
#     elif col in str_cols:
#         df_q3 = df_q3.withColumn(col,f.col(col).cast("string"))
#     elif col in double_cols:
#         df_q3 = df_q3.withColumn(col,f.col(col).cast("double"))  
        
        
        
# df_q3.withColumn("用户平均消费金额",avg("amount").over(Window.partitionBy("merchant_id")))\
# .withColumn("产品售出总数",count("merchant_id").over(Window.partitionBy("merchant_id")))\
#     .withColumn("产品客户数量",f.approx_count_distinct("customer_id").over(Window.partitionBy("merchant_id")))\
#     .withColumn("平均购买数量",f.col("产品售出总数")/f.col("产品客户数量"))\
#     .select("merchant_id","用户平均消费金额","平均购买数量")\
#     .distinct()\
#     .orderBy(desc("平均购买数量"))\
#     .show()

#### 4. 对于支付宝反欺诈模型，你觉得哪些变量会比较重要(请考虑交易双方的特征)?

> 1. 登陆地理位置比较异常，与平时的不太符合，ip地址,zipcode
> 2. 突然出现数额异常大的交易额
> 3. 突然出现新的收款方或付款方（之前从未出现过，或者没什么交集）
> 1. 最近支付比较频繁，与历史的支付习惯差异较大
> 1. 账户比较新
> 1. 账户里出现比较异常的行为，比如突然改了头像，改了名字
> 1. 与平时的转账行为不太一样
> 1. 突然加好友
> 1. 之前账号是否有被举报的情况或者被盗用的情况


## 辅助函数

* 处理数据，根据数据类型做对应的处理
* 对于类别数据用dummy处理
* 对于数值型,用standardscaler处理

In [10]:
def process_data(df):
    """Convert a Spark DataFrame into a dense float32 feature matrix.

    Columns of the result, left to right:
      * one-hot encoding of ``age_group``, ``gender`` and ``type``;
      * ``amount`` standardised with a ``StandardScaler``;
      * the two merchant aggregates, passed through unscaled.

    The encoder and the scaler are fitted on ``df`` itself on every call.

    NOTE(review): a later cell defines ``process_data`` again with a
    different feature set; that later definition replaces this one once
    its cell has run.
    """
    # Categorical part: fit a fresh encoder on the collected rows.
    cat_rows = df.select("age_group","gender","type").collect()
    encoder = OneHotEncoder(handle_unknown="ignore")
    cat_arry = encoder.fit_transform(cat_rows).toarray().astype(np.float32)

    # Transaction amount: standardised with statistics of this frame.
    amount_rows = df.select("amount").collect()
    amount_ary = np.array(amount_rows, dtype="float").astype(np.float32)
    scale_ary = StandardScaler().fit_transform(amount_ary)

    # Merchant-level aggregates are used as-is.
    reg_rows = df.select("用户平均消费金额","平均购买数量").collect()
    reg_ary = np.array(reg_rows, dtype="float").astype(np.float32)

    return np.concatenate((cat_arry, scale_ary, reg_ary), axis=1)

* 返回每个预测结果的AP值

In [11]:
def result(y_vali,y_pred):
    """Print the average-precision (AP) score of ``y_pred`` against ``y_vali``.

    Nothing is returned; the score is only written to stdout.
    """
    ap_value = average_precision_score(y_vali, y_pred)
    print("AP值为：", ap_value)

## 读取数据
* 训练数据集加一列标为train, 测试数据集加一列标为test,方便之后数据处理的分割
* 集合训练集和测试集为df

In [12]:
# Tag every row with its origin so the combined frame can be split again later.
df_train_raw = (
    spark.read.csv("data_train.csv", header=True)
    .withColumn("kind", f.lit("train"))
)

# The test file gets a placeholder `fraud` column so both frames expose the same columns.
df_test_raw = (
    spark.read.csv("data_test.csv", header=True)
    .withColumn("fraud", f.lit("unknown"))
    .withColumn("kind", f.lit("test"))
)

# (id, fraud) of the training rows, both cast to integer.
df_train_label = df_train_raw.select(
    f.col("id").cast("integer").alias("id"),
    f.col("fraud").cast("integer").alias("fraud"),
)

# Put the test columns in the training column order before stacking the two frames.
df = df_train_raw.union(df_test_raw.select(df_train_raw.columns))

## 数据预处理

* 去掉数据中的标点符号
* 给数据分类，具体分为：integer, string,double
* 看预处理后数据的大致情况，先做最显而易见的处理，比如出现不合理的值
* 只有训练集出现了5条金额为负数的记录，视为异常值，排除掉,主表重新命名为main_df


In [62]:
# Target type for each known column; columns that are not listed (e.g. `kind`)
# stay strings after the quote stripping below.
str_cols = ['customer_id','age_group','gender','merchant_id','type']
int_cols = ['id','time','zipcode_customer','zipcode_merchant','fraud']
double_cols = [ 'amount']

target_types = {
    **{name: "string" for name in str_cols},
    **{name: "integer" for name in int_cols},
    **{name: "double" for name in double_cols},
}

# Build every cleaned column first and apply them in a single select().
# This avoids one withColumn() projection per column, and avoids a loop
# variable named `col`, which would shadow pyspark.sql.functions.col
# (star-imported at the top of the notebook).
cleaned_columns = []
for name in df.columns:
    # Remove the single quotes wrapped around the raw CSV values.
    cleaned = f.regexp_replace(f.col(name), "'", "")
    if name in target_types:
        cleaned = cleaned.cast(target_types[name])
    cleaned_columns.append(cleaned.alias(name))

df = df.select(cleaned_columns)

# Quick per-feature overview (disabled):
# for name in df.columns:
#     df.select(name).describe().show()

# Keep only rows with a non-negative amount; negative amounts are treated as
# outliers (rows whose amount is null do not pass this filter either).
main_df = df.filter(f.col("amount")>=0)

In [63]:
# Summary statistics of every column, restricted to the training rows.
train_only = main_df.filter(f.col("kind")=="train")
for column_name in main_df.columns:
    train_only.select(column_name).describe().show()

+-------+------------------+
|summary|                id|
+-------+------------------+
|  count|            296503|
|   mean|148252.87730984172|
| stddev| 85594.90396831403|
|    min|                 0|
|    max|            296507|
+-------+------------------+

+-------+-----------------+
|summary|             time|
+-------+-----------------+
|  count|           296503|
|   mean|94.97524476986742|
| stddev|50.98429852862592|
|    min|                0|
|    max|              179|
+-------+-----------------+

+-------+-----------+
|summary|customer_id|
+-------+-----------+
|  count|     296503|
|   mean|       null|
| stddev|       null|
|    min|C1000148617|
|    max| C999723254|
+-------+-----------+

+-------+------------------+
|summary|         age_group|
+-------+------------------+
|  count|            296503|
|   mean|3.0249833703069657|
| stddev|1.3611465801196447|
|    min|                 0|
|    max|                 U|
+-------+------------------+

+-------+------+
|summar

## 数据分析
1. 观察得出 train & test dataset年龄层，性别还有交易类型均匀分布
2. 了解测试和训练数据中数据分布情况
> a. 训练数据里标签非常不平衡，只有1%的数据是fraud  
> b. 顾客的区域均为28007，商铺的区域也均为28007  
> c. 消费金额平均在37，38上下  
> d. 测试和训练数据中消费总流量（包括重复的客户）比较接近，总客户数一致（unique values),总商铺数量一致


In [None]:
# analyze_cols = ['age_group','gender','type']
# describe_cols = ['time','zipcode_customer','zipcode_merchant','amount','fraud']
# unique_cols = ['customer_id','merchant_id']

# #训练和预测数据里年龄性别还有产品交易类型分布均匀
# print("展示categorical数据分布情况")
# for col in analyze_cols:
#     main_df.groupBy(col,'kind').count()\
#     .groupBy(col).pivot('kind').agg(first("count"))\
#     .withColumn("cts_dif",f.col("train")-f.col("test"))\
#     .withColumn("dif_pct",f.round(f.col("cts_dif")/(f.col("train")+f.col("test"))/2 ,2)).show()

# print("训练数据集情况")
# main_df.filter(f.col("kind")=="train").select(describe_cols).describe().show()
# print("测试数据集情况")
# main_df.filter(f.col("kind")=="test").select(describe_cols).describe().show()

# print("在训练和测试集中顾客总流量（包括重复的顾客）")
# main_df.select("customer_id","kind").groupBy("kind").count().show()
# print("unique 顾客的数量")
# main_df.select("customer_id","kind").distinct().groupBy("kind").count().show()

# print("在训练和测试集中商铺总数量（不包括重复的）")
# # main_df.select("merchant_id","kind").groupBy("kind").count().show()
# main_df.select("merchant_id","kind").distinct().groupBy("kind").count().show()

## 特征工程1: 分割数据为df_train,df_vali,df_test然后做特征处理

 > 1. 每个商铺的用户平均消费金额
 > 1. 每个用户在商铺下的平均购买数量
 > 5. 交易金额
 > 6. 交易产品类型
 > 1. 性别
 > 1. 年龄层

* 同一个客户一共购买的次数，=》 在这家店购买次数的比例，同一商品种类下

In [24]:
# Hold out 25% of the labelled rows for validation (fixed seed).
df_train_main = main_df.filter(f.col("kind")=="train")
df_train, df_vali = df_train_main.randomSplit([0.75, 0.25],seed = 10)

# Per-merchant aggregates, computed from the training split only:
#   用户平均消费金额 = mean transaction amount at the merchant
#   平均购买数量     = transactions / (approximate) distinct customers
per_merchant = Window.partitionBy("merchant_id")
embedding1 = (
    df_train
    .withColumn("用户平均消费金额", f.avg("amount").over(per_merchant))
    .withColumn("产品售出总数", f.count("merchant_id").over(per_merchant))
    .withColumn("产品客户数量", f.approx_count_distinct("customer_id").over(per_merchant))
    .withColumn("平均购买数量", f.col("产品售出总数") / f.col("产品客户数量"))
    .select("merchant_id","用户平均消费金额","平均购买数量")
    .distinct()
)

# Attach the aggregates to both splits, then drop identifier columns that
# are not used as model inputs.
dropped_cols = ("customer_id","merchant_id","zipcode_customer","zipcode_merchant","kind")

df_train = df_train.join(embedding1,"merchant_id","left").drop(*dropped_cols)

df_vali = df_vali.join(embedding1,"merchant_id","left").drop(*dropped_cols)

In [25]:
df_vali.columns

['id',
 'time',
 'age_group',
 'gender',
 'type',
 'amount',
 'fraud',
 '用户平均消费金额',
 '平均购买数量']

In [None]:
# df_train_main = main_df.filter(f.col("kind")=="train") 
# df_train, df_vali = df_train_main.randomSplit([0.75, 0.25],seed = 10)

# c_m_relation = df_train.withColumn("客户流量",count("customer_id").over(Window.partitionBy("merchant_id")))\
# .withColumn("客户distinct数量",f.approx_count_distinct("customer_id").over(Window.partitionBy("merchant_id")))\
# .withColumn("用户总消费金额",sum("amount").over(Window.partitionBy("merchant_id")))\
# .withColumn("平均消费金额",f.col("用户总消费金额") / f.col("客户distinct数量"))\
# .select("merchant_id","客户流量","平均消费金额")\
#     .distinct()

# df_train = df_train.join(c_m_relation,"merchant_id","left").drop("customer_id","merchant_id","zipcode_customer","zipcode_merchant","kind")

# df_vali = df_vali.join(c_m_relation,"merchant_id","left").drop("customer_id","merchant_id","zipcode_customer","zipcode_merchant","kind")


In [None]:
# df_train_main = main_df.filter(f.col("kind")=="train") 
# df_train, df_vali = df_train_main.randomSplit([0.75, 0.25],seed = 10)

# c_m_relation = df_train.withColumn("客户流量",count("customer_id").over(Window.partitionBy("merchant_id")))\
# .withColumn("客户distinct数量",f.approx_count_distinct("customer_id").over(Window.partitionBy("merchant_id")))\
# .withColumn("用户总消费金额",sum("amount").over(Window.partitionBy("merchant_id")))\
# .withColumn("平均消费金额",f.col("用户总消费金额") / f.col("客户distinct数量"))\
# .withColumn("客户在同一家消费次数",count("customer_id").over(Window.partitionBy("merchant_id","customer_id")))\
# .select("merchant_id","客户流量","平均消费金额","客户在同一家消费次数")\
#     .distinct()

# df_train = df_train.join(c_m_relation,"merchant_id","left").drop("customer_id","merchant_id","zipcode_customer","zipcode_merchant","kind")

# df_vali = df_vali.join(c_m_relation,"merchant_id","left").drop("customer_id","merchant_id","zipcode_customer","zipcode_merchant","kind")\
# fillna(1, subset=["客户在同一家消费次数"])


In [None]:
# print("Safe 信息")
# print(df_train.filter(f.col("fraud")== 0).drop("id").describe().toPandas())
# print("\n")
# print("Fraud 信息")
# print(df_train.filter(f.col("fraud")== 1).drop("id").describe().toPandas())

## 特征工程 2

* 将年龄层、性别、交易类型做 dummy（one-hot）处理
* 交易金额、时间戳、用户平均消费金额、平均购买数量用 StandardScaler 处理
* 其他数据保持不变

In [26]:
df_train.columns

['id',
 'time',
 'age_group',
 'gender',
 'type',
 'amount',
 'fraud',
 '用户平均消费金额',
 '平均购买数量']

In [27]:
def process_data(df, encoder=None, scaler=None):
    """Convert a Spark DataFrame into a dense float32 feature matrix.

    Columns of the result, left to right: the one-hot encoding of
    ``age_group``, ``gender`` and ``type``, followed by the standardised
    ``amount``, ``time``, ``用户平均消费金额`` and ``平均购买数量``.

    Parameters
    ----------
    df : Spark DataFrame containing the seven columns named above.
    encoder : optional, already fitted one-hot encoder. When given, it is
        applied with ``transform`` so the column layout matches the data it
        was fitted on. When ``None`` (default) a new
        ``OneHotEncoder(handle_unknown="ignore")`` is fitted on ``df``.
    scaler : optional, already fitted scaler, applied with ``transform``.
        When ``None`` (default) a new ``StandardScaler`` is fitted on ``df``.

    Returns
    -------
    numpy.ndarray with one row per row of ``df``.
    """
    cat_cols = ["age_group", "gender", "type"]
    num_cols = ["amount", "time", "用户平均消费金额", "平均购买数量"]

    # Collect once so the categorical and numeric parts are guaranteed to
    # come from the same rows in the same order.
    rows = df.select(cat_cols + num_cols).collect()
    n_cat = len(cat_cols)
    cat_rows = [tuple(row[:n_cat]) for row in rows]
    num_ary = np.array([tuple(row[n_cat:]) for row in rows], dtype="float").astype(np.float32)

    if encoder is None:
        cat_encoded = OneHotEncoder(handle_unknown="ignore").fit_transform(cat_rows)
    else:
        cat_encoded = encoder.transform(cat_rows)
    # A caller-supplied encoder may be configured to return a dense array.
    if hasattr(cat_encoded, "toarray"):
        cat_encoded = cat_encoded.toarray()
    cat_arry = np.asarray(cat_encoded).astype(np.float32)

    if scaler is None:
        scale_ary = StandardScaler().fit_transform(num_ary)
    else:
        scale_ary = scaler.transform(num_ary)

    final_ary = np.concatenate((cat_arry, scale_ary), axis=1)

    return final_ary

In [28]:
# Feature matrices for the two splits.
X_train = process_data(df_train)
X_vali = process_data(df_vali)

# Labels are kept as (n, 1) float32 column vectors.
y_train = np.asarray(df_train.select("fraud").collect(), dtype="float").astype(np.float32)
y_vali = np.asarray(df_vali.select("fraud").collect(), dtype="float").astype(np.float32)

# Sanity check: row counts of X and y must agree within each split.
for _label, _array in (
    ("X_train shape", X_train),
    ("y_train shape", y_train),
    ("X_vali shape", X_vali),
    ("y_vali shape", y_vali),
):
    print(_label, _array.shape)

X_train shape (222087, 31)
y_train shape (222087, 1)
X_vali shape (74416, 31)
y_vali shape (74416, 1)


In [None]:
# np.isnan(y_train).any()

In [None]:
# np.isnan(y_vali).any()

## 模型训练

#### BASELINE

In [None]:
# Baseline: give every validation row the same score, namely the share of
# fraud labels in the training split.
p_1 = y_train.sum() / y_train.shape[0]
y_base = np.full_like(y_vali, p_1, dtype=float)

result(y_vali,y_base)

### 线性

In [29]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the 0/1 labels; the raw predictions are used as
# ranking scores for the AP metric.
regressor = LinearRegression().fit(X_train, y_train)

# Score the validation split.
y_pred_line = regressor.predict(X_vali)
result(y_vali,y_pred_line)

AP值为： 0.7893854605187441


In [31]:
from scipy import stats
stats.describe(y_pred_line)

DescribeResult(nobs=74416, minmax=(array([-2.1622343], dtype=float32), array([2.7754402], dtype=float32)), mean=array([0.01255421], dtype=float32), variance=array([0.00706812], dtype=float32), skewness=array([10.628092], dtype=float32), kurtosis=array([172.67578], dtype=float32))

### 逻辑回归

In [32]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, fitted on the training split.
classifier = LogisticRegression(random_state = 0).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the fraud class.
y_pred_log = classifier.predict_proba(X_vali)
fraud_prob_log = y_pred_log[:,1]
result(y_vali,fraud_prob_log)

AP值为： 0.8457885779639692


In [33]:
stats.describe(y_pred_log[:,1])

DescribeResult(nobs=74416, minmax=(1.7915749998505477e-06, 1.0), mean=0.011551131278114095, variance=0.006922456797977055, skewness=10.16078240925566, kurtosis=108.34485258005037)

### 决策树

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based tree, depth capped at 8, fixed seed.
tree_options = dict(criterion = 'entropy', random_state = 0, max_depth=8)
classifier = DecisionTreeClassifier(**tree_options).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the fraud class.
y_pred_rdtree = classifier.predict_proba(X_vali)
result(y_vali,y_pred_rdtree[:,1])

### Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Score the validation split with THIS model's probabilities. The cell used
# to pass `y_pred_rdtree`, so it re-printed the decision-tree AP instead.
y_pred = classifier.predict_proba(X_vali)
result(y_vali,y_pred[:,1])

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the other sklearn models in this notebook) so
# the reported AP is reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train,y_train)

# Column 1 of predict_proba is the probability of the fraud class.
y_rfc = rfc.predict_proba(X_vali)
result(y_vali,y_rfc[:,1])

### Xgboost

In [35]:
from xgboost import XGBClassifier

# Gradient-boosted trees with a lowered learning rate (eta) and a small
# minimum split gain (gamma); everything else is left at library defaults.
xgb_options = {"eta": 0.1, "gamma": 0.05}
classifier = XGBClassifier(**xgb_options)
classifier.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the fraud class.
y_pred_xg = classifier.predict_proba(X_vali)
result(y_vali,y_pred_xg[:,1])

AP值为： 0.8821342987982309


In [36]:
stats.describe(y_pred_xg[:,1])

DescribeResult(nobs=74416, minmax=(2.6233061e-05, 0.9986639), mean=0.010523386, variance=0.0075099496, skewness=10.009506225585938, kurtosis=103.08457975240842)

In [None]:
# Second XGBoost run: 1000 trees with eta 0.05; every other option is passed
# explicitly as None (importance_type is set to 'gain').
# This cell overwrites `classifier` and `y_pred_xg` from the previous XGBoost
# cell, so later cells that read `y_pred_xg` see whichever cell ran last.
# NOTE(review): some keyword names here (e.g. `gpu_id`) are tied to the
# installed XGBoost version - confirm before upgrading the library.
classifier =  XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, 
                                      monotone_constraints=None,
                                     n_estimators=1000,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None,
             eta =0.05)

classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_xg = classifier.predict_proba(X_vali)

# Column 1 of predict_proba is the probability of the fraud class.
result(y_vali,y_pred_xg[:,1])

### XGBoost 模型调参优化

In [None]:
# Search space handed to GridSearchCV below: 2 x 2 combinations of eta and
# gamma with a fixed binary-logistic objective. The commented keys are a
# wider grid that is currently disabled.
parameters = {'objective':['binary:logistic'],
              'eta': [0.05,0.1],
              'gamma':[0.00,0.05]
              
#               'eta': [0.05,0.1,0.25,0.3],
#               'max_depth': [5,10,15,20],
#               'subsample': [0.5,0.6],
#               'gamma':[0.00,0.05,0.10,0.15],
#               'scale_pos_weight': [1, 30],
#               'min_child_weight': [1,3],
#               'n_estimators': [100,200,500,1000]
             }

# {'objective':['binary:logistic'],
#               'eta': np.arange(0.01,0.4,0.05),
#               'max_depth': np.arange(5,35,5),
#               'subsample': [0.5,0.6,0.7,0.8,0.9,1.0],
#               'colsample_bytree': [0.5,0.6,0.7,0.8,0.9,1.0],
#               'gamma':[0.00,0.05,0.10,0.15,0.20],
#               'scale_pos_weight': [1, 30,40,50,300,400,500,600,700],
#               'min_child_weight': [1,3,6,8,11],
#               'n_estimators': np.arange(100,1100,120)}

# parameters = {'objective':['binary:logistic'],
#               'eta': [0.01,0.02],
#               'max_depth': [10,15],
#               'subsample': [0.5,0.6],
#               'colsample_bytree': [0.5,0.6],
#               'gamma':[0.00,0.05],
#               'scale_pos_weight': [1, 30],
#               'min_child_weight': [1,3],
#               'n_estimators': [100]}

# Use the class imported in the XGBoost cell above. The `xgb` module alias is
# only imported in a later cell, so `xgb.XGBClassifier()` raised a NameError
# here when the notebook was run top to bottom on a fresh kernel.
xgb_model = XGBClassifier()


# Grid search scored by average precision on 5 stratified 75/25 shuffles of
# the training split; the best configuration is refitted on all of X_train.
clf = GridSearchCV(xgb_model, 
                   parameters, 
                   n_jobs=5, 
                   cv = StratifiedShuffleSplit(n_splits=5,test_size=0.25,random_state = 0),
                   scoring='average_precision',
                   verbose=2, refit=True)

clf.fit(X_train, y_train)


In [None]:
clf

In [None]:
# One row per parameter combination, with its mean cross-validated AP.
result_df = (
    pd.DataFrame(clf.cv_results_["params"])
    .assign(average_precision=clf.cv_results_["mean_test_score"])
)
result_df

In [None]:
# The original cell referenced `model`, `x_test` and `y_test`, none of which
# is defined anywhere in this notebook. It now builds its own classifier and
# evaluates on the validation split created earlier.
model = XGBClassifier()

# NOTE(review): passing `eval_metric` / `early_stopping_rounds` to fit() is
# kept from the original cell; newer XGBoost releases expect them on the
# constructor instead - confirm against the installed version.
model.fit(X_train,
          y_train,
          eval_set = [(X_vali,y_vali)],          # evaluation data
          eval_metric = "logloss",               # binary log-loss (was the multiclass "mlogloss")
          early_stopping_rounds = 10,            # stop after 10 rounds without improvement
          verbose = False)                       # do not print per-round evaluation


# Parameter grid. Only the second of the two identical-looking definitions in
# the original cell was ever effective, so the first one has been dropped.
# NOTE(review): range(100,110,120) contains the single value 100; the dropped
# definition used range(100,1100,120) - confirm which one is intended.
param_grid = {'learning_rate':[0.05,0.1,0.25,0.3],
              'max_depth':range(2,10),
              'n_estimators':range(100,110,120)}

kfold = StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
# The search object is only constructed here; it is not fitted in this cell.
grid_search = GridSearchCV(model,                   # estimator
                           param_grid,              # parameters to tune
                           scoring="neg_log_loss",  # model selection criterion
                           n_jobs=1,                # single worker (-1 would use all CPUs)
                           cv=kfold)


In [None]:
import pandas as pd
import xgboost as xgb
import operator
from matplotlib import pylab as plt

def ceate_feature_map(features, path='xgb.fmap'):
    """Write a feature-map file with one line per feature.

    Each line has the form ``<index>\\t<feature name>\\tq``, with indices
    starting at 0 in the order of ``features``.

    Parameters
    ----------
    features : iterable of feature names.
    path : file to (over)write; defaults to ``'xgb.fmap'`` in the current
        working directory, which is what the original version always used.
    """
    # `with` guarantees the file is closed even if a write fails.
    with open(path, 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))

# def get_data():
#     train = pd.read_csv("../input/train.csv")

#     features = list(train.columns[2:])

#     y_train = train.Hazard

#     for feat in train.select_dtypes(include=['object']).columns:
#         m = train.groupby([feat])['Hazard'].mean()
#         train[feat].replace(m,inplace=True)

#     x_train = train[features]

#     return features, x_train, y_train


# features, x_train, y_train = get_data()
# ceate_feature_map(features)

# "reg:squarederror" is the current name of the squared-error objective that
# used to be called "reg:linear".
xgb_params = {"objective": "reg:squarederror", "eta": 0.01, "max_depth": 8, "seed": 42}
num_rounds = 1000

dtrain = xgb.DMatrix(X_train, label=y_train)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)

# No feature-map file is passed: 'xgb.fmap' is only written by
# ceate_feature_map(), whose call is commented out above, so the file does not
# exist on a fresh run. X_train is a plain numpy array, so the returned keys
# are XGBoost's positional feature names (presumably f0, f1, ... - verify).
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))

# Stored under its own name so the Spark DataFrame `df` is not overwritten.
importance_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
importance_df = importance_df.assign(
    fscore=importance_df['fscore'] / importance_df['fscore'].sum()
)

# plt.figure()
# importance_df.plot()
# importance_df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.gcf().savefig('feature_importance_xgb.png')

In [None]:
# Hyper-parameters explored by the manual search below.
tune_dic = {
    'max_depth': [5,10,15,20,25],          # maximum tree depth
    'eta': [0.01,0.025,0.05,0.10,0.20],    # learning rate
    'gamma': [0.00,0.05,0.10,0.15,0.20],   # minimum loss reduction required for a split
}

# `res` starts with a dummy [[0,0,0],0] entry; `res_ary` holds only the AP values.
res = {'result':[[[0,0,0],0]]}
res_ary =np.array([])

# Every (depth, eta, gamma) combination, depth varying slowest.
combos = [
    (d, e, g)
    for d in tune_dic['max_depth']
    for e in tune_dic['eta']
    for g in tune_dic['gamma']
]

for depth, eta, gamma in combos:
    print("depth:",depth, "eta:",eta,"gamma:",gamma, "training starts")

    classifier = XGBClassifier(max_depth = depth, eta = eta, gamma = gamma)
    classifier.fit(X_train, y_train)

    # AP of the fraud-class probability on the validation split.
    y_pred = classifier.predict_proba(X_vali)
    ap = average_precision_score(y_vali, y_pred[:,1])

    res_ary = np.append(res_ary,ap)
    res["result"].append([[depth,eta,gamma],ap])

    print("depth:",depth, "eta:",eta,"gamma:",gamma, "test finish", "ap value",ap)

In [None]:
np.max(res_ary)
#depth: 5 eta: 0.1 gamma: 0.2 test finish ap value 0.8982911924028751

### 深度学习

* ANN

In [None]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout


def ANN_model(X_train,y_train,X_test,Batch_size=500, Epochs = 20):
    """Train a small feed-forward binary classifier and score ``X_test``.

    Architecture: Dense(14, relu) -> Dense(8, relu) -> Dense(4, relu) ->
    Dropout(0.3) -> Dense(1, sigmoid), compiled with Adam and binary
    cross-entropy.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to score after training.
    Batch_size : mini-batch size used for training. Previously this argument
        was ignored and 500 was always used; the default keeps that value.
    Epochs : number of training epochs.

    Returns
    -------
    The output of ``classifier.predict(X_test)``.
    """
    # Start from a clean Keras session so repeated calls do not pile up state.
    keras.backend.clear_session()

    classifier = Sequential()
    # Hidden layers.
    classifier.add(Dense(units = 14,  activation = 'relu', input_dim = X_train.shape[1]))
    classifier.add(Dense(units = 8, activation = 'relu'))
    classifier.add(Dense(units = 4, activation = 'relu'))
    classifier.add(Dropout(0.3))
    # Output layer: a single sigmoid unit.
    classifier.add(Dense(units = 1, activation = 'sigmoid'))

    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    # Fit on the training data, honouring the caller's batch size.
    classifier.fit(X_train, y_train, batch_size = Batch_size, epochs=Epochs)

    # Score the held-out data.
    y_pred = classifier.predict(X_test)
    print("预测完成,返回预测值")
    
    return y_pred

In [None]:
keras.backend.clear_session()

In [None]:
y_pred_ann = ANN_model(X_train,y_train,X_vali,Epochs = 200)

In [None]:
result(y_vali,(y_pred_ann.reshape(y_pred_ann.shape[0],)+y_pred_xg[:,1])/2)

### 深度学习模型和xgboost都表现的很好， 结合两个模型结果更好

In [None]:
result(y_vali,(y_pred_ann.reshape(y_pred_ann.shape[0],)+y_pred_xg[:,1])/2)

In [None]:
# For the final model, all labelled rows are used for training.
labelled_rows = main_df.filter(f.col("kind")=="train")
unlabelled_rows = main_df.filter(f.col("kind")=="test")

# Per-merchant aggregates, computed from the labelled rows only.
merchant_window = Window.partitionBy("merchant_id")
c_m_train_vali = (
    labelled_rows
    .withColumn("用户平均消费金额", f.avg("amount").over(merchant_window))
    .withColumn("产品售出总数", f.count("merchant_id").over(merchant_window))
    .withColumn("产品客户数量", f.approx_count_distinct("customer_id").over(merchant_window))
    .withColumn("平均购买数量", f.col("产品售出总数") / f.col("产品客户数量"))
    .select("merchant_id","用户平均消费金额","平均购买数量")
    .distinct()
)

# Attach the aggregates to both the labelled and the test rows.
df_trainVali = labelled_rows.join(c_m_train_vali,"merchant_id","left").drop("customer_id","merchant_id")

df_test = unlabelled_rows.join(c_m_train_vali,"merchant_id","left").drop("customer_id","merchant_id")

# Test-row ids as an (n, 1) float32 column, used later to build the output file.
test_id = np.array(df_test.select("id").collect(),dtype="float").astype(np.float32)

X_train_m = process_data(df_trainVali)
y_train_m = np.array(df_trainVali.select("fraud").collect(),dtype="float").astype(np.float32)

In [None]:
# Build the test feature matrix. No earlier cell defines `X_test`, so this
# cell raised a NameError on a fresh top-to-bottom run.
# NOTE(review): process_data fits its encoder and scaler on the frame it is
# given, so the test features are encoded/standardised with test-set
# statistics (same as X_vali earlier). The assert below only guards against a
# different number of one-hot columns. Row order is assumed to match the
# separately collected `test_id` - verify.
X_test = process_data(df_test)
assert X_test.shape[1] == X_train_m.shape[1], "train and test feature matrices differ in width"

# XGBoost with the best setting found by the manual search.
classifier = XGBClassifier(max_depth = 5,
                            eta = 0.1,
                            gamma = 0.2)
classifier.fit(X_train_m, y_train_m)
y_xg_final = classifier.predict_proba(X_test)[:,1]
print("Xgboost预测完成")

# Neural network trained on the same data.
y_ann_final = ANN_model(X_train_m,y_train_m,X_test,Epochs = 200)
print("ANN预测完成")

In [None]:
# Average the two models' fraud scores, then put them next to the test ids
# as an (n, 2) array: column 0 = id, column 1 = averaged score.
ann_scores = y_ann_final.reshape(len(y_ann_final))
pred_ary = (y_xg_final + ann_scores) / 2
test_result = np.column_stack((test_id, pred_ary))

## 输出结果 存为csv 

In [None]:
pd.DataFrame(test_result,columns = ["id","预测为1的可能性"]).to_csv("update_result.csv")

## 无监督学习 Autoencoder： 效果非常好！

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import *
import torch.nn.functional as F

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with three encoder and three decoder layers.

    Construct it with a ``layers`` keyword holding four sizes,
    ``(input, hidden1, hidden2, code)``. Layer widths then run
    input -> hidden1 -> hidden2 -> code -> code -> hidden2 -> hidden1 -> input.

    Every layer, including the last one, is followed by a ReLU and no
    dropout is applied.

    NOTE(review): because of the final ReLU the reconstruction can never be
    negative, while the notebook feeds StandardScaler output (which can be
    negative) into this model - confirm this is intended.
    """

    def __init__(self, **kwargs):
        super().__init__()

        n_input, n_hidden1, n_hidden2, n_code = kwargs["layers"]

        # Encoder: input -> hidden1 -> hidden2 -> code
        self.enc1 = nn.Linear(n_input, n_hidden1)
        self.enc2 = nn.Linear(n_hidden1, n_hidden2)
        self.enc3 = nn.Linear(n_hidden2, n_code)

        # Bottleneck layer, same width on both sides
        self.mid = nn.Linear(n_code, n_code)

        # Decoder: code -> hidden2 -> hidden1 -> input
        self.dec3 = nn.Linear(n_code, n_hidden2)
        self.dec2 = nn.Linear(n_hidden2, n_hidden1)
        self.dec1 = nn.Linear(n_hidden1, n_input)

    def forward(self, x):
        """Run ``x`` through all seven layers, applying ReLU after each."""
        pipeline = (
            self.enc1, self.enc2, self.enc3,
            self.mid,
            self.dec3, self.dec2, self.dec1,
        )
        for layer in pipeline:
            x = F.relu(layer(x))
        return x

In [None]:
def AE_Train(X_train, model_layers, BATCH_SIZE , NUM_EPOCHS = 100,LEARNING_RATE= 0.0002, EVAL_BATCH_SIZE = 1024):
    """Train an ``Autoencoder`` on ``X_train`` and return its losses.

    Parameters
    ----------
    X_train : array of float32 rows; it is wrapped directly in a DataLoader.
    model_layers : the four layer sizes passed to ``Autoencoder(layers=...)``.
    BATCH_SIZE : mini-batch size used for training.
    NUM_EPOCHS : number of passes over ``X_train``.
    LEARNING_RATE : Adam learning rate.
    EVAL_BATCH_SIZE : batch size used when computing the per-row
        reconstruction error after training (new, optional).

    Returns
    -------
    train_loss : list of float, mean batch MSE of each epoch.
    test_loss : list of float, one reconstruction MSE per row of ``X_train``
        in the original row order.

    The per-row errors used to be computed one row at a time and stored as
    tensors that still carried their autograd graph. They are now computed in
    batches under ``torch.no_grad()`` and returned as plain floats, which
    callers that apply ``float(...)`` to each entry handle unchanged.
    """
    # Fixed seed so weight initialisation and shuffling are repeatable.
    manualSeed = 888
    print("Random Seed: ", manualSeed)
    torch.manual_seed(manualSeed)

    model = Autoencoder(layers = model_layers)
    print(model)

    print("Reset model parameters")
    for name, module in model.named_children():
        module.reset_parameters()

    # Use the first GPU when one is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    model.to(device)

    trainloader = DataLoader(X_train, batch_size=BATCH_SIZE, shuffle=True)
    # Not shuffled, so the i-th returned loss belongs to the i-th input row.
    testloader = DataLoader(X_train, batch_size=EVAL_BATCH_SIZE, shuffle=False)

    train_loss = []
    for epoch in range(NUM_EPOCHS):
        running_loss = 0.0
        for batch in trainloader:
            batch = batch.to(device)
            optimizer.zero_grad()
            outputs = model(batch)
            loss = criterion(outputs, batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(trainloader)
        train_loss.append(epoch_loss)

        if epoch % 25 == 0:
            print('Epoch {} of {}, Train Loss: {:.3f}'.format(epoch+1, NUM_EPOCHS, epoch_loss))

    print("test starts")

    test_loss = []
    model.eval()
    with torch.no_grad():
        for batch in testloader:
            batch = batch.to(device)
            outputs = model(batch)
            # Mean squared error over the features of each row; this equals
            # nn.MSELoss applied to that row on its own.
            per_row = ((outputs - batch) ** 2).flatten(start_dim=1).mean(dim=1)
            test_loss.extend(per_row.tolist())

    print("test finished")

    return train_loss,test_loss

In [None]:
# The first layer width must equal the number of feature columns. It is taken
# from the data instead of being hard-coded to 30, because the feature
# matrices built by the current process_data are 31 columns wide.
train_loss,test_loss = AE_Train(X_train_m, [X_train_m.shape[1],20,10,4], 200)

#### 计算模型输出值与输入值的MSE, MSE越大，说明越有可能是fraud

In [None]:
# Pair every labelled row's id with its reconstruction loss.
# NOTE(review): this assumes the ids are collected in the same row order that
# was used to build X_train_m - verify.
id_rows = df_trainVali.select("id").collect()
id_li = [int(id_row.id) for id_row in id_rows]
los = list(map(float, test_loss))

df_loss = spark.createDataFrame(zip(id_li,los),["id","loss"])

# Attach the loss to the label and list the 100 rows with the largest loss.
result_df = (
    df_trainVali
    .join(df_loss,"id","left")
    .select("id","fraud","loss")
)

result_df.orderBy(f.desc("loss")).show(100)

In [None]:
result_df.filter(f.col('fraud') == 1).describe().show()

In [None]:
result_df.filter(f.col('fraud') == 0).describe().show()

### 下面这张图 表示通过autoencoder模型算出来的loss（MSE)值大于15（左右），结果100%会是fraud 当loss值在3～15之间时，结果有可能是fraud 也可能不是fraud, 但整体而言这个预测的非常好，下一步可以做的是去掉loss值大的数据，重新训练autoencoder模型，也可以适当增多feature, 再进行预测

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Reconstruction loss of every labelled row against its fraud label.
# Axis labels and a title are set so the figure can be read on its own.
ls = [float(i) for i in test_loss]
fig, ax = plt.subplots()
ax.scatter(ls, y_train_m)
ax.set(
    xlabel="autoencoder reconstruction loss (MSE)",
    ylabel="fraud label",
    title="Reconstruction loss vs. fraud label",
);