In [2]:
# Load the training and test sets from their CSV files.
import pandas as pd
import numpy as np
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
train_data

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,2.131956,56.372401,6.358667,1,0,0,1,1
1,3.803057,67.241081,1.872950,1,0,0,1,1
2,15.694986,175.989182,0.855623,1,0,0,1,1
3,26.711462,1.552008,4.603601,1,1,0,1,1
4,10.664474,1.565769,4.886521,1,0,0,1,1
...,...,...,...,...,...,...,...,...
19995,2.895796,0.625744,2.734760,1,1,0,1,0
19996,7.386646,225.860550,2.019975,1,0,1,1,0
19997,2.977114,1.097265,0.429777,1,1,0,0,0
19998,13.794286,0.309331,1.838496,1,0,0,1,0


In [9]:
# Separate the training frame into features (every column but the last)
# and labels (the last column), both as NumPy arrays.
x_train_data = train_data.iloc[:, :-1].to_numpy()
y_train_data = train_data.iloc[:, -1].to_numpy()
# The test frame is used in full as the feature matrix.
x_test_data = test_data.to_numpy()
x_train_data

array([[  2.13195567,  56.37240054,   6.35866732, ...,   0.        ,
          0.        ,   1.        ],
       [  3.80305735,  67.24108053,   1.87294961, ...,   0.        ,
          0.        ,   1.        ],
       [ 15.69498554, 175.9891815 ,   0.85562283, ...,   0.        ,
          0.        ,   1.        ],
       ...,
       [  2.97711433,   1.09726524,   0.42977677, ...,   1.        ,
          0.        ,   0.        ],
       [ 13.79428574,   0.30933093,   1.83849642, ...,   0.        ,
          0.        ,   1.        ],
       [ 19.1842828 ,  12.84247075,   0.54205032, ...,   0.        ,
          0.        ,   1.        ]])

In [10]:
# Min-max normalisation: rescale each feature into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
# The scaler is fitted on the training features only; the test features are
# transformed with the parameters learned from the training data.
x_train_data_scaler = scaler.fit_transform(X=x_train_data)
x_test_data_scaler = scaler.transform(X=x_test_data)
x_train_data_scaler

array([[6.94197902e-04, 2.60920748e-02, 2.38013134e-02, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.24935819e-03, 3.11227079e-02, 6.98060701e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [5.20000123e-03, 8.14574126e-02, 3.16579826e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [9.74969881e-04, 5.07650032e-04, 1.56894531e-03, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.56856566e-03, 1.42949953e-04, 6.85141320e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [6.35918818e-03, 5.94398819e-03, 1.98995274e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [12]:
# Hold out 20% of the scaled training data as a validation set
# (fixed seed so the split is repeatable).
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_data_scaler,
    y_train_data,
    test_size=0.2,
    random_state=2023,
)
x_train.shape[0]

16000

In [13]:
# Create the models: four base learners plus the meta-learner stacked on top.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Base (level-0) estimators; each one contributes one column of predictions
# to the input of the final model.
ori_models = [LogisticRegression(random_state=10),
              SVC(random_state=10),
              DecisionTreeClassifier(criterion='gini',random_state=0),
              RandomForestClassifier(random_state=10)]
# Meta (level-1) estimator. The seed is pinned like the base learners' so that
# re-running the notebook reproduces the same fitted meta-model.
final_model = GradientBoostingClassifier(n_estimators=20, random_state=10)

In [25]:
from sklearn.model_selection import KFold
class MyStacking:
    """Hand-rolled two-level stacking ensemble.

    Every base model contributes one column of predicted labels; the final
    model is trained on, and predicts from, the matrix made of those columns.

    Parameters
    ----------
    ori_models : list
        Base estimators. Each must provide ``fit(x, y)`` returning the fitted
        estimator and ``predict(x)``.
    final_model : object
        Meta-estimator providing ``fit``, ``predict`` and ``score``.
    n_splits : int, default 5
        Number of folds used to build the out-of-fold training matrix.
    random_state : int, default 2021
        Seed for the shuffled K-fold split.
    """

    def __init__(self, ori_models, final_model, n_splits=5, random_state=2021):
        self.ori_models = ori_models
        self.final_model = final_model
        self.n_splits = n_splits
        self.random_state = random_state

    def train(self, x, y):
        """Fit the base models and then the final model on their out-of-fold predictions."""
        dataset_train = self.stacking(x, y)
        self.final_model.fit(dataset_train, y)

    def stacking(self, x, y):
        """Build the level-1 training matrix and leave the base models fully fitted.

        For each base model, every row of ``x`` is predicted by a copy of the
        model state that was fitted without that row (K-fold, out-of-fold).

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_rows, n_base_models)`` holding the out-of-fold
            predictions, one column per base model.
        """
        # Plain arrays so that the integer fold indices select rows by position.
        x = np.asarray(x)
        y = np.asarray(y)
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        dataset_train = np.zeros((x.shape[0], len(self.ori_models)))
        for i, model in enumerate(self.ori_models):
            for train, val in kf.split(x, y):
                dataset_train[val, i] = model.fit(x[train], y[train]).predict(x[val])
            # Without this refit the model would stay fitted on the last fold's
            # training part only; predict()/score() should use all rows.
            model.fit(x, y)
        return dataset_train

    def _base_predictions(self, x, verbose=False):
        """Return the ``(n_rows, n_base_models)`` matrix of base-model predictions for ``x``."""
        x = np.asarray(x)
        dataset = np.zeros((x.shape[0], len(self.ori_models)))
        for i, model in enumerate(self.ori_models):
            dataset[:, i] = model.predict(x)
            if verbose:
                print('the {} of {} finish'.format(i, model))
        return dataset

    def predict(self, x):
        """Predict with the final model from the base models' predictions on ``x``.

        Prints one progress line per base model.
        """
        # Use the instance's own meta-model, not a same-named notebook global.
        return self.final_model.predict(self._base_predictions(x, verbose=True))

    def score(self, x, y):
        """Return ``final_model.score`` evaluated on the base models' predictions for ``x``."""
        return self.final_model.score(self._base_predictions(x), y)

In [17]:
# Sanity check: number of rows in the training split.
len(x_train)

16000

In [26]:
# Assemble the stacking ensemble from the base models and the meta-model.
my_stacking = MyStacking(ori_models=ori_models, final_model=final_model)

In [27]:
# Fit the stacked ensemble on the training split, then display its score on the
# held-out validation split, which is otherwise never used after being created.
my_stacking.train(x_train, y_train)
my_stacking.score(x_valid, y_valid)

In [28]:
# Predict labels for the scaled test set (one progress line is printed per base model).
y_pred = my_stacking.predict(x=x_test_data_scaler)
y_pred

the 0 of LogisticRegression(random_state=20) finish
the 1 of SVC(random_state=20) finish
the 2 of DecisionTreeClassifier(random_state=0) finish
the 3 of RandomForestClassifier(random_state=20) finish


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [29]:
# Wrap the predicted labels in a single-column DataFrame.
out = pd.DataFrame(y_pred)
out

# Write the predictions to disk, one value per row, without index or header.
out.to_csv(".pred.csv", index=False, header=False)