***Author: 叫我月月鸟***

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [3]:
import warnings
from jupyterthemes import jtplot

In [4]:
# Apply the jupyterthemes plot style first; the font settings below are kept
# after it, as in the original order (presumably so the theme does not reset them).
jtplot.style()
# Use the SimHei font family, presumably so the Chinese column names render in figures.
plt.rcParams.update({
    'font.family': ['sans-serif'],
    'font.sans-serif': ['SimHei'],
})
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [13]:
class BaseModel:
    """
    End-to-end pipeline: load the CSV files, impute missing values, build
    features, train LightGBM with 5-fold cross-validation and write a
    submission file.
    """

    # Columns whose number of missing training rows equals this value are left
    # un-imputed by preprocess().
    # NOTE(review): presumably the size of a block of rows for which a whole
    # group of columns is absent in the competition data -- confirm.
    _BLOCK_MISSING_COUNT = 8343
    # Columns missing in at least this fraction of training rows are not imputed.
    _MAX_MISSING_RATIO = 0.5

    def __init__(self):
        self.train = None  # training set
        self.test = None  # test set
        self.train_label = None  # training labels
        self.submit = None  # submission template, set by load_data()
        self.features = None  # features used for training (integer-encoded names)
        self.cat_feats = None  # categorical subset of `features`
        self.map_columns = None  # integer code -> original column name

    def load_data(self, path):
        """
        Read train.csv, test.csv, train_label.csv and submission.csv.
        @param: path: root directory holding the data files; a trailing
                separator is added when missing
        """
        print('Loading data...', end='\t')
        if path and not path.endswith(('/', '\\')):
            path = path + '/'
        self.train = pd.read_csv(path + 'train.csv')
        self.test = pd.read_csv(path + 'test.csv')
        self.train_label = pd.read_csv(path + 'train_label.csv')
        # The submission template is read from the same directory as the other
        # files rather than from a machine-specific absolute path.
        self.submit = pd.read_csv(path + 'submission.csv')
        print('Done!')

    def _missing_counter(self, df, cols=None):
        """
        Count missing values per column.
        @param: df: data set to inspect
        @param: cols: columns to inspect; defaults to every column of `df`
        @return: DataFrame indexed by 'Feature' with columns 'Count' (number of
                 missing values) and 'Percent' (fraction of rows missing),
                 restricted to columns that have at least one missing value
        """
        if cols is None:
            cols = df.columns
        cols = list(cols)
        n_missing = df[cols].isna().sum()
        counter = pd.DataFrame({
            'Count': n_missing,
            'Percent': n_missing / df.shape[0],
        })
        counter.index.name = 'Feature'
        return counter[counter['Count'] != 0]

    def preprocess(self):
        """
        Data cleaning: mean-impute numeric columns with missing values.

        A column is imputed when it has missing values in the training set, its
        missing count differs from `_BLOCK_MISSING_COUNT`, and fewer than
        `_MAX_MISSING_RATIO` of its training rows are missing. The fill value is
        the column mean over train and test combined.

        The results are written back to `self.train` / `self.test`. No column
        is dropped: later steps need 'ID' and the columns without any missing
        value, which a selection based on the missing-value report would remove.
        """
        print('Preprocessing...', end='\t')
        counter = self._missing_counter(self.train)
        imputable = counter[
            (counter['Count'] != self._BLOCK_MISSING_COUNT)
            & (counter['Percent'] < self._MAX_MISSING_RATIO)
        ]
        all_df = pd.concat([self.train, self.test])
        fill_values = {}
        for f in imputable.index:
            # Only numeric columns have a meaningful mean; text columns keep their NaNs.
            if pd.api.types.is_numeric_dtype(all_df[f]):
                fill_values[f] = all_df[f].mean()  # mean() skips NaN
        # fillna returns new frames, so the frames loaded from disk are not mutated.
        self.train = self.train.fillna(value=fill_values)
        self.test = self.test.fillna(value=fill_values)
        print("Done!")

    def feature_engineering(self):
        """
        Feature engineering.

        Joins the labels onto the training set, adds derived tax features,
        selects the model features and renames them to integer codes (newer
        LightGBM versions reject Chinese feature names). `self.map_columns`
        maps each integer code back to the original column name.
        """
        print('Feature engineering ...', end='\t')
        train = pd.merge(self.train, self.train_label, on='ID', how='left')
        test = self.test.copy()
        for df in [train, test]:
            df['总税'] = df['印花税'] + df['增值税'] + df['企业所得税'] + df['城建税']
            # Float division: the ratio is inf where the total tax is 0.
            df['注册资本税收比'] = df['注册资本'] / df['总税']
            # ******************
            # Add your own engineered features here, following the two lines above.
            # ******************

        excluded = ['ID', 'Label', '经营范围', '经营期限至', '核准日期', '注销时间', '经营期限自', '成立日期']
        feature_names = [col for col in train.columns if col not in excluded]
        cat_names = ['企业类型', '登记机关', '企业状态', '邮政编码', '行业代码', '行业门类', '企业类别', '管辖机关']
        # Temporarily encode the (Chinese) feature names as integers.
        map_columns = {name: code for code, name in enumerate(feature_names)}
        # Resolve every lookup before touching self.*, so a missing column
        # cannot leave the object half-updated.
        features = [map_columns[name] for name in feature_names]
        cat_feats = [map_columns[name] for name in cat_names]

        self.train = train.rename(columns=map_columns)
        self.test = test.rename(columns=map_columns)
        self.features = features
        self.cat_feats = cat_feats
        self.map_columns = {code: name for name, code in map_columns.items()}

        print("Done!")

    def gen_dataset(self, path='dataset/'):
        """
        Convenience wrapper: load the data, preprocess it and build features.
        @param: path: root directory holding the data files
        """
        self.load_data(path)
        self.preprocess()
        self.feature_engineering()

    def model_train(self, model, params, seed, early_stop=200):
        """
        Train a model with cross-validation.
        @param: model: model type; only 'LGB' is implemented
        @param: params: model parameters
        @param: seed: random seed for the cross-validation split
        @param: early_stop: early-stopping patience in boosting rounds
        @return: (out-of-fold predictions, averaged test predictions,
                 per-fold feature-importance DataFrame)
        @raise: ValueError if `model` is not a supported model type
        """
        if model == 'LGB':
            return self._lgb_model(params, seed, early_stop)
        raise ValueError(
            "Unsupported model type: {!r}; only 'LGB' is implemented".format(model)
        )

    def _lgb_model(self, params, seed=9816, early_stop=200):
        """
        Train LightGBM with stratified 5-fold cross-validation.
        @param: params: LightGBM parameters
        @param: seed: random seed for the 5-fold split
        @param: early_stop: early-stopping patience in boosting rounds
        @return: (out-of-fold predictions, test predictions averaged over the
                 folds, per-fold feature-importance DataFrame)
        """
        train = self.train.copy()
        test = self.test.copy()
        target = train['Label']
        features = list(self.features)
        cat_feats = list(self.cat_feats)
        for df in (train, test):
            for f in cat_feats:
                df[f] = df[f].astype('category')

        oof = np.zeros(train.shape[0])
        predictions = np.zeros(test.shape[0])
        fold_importances = []
        # The split is driven by the caller's seed.
        folds = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
        # `log_evaluation` replaced `print_evaluation` in newer LightGBM; use
        # whichever the installed version provides.
        log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
        for fold, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
            print("Fold {}".format(fold))
            # The integer codes in cat_feats equal the column positions in
            # train[features], so LightGBM reads them as column indices.
            trn_data = lgb.Dataset(
                train.iloc[trn_idx][features],
                label=target.iloc[trn_idx],
                categorical_feature=cat_feats,
            )
            val_data = lgb.Dataset(
                train.iloc[val_idx][features],
                label=target.iloc[val_idx],
                reference=trn_data,
            )
            # Early stopping and periodic logging are passed as callbacks:
            # recent LightGBM releases no longer accept the
            # `early_stopping_rounds` / `verbose_eval` keyword arguments.
            clf = lgb.train(
                params,
                trn_data,
                20000,
                valid_sets=[trn_data, val_data],
                callbacks=[
                    lgb.early_stopping(early_stop),
                    log_evaluation(period=200),
                ],
            )
            oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

            fold_importances.append(pd.DataFrame({
                "Feature": [self.map_columns[i] for i in features],
                "importance": clf.feature_importance(),
                "fold": fold + 1,
            }))

            predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
        feature_importance_df = pd.concat(fold_importances, axis=0)
        print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
        return oof, predictions, feature_importance_df

    def gen_submit(self, pred, name='submit'):
        """
        Write the submission file `<name>.csv`.
        @param: pred: predictions, one per row of the test set
        @param: name: output file name without the '.csv' extension
        """
        scored = self.test[['ID']].copy()
        scored['Label'] = pred
        # Replace the template's placeholder 'Label' column (if any) with the
        # predictions, keeping the template's row order.
        submit = self.submit.drop(columns=['Label'], errors='ignore')
        submit = pd.merge(submit, scored, on='ID', how='left')
        submit.to_csv(name + '.csv', index=False)

In [14]:
# Create the pipeline object; its data attributes start as None until data is loaded.
model = BaseModel()

In [15]:
# Load, preprocess and feature-engineer the data, using the default root
# directory 'dataset/' (resolved relative to the current working directory).
model.gen_dataset()

Loading data...	

FileNotFoundError: [Errno 2] File b'dataset/train.csv' does not exist: b'dataset/train.csv'

**注意：本例子中的所有随机种子都采用随机生成，即`int(np.random.rand()*100)`，这是为了避免大家直接裸跑这份代码、产生和我一样的提交结果，从而被主办方检测为作弊封号。为了保证结果可复现，建议大家替换成一个自己设定的整数（别用2019，因为我跑这份代码的时候用的就是这个；当然，如果你加入了自己的特征工程、不是直接裸跑代码，则不受此限）。**  
**强烈建议大家不要直接裸跑代码，没有意义。最好加入一些自己的特征工程，避免和别人跑出一样的分被封号。**

In [16]:
# LightGBM parameters, grouped by purpose.
params = {
    # Task and evaluation metric
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'boost': 'gbdt',
    # Learning rate and tree shape
    'learning_rate': 0.01,
    'max_depth': 8,
    'num_leaves': 32,
    'min_data_in_leaf': 15,
    # Row / column subsampling; both seeds are drawn at random from [0, 100),
    # so replace them with fixed integers for reproducible runs.
    'bagging_freq': 1,
    'bagging_fraction': 0.85,
    'bagging_seed': int(np.random.rand()*100),
    'feature_fraction': 0.85,
    'feature_fraction_seed': int(np.random.rand()*100),
    # Regularisation
    'lambda_l1': 0.5,
    'lambda_l2': 1.2,
    # Runtime
    'num_threads': 4,
    'verbosity': 1,
}

In [25]:
# Train the 'LGB' model. The seed is drawn at random from [0, 100), so replace it
# with a fixed integer for reproducible runs. Returns the out-of-fold predictions,
# the test-set predictions and the feature-importance table.
oof, pred, feat_importance = model.model_train('LGB', params, seed=int(np.random.rand()*100))

AttributeError: 'NoneType' object has no attribute 'copy'

In [26]:
# Offline CV score: 0.90713; online (leaderboard) score: 0.923501

In [27]:
# Write the test-set predictions to 'submit.csv' (gen_submit's default file name).
model.gen_submit(pred)

NameError: name 'pred' is not defined

In [28]:
# One bar per feature; the rows are per-fold importances, sorted so that the
# largest values are listed first.
fig, ax = plt.subplots(figsize=(16, 30))
ranked_importance = feat_importance.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="Feature", data=ranked_importance, ax=ax)
ax.set_title('LightGBM Features (avg over folds)')
fig.tight_layout()

NameError: name 'feat_importance' is not defined

<Figure size 1152x2160 with 0 Axes>