In [None]:
import warnings
warnings.filterwarnings('ignore')
import re, jieba, requests, json, time, random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from xgboost import XGBClassifier 

# from shutil import copyfile 
#from googletrans import Translator
#copyfile(src = '../input/law-text/useful_tools.py', dst = '../working/useful_tools.py')
#from useful_tools import * 

In [None]:
# Read TestSet.csv, decoding it with the GB18030 codec.
test_set = pd.read_csv('../input/law-text/TestSet.csv', encoding='gb18030')

# Coerce every entry to text, then remove ideographic spaces (U+3000) and line
# breaks and trim whatever whitespace is left at either end.
test_set['content'] = (
    test_set['content']
    .astype(str)
    .map(lambda text: text.replace('\u3000', '')
                          .replace('\n', '')
                          .replace('\r', '')
                          .strip())
)
test_set

In [None]:
train = pd.read_csv('../input/law-text/train.csv')

# Collapse compound / annotated class names into one canonical class each.
# Only the 'class' column is written. Assigning through a bare boolean mask
# (train[mask] = value) overwrites EVERY column of the matching rows, which
# would replace the 'content' text of those rows with the class name.
# NOTE(review): train_augment.csv was presumably generated from this frame
# before this fix - regenerate it if the relabelled rows matter.
class_merges = {
    '使用者要求/运营者要求': '使用者要求',
    '运营者要求/使用者要求': '运营者要求',
    '使用者要求（运营者）': '使用者要求',
    '职责区分/违规处理': '职责区分',
}
for old_name, new_name in class_merges.items():
    train.loc[train['class'] == old_name, 'class'] = new_name

# Drop rows with a missing value in any column.
train = train.dropna()

# Coerce to text, remove ideographic spaces (U+3000) and line breaks, and trim.
train['content'] = train['content'].astype(str)
train['content'] = train['content'].apply(lambda x: x.replace('\u3000', '')
                                                     .replace('\n', '')
                                                     .replace('\r', '')
                                                     .strip())

# Encode the class names as integer codes; `le` keeps the code <-> name mapping.
le = LabelEncoder()
y1 = le.fit_transform(train['class'])
train['label'] = y1
print('train has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
train.head()

In [None]:
# Hold out 380 rows of `train` as a labelled pseudo test set, with a fixed seed.
# NOTE(review): `weights=y1` uses the integer label codes as sampling weights,
# so rows whose code is 0 get zero weight and rows with larger codes are drawn
# more often - confirm this is intended rather than an attempt at stratifying.
test = train.sample(n=380, weights=y1, random_state=786)
# Put the sampled rows back into their original row order.
test.sort_index(inplace=True)
#test.to_csv('test_sampled.csv',index=False,encoding='utf_8_sig')
#print(len(test))
#train_new = train.append(test)
#print('train has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
#train_new.drop_duplicates(subset=['content'],keep=False,inplace=True)
#train_new # training is based on 507 samples

In [None]:
# The commented-out lines below are the disabled augmentation step. They would
# pass every `content` value through reverse_trans_google (a helper that is not
# defined in the visible part of this notebook) - once with its default target
# and once each with dest='en' and dest='de' - append the copies to train_new
# and save the result as train_augment.csv. Only the saved CSV is loaded here.
#train_fr = train_new.copy()
#train_en = train_new.copy()
#train_de = train_new.copy()
#start = time.time()
#train_fr['content'] = train_fr['content'].astype(str)
#train_fr['content'] = train_fr['content'].apply(lambda x: reverse_trans_google(x))
#train_en['content'] = train_en['content'].apply(lambda x: reverse_trans_google(x,dest='en'))
#train_de['content'] = train_de['content'].apply(lambda x: reverse_trans_google(x,dest='de'))
#print('This program costs {:.2f} seconds'.format(time.time()-start))
#train_augment = train_new.append(train_fr)
#train_augment = train_augment.append(train_en)
#train_augment = train_augment.append(train_de)
#print('train has {} rows and {} columns'.format(train_augment.shape[0], train_augment.shape[1]))
#train_augment.to_csv('train_augment.csv',index=False,encoding='utf_8_sig')
# Load the previously saved augmented training data and preview its first rows.
train_augment = pd.read_csv('../input/law-text/train_augment.csv')
train_augment.head()

In [None]:
def get_vectorize(wordlist, vector='CountVectorizer', feats=150):
    '''Build the feature matrix X from whitespace-separated token strings.

    Parameters
    ----------
    wordlist : iterable of str
        One string per document, tokens separated by spaces.
    vector : str
        Which scikit-learn vectorizer to use: 'CountVectorizer',
        'TfidfVectorizer' or 'HashingVectorizer'.
    feats : int
        ``max_features`` for the count / tf-idf vectorizers, ``n_features``
        for the hashing vectorizer.

    Returns
    -------
    pandas.DataFrame
        One row per document. Columns are the vocabulary terms for the count
        and tf-idf vectorizers, and plain integer positions for the hashing
        vectorizer (it keeps no vocabulary).

    Raises
    ------
    ValueError
        If ``vector`` is not one of the three supported names. The previous
        behaviour was to fall through and return None.
    '''
    # Tokens are runs of CJK ideographs, ASCII letters, digits or underscores;
    # unlike scikit-learn's default pattern this keeps single-character tokens.
    token_pattern = '[\u4e00-\u9fa5_a-zA-Z0-9]{1,}'

    if vector == 'CountVectorizer':
        cv = CountVectorizer(max_features=feats, token_pattern=token_pattern)
    elif vector == 'TfidfVectorizer':
        cv = TfidfVectorizer(max_features=feats, token_pattern=token_pattern)
    elif vector == 'HashingVectorizer':
        cv = HashingVectorizer(n_features=feats)
    else:
        raise ValueError(
            "unknown vector {!r}; expected 'CountVectorizer', "
            "'TfidfVectorizer' or 'HashingVectorizer'".format(vector))

    cv_fit = cv.fit_transform(wordlist).toarray()

    if vector == 'HashingVectorizer':
        # No vocabulary to name the columns with.
        return pd.DataFrame(cv_fit)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cv, 'get_feature_names_out'):
        colnames = cv.get_feature_names_out()
    else:
        colnames = cv.get_feature_names()
    return pd.DataFrame(cv_fit, columns=colnames)
    
# Load the stop-word list, one word per line, skipping blank lines. The `with`
# block closes the file handle, which the previous one-liner left open.
with open('../input/english-and-chinese-stopwords/stopwords.txt', encoding='utf8') as stopword_file:
    stopword_list = [line.strip() for line in stopword_file if line.strip() != '']

def get_cutword(string):
    '''Segment text with jieba after replacing every digit with a space.

    Empty tokens and tokens found in the module-level ``stopword_list`` are
    dropped; the remaining tokens are returned joined by single spaces.
    '''
    without_digits = re.sub("[0-9]", " ", string)  # regex-replace the digits
    kept_tokens = []
    for token in jieba.cut(without_digits):
        if token == '' or token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Fit ONE vocabulary on the training text and reuse it for every other split.
# Fitting a separate vectorizer per split (as before) lets each split pick its
# own top-100 terms, so a term in the training vocabulary that does not make
# the test split's top 100 ends up as an all-zero column even where it occurs.
cutword_series = train_augment['content'].apply(get_cutword)  # a pandas Series
vectorizer = CountVectorizer(max_features=100,
                             token_pattern='[\u4e00-\u9fa5_a-zA-Z0-9]{1,}')
train_counts = vectorizer.fit_transform(cutword_series).toarray()
# get_feature_names() was removed from newer scikit-learn releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
x_train = pd.DataFrame(train_counts, columns=feature_names)

cutword_series = test['content'].apply(get_cutword)  # a pandas Series
x_test = pd.DataFrame(vectorizer.transform(cutword_series).toarray(),
                      columns=feature_names)

# NOTE: this rebinds `test_set` from the raw text frame to its feature matrix,
# so this cell cannot be re-run without first re-running the loading cell.
cutword_series = test_set['content'].apply(get_cutword)  # a pandas Series
test_set = pd.DataFrame(vectorizer.transform(cutword_series).toarray(),
                        columns=feature_names)

In [None]:
# Display the feature matrix built for the pseudo test set.
x_test

In [None]:
# Targets: the 'label' column of the augmented training data and of the
# sampled pseudo test set.
y_train = train_augment['label']
y_test = test['label']

# The string literal below is not executed as code; it holds a disabled XGBoost
# experiment. `cross_print_info` is not defined in the visible part of this
# notebook.
'''
model = XGBClassifier(n_estimators = 250 ,
                      max_depth = 6, 
                      learning_rate = 0.2,
                      min_child_weight = 10, 
                      colsample_bytree = 0.7, 
                      subsample = 0.8)
cross_print_info(model, x_train, y_train, cv = 5)
'''

In [None]:
def consist_train_test(test, train_col):
    '''Give the test features the same columns, in the same order, as training.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature matrix to align. It is not modified.
    train_col : sequence or pandas.Index
        Column labels of the training feature matrix, in training order.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns in ``train_col``, in that order,
        and the same rows as ``test``. Columns missing from ``test`` are
        filled with 0, columns not in ``train_col`` are dropped, and NaN
        values are replaced by 0.
    '''
    # reindex keeps every row of `test` even when the first (or every) training
    # column is absent. The previous column-by-column build started from an
    # empty frame, where assigning the scalar 0 creates a column with no rows.
    aligned = test.reindex(columns=train_col, fill_value=0)
    return aligned.fillna(0)

# Align both feature matrices to the training columns. Both names are rebound
# to the aligned frames.
x_test = consist_train_test(x_test,x_train.columns)
test_set = consist_train_test(test_set,x_train.columns)

In [None]:

#for i, t in enumerate(label_cols):
#    print(t)
#    y = train_df.loc[:, [t]].values.reshape(-1)
def run_crossval(x_train, y_train, x_test, x_submit=None):
    '''Train one XGBoost model per stratified fold and collect its predictions.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; fold rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Training labels, positionally aligned with ``x_train``.
    x_test : DataFrame-like
        Features of the labelled hold-out set, predicted by every fold model.
    x_submit : DataFrame-like, optional
        Features of the unlabelled set to predict for the submission. When
        omitted, the module-level ``test_set`` is used, which is what this
        function always read before this parameter existed. Passing it
        explicitly avoids depending on whatever ``test_set`` is currently
        bound to.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(test_pred, test_pred_set)``: the per-fold predictions for
        ``x_test`` and for ``x_submit``, one row per sample and one column
        per fold.
    '''
    if x_submit is None:
        # Backward-compatible fallback to the notebook-level global.
        x_submit = test_set

    # NOTE(review): if x_train holds several augmented copies of the same
    # sentence, copies can land in different folds and inflate the validation
    # accuracy - consider a grouped split.
    folds = StratifiedKFold(n_splits=5)
    val_score = []
    test_pred = []
    test_pred_set = []
    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
        print(f'Training fold {fold + 1}')
        x_trn, x_val = x_train.iloc[trn_idx], x_train.iloc[val_idx]
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

        # NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled
        # XGBoost build - confirm against the installed version.
        model = XGBClassifier(n_estimators=250,
                              max_depth=6,
                              learning_rate=0.2,
                              min_child_weight=10,
                              colsample_bytree=0.7,
                              subsample=0.8,
                              random_state=233,
                              tree_method='gpu_hist')
        model.fit(x_trn, y_trn)

        # Accuracy on the rows the model was fitted on and on the held-out fold.
        pred_train = model.predict(x_trn)
        acc_train = accuracy_score(y_trn, pred_train)
        pred_val = model.predict(x_val)
        acc_val = accuracy_score(y_val, pred_val)
        val_score.append(acc_val)

        # Every fold model votes on both the hold-out set and the submission set.
        test_pred.append(model.predict(x_test))
        test_pred_set.append(model.predict(x_submit))

        print('val accuracy:{:.5f}, train accuracy:{:.5f}'.format(acc_val, acc_train))
    print('-' * 50)
    # The printed figure is the mean of the per-fold validation accuracies.
    print('OOF val accuracy:{:.5f}'.format(np.array(val_score).mean()))
    # Transpose so that rows are samples and columns are folds.
    return pd.DataFrame(np.array(test_pred).T), pd.DataFrame(np.array(test_pred_set).T)

In [None]:
# Run the 5-fold training; each returned frame has one column of predictions per fold.
test_pred, test_pred_set = run_crossval(x_train, y_train, x_test)

In [None]:
# Display the per-fold predictions for the pseudo test set.
test_pred

In [None]:
# Majority vote across the fold models: take the row-wise mode of the per-fold
# predictions. Column 0 of the mode frame is used, so on a tie the vote goes to
# whichever mode pandas lists first (presumably the smallest label code - confirm).
voted_test_pred = test_pred.mode(axis=1)[0].astype(int)
voted_test_pred

# Accuracy of the voted predictions against the pseudo test labels.
acc_test = accuracy_score(y_test, voted_test_pred)
print('pseudo test set accuracy:{:.5f}'.format(acc_test))

In [None]:
# Display the per-fold predictions for the submission set.
test_pred_set

In [None]:
# Majority vote across the fold models for the submission set. This rebinds
# `voted_test_pred`, which previously held the pseudo-test vote.
voted_test_pred = test_pred_set.mode(axis=1)[0].astype(int)
voted_test_pred

In [None]:
# Class names listed by integer label code (position in the list == code).
label_names = ['使用者要求', '名词解释', '服务监督', '法规倡议',
               '法规目的', '职责区分', '运营者要求', '违规处理']
recode = dict(enumerate(label_names))

# Translate the voted integer codes back into class names.
y_pred = pd.Series(voted_test_pred).map(recode)

In [None]:
# Re-read the raw test file: `test_set` was rebound to a feature matrix earlier,
# and the original 'content' text is needed again for the submission.
test_set = pd.read_csv('../input/law-text/TestSet.csv',encoding='gb18030')

In [None]:
# Pair each test text with its predicted class name, write the result to
# pred_xgb.csv without the index column, and display it.
submission = pd.DataFrame({'content': test_set['content'], 'class': y_pred})
submission.to_csv('pred_xgb.csv', index = False)
submission