In [None]:
# Bootstrap a pre-packaged RAPIDS 0.16.0 conda environment from the attached input dataset.
import sys
# Copy the archive into the conda envs directory and unpack it there (tar's file listing is discarded).
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so they are searched first on import.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into /opt/conda/lib/ — presumably so the
# xgboost import can locate the shared library; TODO confirm.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import warnings
warnings.filterwarnings('ignore')
import re, jieba, requests, json, time, random
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from xgboost import XGBClassifier 
#import cudf
from cuml.ensemble import RandomForestClassifier
#from cuml.svm import SVC
#from cuml.naive_bayes import MultinomialNB

from sklearn.naive_bayes import BernoulliNB
#from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC

from shutil import copyfile 
#!pip install googletrans
#from googletrans import Translator
#copyfile(src = '../input/law-text/useful_tools.py', dst = '../working/useful_tools.py')
#from useful_tools import * 

In [None]:
# Load the competition test set (GB18030-encoded CSV).
test_set = pd.read_csv('../input/law-text/TestSet.csv', encoding='gb18030')

# Force every entry to str, then drop full-width spaces and line breaks and
# trim surrounding whitespace.
raw_test_content = test_set['content'].astype(str)
test_set['content'] = raw_test_content.map(
    lambda text: text.replace('\u3000', '').replace('\n', '').replace('\r', '').strip()
)
test_set

In [None]:
train = pd.read_csv('../input/law-text/train.csv')

# Collapse compound / variant class names onto one canonical class.
# Only the 'class' column is rewritten: the earlier `train[mask] = value` form
# assigned the label string to EVERY column of the matching rows, which
# replaced their 'content' text with the class name.
class_aliases = {
    '使用者要求/运营者要求': '使用者要求',
    '运营者要求/使用者要求': '运营者要求',
    '使用者要求（运营者）': '使用者要求',
    '职责区分/违规处理': '职责区分',
}
train['class'] = train['class'].replace(class_aliases)

# Discard rows with any missing value before cleaning and encoding.
train.dropna(inplace=True)

# Force str, then drop full-width spaces and line breaks and trim whitespace.
train['content'] = train['content'].astype(str).map(
    lambda text: text.replace('\u3000', '').replace('\n', '').replace('\r', '').strip()
)

# Integer-encode the class names; `le` is kept so codes can be mapped back.
le = LabelEncoder()
y1 = le.fit_transform(train['class'])
train['label'] = y1
print('train has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
train.head()

In [None]:
# Draw a fixed-seed pseudo test set of 380 rows from `train`, restored to
# original row order.
# NOTE(review): weights=y1 uses the encoded label integers as sampling weights,
# so rows whose label is 0 can never be drawn and higher codes are favoured —
# confirm this is intended rather than a stratified sample.
test = train.sample(n=380, weights=y1, random_state=786).sort_index()

In [None]:
# Load the augmented training set and report its dimensions.
train_augment = pd.read_csv('../input/law-text/train_augment.csv')
print('train_augment has {} rows and {} columns'.format(*train_augment.shape))
train_augment.head()

In [None]:
def get_vectorize(wordlist, vector='CountVectorizer', feats=150):
    '''Build a document-term feature matrix from space-separated token strings.

    A new vectorizer is fitted on `wordlist` on every call.

    Parameters
    ----------
    wordlist : iterable of str
        Documents to vectorize.
    vector : str
        One of 'CountVectorizer', 'TfidfVectorizer' or 'HashingVectorizer'.
    feats : int
        Passed as `max_features` (count / tf-idf) or `n_features` (hashing).

    Returns
    -------
    pandas.DataFrame
        One row per document with a fresh default index. Columns are the
        vocabulary terms for count / tf-idf, and integer positions for hashing.

    Raises
    ------
    ValueError
        If `vector` is not one of the supported names (previously the
        function fell through and silently returned None).
    '''
    # Tokens are runs of CJK ideographs, ASCII letters, digits or underscores,
    # so single-character words are kept.
    token_pattern = '[\u4e00-\u9fa5_a-zA-Z0-9]{1,}'

    if vector == 'CountVectorizer':
        cv = CountVectorizer(max_features=feats, token_pattern=token_pattern)
    elif vector == 'TfidfVectorizer':
        cv = TfidfVectorizer(max_features=feats, token_pattern=token_pattern)
    elif vector == 'HashingVectorizer':
        cv = HashingVectorizer(n_features=feats)
    else:
        raise ValueError('unsupported vector type: {!r}'.format(vector))

    cv_fit = cv.fit_transform(wordlist).toarray()

    # Hashing has no vocabulary, so its columns stay positional.
    if vector == 'HashingVectorizer':
        return pd.DataFrame(cv_fit)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when present and fall back on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        colnames = list(cv.get_feature_names_out())
    else:
        colnames = cv.get_feature_names()
    return pd.DataFrame(cv_fit, columns=colnames)
    
# Load the stop-word list: one entry per line, whitespace-trimmed, blank lines
# skipped. The context manager closes the file handle, which the previous bare
# open(...).readlines() left open.
with open('../input/english-and-chinese-stopwords/stopwords.txt', encoding='utf8') as stopword_file:
    stopword_list = [line.strip() for line in stopword_file if line.strip() != '']

def get_cutword(string):
    '''Segment text with jieba after replacing every ASCII digit with a space.

    Empty tokens and tokens found in the module-level `stopword_list` are
    dropped; the remaining tokens are returned joined by single spaces.
    '''
    digits_blanked = re.sub("[0-9]", " ", string)
    kept_tokens = []
    for token in jieba.cut(digits_blanked):
        if token == '' or token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Tokenise each corpus and turn it into a 100-feature count matrix.
# NOTE(review): get_vectorize fits a fresh vectorizer on every call, so the
# three matrices below have independently chosen vocabularies; consider
# fitting once on the training text and only transforming the other two.

# Augmented training text -> training features.
cutword_series = train_augment['content'].map(get_cutword)  # pandas Series of token strings
x_train = get_vectorize(cutword_series, feats=100)

# Pseudo test text -> pseudo test features.
cutword_series = test['content'].map(get_cutword)
x_test = get_vectorize(cutword_series, feats=100)

# Real test text -> features; this rebinds `test_set` from the raw frame to
# its feature matrix.
cutword_series = test_set['content'].map(get_cutword)
test_set = get_vectorize(cutword_series, feats=100)

In [None]:
# Show the vectorized pseudo-test feature matrix (bare expression -> rich notebook display).
x_test

In [None]:
# Training targets: the 'label' column shipped with the augmented training CSV
# (presumably the same integer coding the LabelEncoder produces — TODO confirm).
y_train = train_augment['label']
# Pseudo-test targets: the LabelEncoder codes stored in train['label'], carried over by the sample.
y_test = test['label']

In [None]:
def consist_train_test(test, train_col):
    '''Return `test` restricted to, and ordered like, the training columns.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature matrix to align.
    train_col : iterable of column labels
        Training columns, in the desired output order.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns in `train_col`, in that order,
        and the same rows and index as `test`. Columns missing from `test`
        are filled with 0, extra columns in `test` are dropped, and any NaN
        is replaced by 0.
    '''
    # reindex does the select / reorder / zero-fill in one step. It also keeps
    # the row count when `test` shares no column with `train_col`, where the
    # previous column-by-column build on an empty frame yielded zero rows.
    aligned = test.reindex(columns=list(train_col), fill_value=0)
    return aligned.fillna(0)

# Give both evaluation matrices the training feature layout.
train_feature_columns = x_train.columns
x_test = consist_train_test(x_test, train_feature_columns)
test_set = consist_train_test(test_set, train_feature_columns)

In [None]:
def evaluating(test_pred):
    '''Print the accuracy of the row-wise majority vote against `y_test`.

    `test_pred` holds one column of predictions per model; the first row-wise
    mode is taken as the voted prediction and compared with the module-level
    `y_test`.
    '''
    majority = test_pred.mode(axis=1)[0].astype(int)
    print('pseudo test set accuracy:{:.5f}'.format(accuracy_score(y_test, majority)))

def vote(test_pred, as_labels=False):
    '''Majority-vote across the per-model prediction columns.

    Parameters
    ----------
    test_pred : pandas.DataFrame
        One column of predicted class codes per model, one row per sample.
    as_labels : bool, default False
        When False, return the voted integer codes (the original behaviour).
        When True, return the class names the codes stand for.

    Returns
    -------
    pandas.Series
        The first row-wise mode of `test_pred`, as int codes or class names.
    '''
    voted_test_pred = test_pred.mode(axis=1)[0].astype(int)
    if not as_labels:
        return voted_test_pred

    # Code -> class-name table. The earlier version built this mapping and the
    # mapped series but then discarded both; it is now reachable on request.
    recode = {0: '使用者要求', 1: '名词解释', 2: '服务监督', 3: '法规倡议',
              4: '法规目的', 5: '职责区分', 6: '运营者要求', 7: '违规处理'}
    return voted_test_pred.map(recode)

In [None]:

#for i, t in enumerate(label_cols):
#    print(t)
#    y = train_df.loc[:, [t]].values.reshape(-1)
def run_crossval_xgb(x_train, y_train, x_test):
    '''5-fold stratified cross-validation with a GPU-histogram XGBoost classifier.

    A new model is trained per fold. Each fold model predicts on `x_test` and
    on the module-level `test_set` feature matrix. Per-fold train / validation
    accuracy and the mean validation accuracy are printed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for `x_test` and for `test_set`, one column per fold.
    '''
    x_train = x_train.astype('float32')
    y_train = y_train.astype('float32')
    x_test = x_test.astype('float32')

    # Same settings for every fold; a fresh estimator is built inside the loop.
    xgb_params = dict(
        n_estimators=250,
        max_depth=6,
        learning_rate=0.2,
        min_child_weight=10,
        colsample_bytree=0.7,
        subsample=0.8,
        random_state=233,
        eval_metric='merror',
        tree_method='gpu_hist',
    )

    folds = StratifiedKFold(n_splits=5)
    val_score, test_pred, test_pred_set = [], [], []

    print('start training xgboost ... ')
    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
        print(f'Training fold {fold + 1}')
        x_trn, y_trn = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**xgb_params)
        model.fit(x_trn, y_trn)

        acc_train = accuracy_score(y_trn, model.predict(x_trn))
        acc_val = accuracy_score(y_val, model.predict(x_val))
        val_score.append(acc_val)

        # Out-of-sample predictions: pseudo test set, then the real test set.
        test_pred.append(model.predict(x_test))
        test_pred_set.append(model.predict(test_set))

        print('val accuracy:{:.5f}, train accuracy:{:.5f}'.format(acc_val, acc_train))

    print('-' * 50)
    print('OOF val accuracy:{:.5f}'.format(np.array(val_score).mean()))

    # Transpose so rows are samples and columns are folds.
    pseudo_frame = pd.DataFrame(np.array(test_pred).T)
    submit_frame = pd.DataFrame(np.array(test_pred_set).T)
    return pseudo_frame, submit_frame


In [None]:
def run_crossval_rf(x_train, y_train, x_test):
    '''5-fold stratified cross-validation with a random-forest classifier.

    A new model is trained per fold. Each fold model predicts on `x_test` and
    on the module-level `test_set` feature matrix. Per-fold train / validation
    accuracy and the mean validation accuracy are printed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for `x_test` and for `test_set`, one column per fold.
    '''
    x_train = x_train.astype('float32')
    y_train = y_train.astype('float32')
    x_test = x_test.astype('float32')

    # Same settings for every fold; a fresh estimator is built inside the loop.
    rf_params = dict(
        n_estimators=250,
        rows_sample=0.7,
        max_depth=8,
        max_features=75,
    )

    folds = StratifiedKFold(n_splits=5)
    val_score, test_pred, test_pred_set = [], [], []

    print('start training random forests ... ')
    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
        print(f'Training fold {fold + 1}')
        x_trn, y_trn = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model = RandomForestClassifier(**rf_params)
        model.fit(x_trn, y_trn)

        acc_train = accuracy_score(y_trn, model.predict(x_trn))
        acc_val = accuracy_score(y_val, model.predict(x_val))
        val_score.append(acc_val)

        # Out-of-sample predictions: pseudo test set, then the real test set.
        test_pred.append(model.predict(x_test))
        test_pred_set.append(model.predict(test_set))

        print('val accuracy:{:.5f}, train accuracy:{:.5f}'.format(acc_val, acc_train))

    print('-' * 50)
    print('OOF val accuracy:{:.5f}'.format(np.array(val_score).mean()))

    # Transpose so rows are samples and columns are folds.
    pseudo_frame = pd.DataFrame(np.array(test_pred).T)
    submit_frame = pd.DataFrame(np.array(test_pred_set).T)
    return pseudo_frame, submit_frame


In [None]:
def run_crossval_svm(x_train, y_train, x_test):
    '''5-fold stratified cross-validation with an SVC (gamma='auto').

    A new model is trained per fold. Each fold model predicts on `x_test` and
    on the module-level `test_set` feature matrix. Per-fold train / validation
    accuracy and the mean validation accuracy are printed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for `x_test` and for `test_set`, one column per fold.
    '''
    x_train = x_train.astype('float32')
    y_train = y_train.astype('float32')
    x_test = x_test.astype('float32')

    folds = StratifiedKFold(n_splits=5)
    val_score, test_pred, test_pred_set = [], [], []

    print('start training svm ... ')
    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
        print(f'Training fold {fold + 1}')
        x_trn, y_trn = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model = SVC(gamma='auto')
        model.fit(x_trn, y_trn)

        acc_train = accuracy_score(y_trn, model.predict(x_trn))
        acc_val = accuracy_score(y_val, model.predict(x_val))
        val_score.append(acc_val)

        # Out-of-sample predictions: pseudo test set, then the real test set.
        test_pred.append(model.predict(x_test))
        test_pred_set.append(model.predict(test_set))

        print('val accuracy:{:.5f}, train accuracy:{:.5f}'.format(acc_val, acc_train))

    print('-' * 50)
    print('OOF val accuracy:{:.5f}'.format(np.array(val_score).mean()))

    # Transpose so rows are samples and columns are folds.
    pseudo_frame = pd.DataFrame(np.array(test_pred).T)
    submit_frame = pd.DataFrame(np.array(test_pred_set).T)
    return pseudo_frame, submit_frame

In [None]:
def run_crossval_nb(x_train, y_train, x_test):
    '''5-fold stratified cross-validation with a Bernoulli naive Bayes classifier.

    A new model is trained per fold. Each fold model predicts on `x_test` and
    on the module-level `test_set` feature matrix. Per-fold train / validation
    accuracy and the mean validation accuracy are printed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Predictions for `x_test` and for `test_set`, one column per fold.
    '''
    x_train = x_train.astype('float32')
    y_train = y_train.astype('float32')
    x_test = x_test.astype('float32')

    folds = StratifiedKFold(n_splits=5)
    val_score, test_pred, test_pred_set = [], [], []

    print('start training naive bayes ...')
    for fold, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
        print(f'Training fold {fold + 1}')
        x_trn, y_trn = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model = BernoulliNB()
        model.fit(x_trn, y_trn)

        acc_train = accuracy_score(y_trn, model.predict(x_trn))
        acc_val = accuracy_score(y_val, model.predict(x_val))
        val_score.append(acc_val)

        # Out-of-sample predictions: pseudo test set, then the real test set.
        test_pred.append(model.predict(x_test))
        test_pred_set.append(model.predict(test_set))

        print('val accuracy:{:.5f}, train accuracy:{:.5f}'.format(acc_val, acc_train))

    print('-' * 50)
    print('OOF val accuracy:{:.5f}'.format(np.array(val_score).mean()))

    # Transpose so rows are samples and columns are folds.
    pseudo_frame = pd.DataFrame(np.array(test_pred).T)
    submit_frame = pd.DataFrame(np.array(test_pred_set).T)
    return pseudo_frame, submit_frame

In [None]:
# Train each model family in turn: print its voted accuracy on the pseudo test
# set and keep its voted predictions for the real test set.
voted_by_model = {}
for model_tag, crossval_runner in (
    ('xgb', run_crossval_xgb),
    ('rf', run_crossval_rf),
    ('svm', run_crossval_svm),
    ('nb', run_crossval_nb),
):
    test_pred, test_pred_set = crossval_runner(x_train, y_train, x_test)
    evaluating(test_pred)
    voted_by_model[model_tag] = vote(test_pred_set)

# Expose the per-model results under the names the submission cell uses.
test_pred_xgb = voted_by_model['xgb']
test_pred_rf = voted_by_model['rf']
test_pred_svm = voted_by_model['svm']
test_pred_nb = voted_by_model['nb']

In [None]:
# Reload the raw test CSV: the name `test_set` was rebound to the vectorized
# feature matrix earlier in the notebook, and get_submission reads the
# original 'content' column from it.
test_set = pd.read_csv('../input/law-text/TestSet.csv',encoding='gb18030')

def get_submission(y_pred, name):
    '''Write a submission CSV called `<name>.csv` in the working directory.

    The file has two columns: 'content', taken from the module-level
    `test_set`, and 'class', taken from `y_pred`. The index is not written.
    '''
    submission_columns = {'content': test_set['content'], 'class': y_pred}
    pd.DataFrame(submission_columns).to_csv(name + '.csv', index = False)

# One submission file per model family.
for voted_predictions, file_stem in (
    (test_pred_xgb, 'xgb_pred'),
    (test_pred_rf, 'rf_pred'),
    (test_pred_svm, 'svm_pred'),
    (test_pred_nb, 'naive_bayes_pred'),
):
    get_submission(voted_predictions, file_stem)