In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
!apt install opencl-headers -y
!apt install -y ocl-icd-opencl-dev

In [None]:
# Build and install GPU-enabled LightGBM from the cloned sources:
#   1. recreate an empty `LightGBM/build` directory,
#   2. configure with CMake using -DUSE_GPU=1 (`../../LightGBM` resolves to the
#      repository root when evaluated from inside `build`),
#   3. compile with 4 parallel make jobs,
#   4. install the Python package with `--precompile --gpu`
#      (presumably reusing the library produced by step 3 — confirm against the
#      LightGBM installation guide).
# Every step is chained with `&&`, so a failure stops the remaining steps.
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

In [None]:
# Locations of the competition files: the training set and the two test sets.
data_set, test_set_a, test_set_b = (
    '/kaggle/input/nlp-news-text/train_set.csv',
    '/kaggle/input/nlp-news-text/test_a.csv',
    '/kaggle/input/nlp-news-text/test_b.csv',
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the tab-separated training set and test set A.
train_df = pd.read_csv(data_set, sep='\t')
test_df = pd.read_csv(test_set_a, sep='\t')

# Store each document as whitespace-normalised text (tokens joined by a single
# space). The previous `str(x.split())` stored the Python repr of the token list
# (e.g. "['12', '34']"), so brackets, quotes and commas became part of the
# "text" and were only discarded by accident of TfidfVectorizer's default token
# pattern. That pattern extracts the same tokens from both forms, so the TF-IDF
# features computed from this column are unchanged.
train_df['text_split'] = train_df['text'].apply(lambda text: ' '.join(text.split()))
test_df['text_split'] = test_df['text'].apply(lambda text: ' '.join(text.split()))

In [None]:
# Turn the `text_split` column into TF-IDF features.
# Alternative values noted by the original author are kept next to each setting.
tfidf_params = dict(
    analyzer='word',
    ngram_range=(1, 2),   # alternative: (1,3)
    min_df=3,             # alternatives: 4, 5
    max_df=0.9,           # alternatives: 0.95, 1.0
    use_idf=True,
    max_features=3000,
    smooth_idf=True,
    sublinear_tf=True,
)
word_vec = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are fitted on the training documents only and
# then applied unchanged to the test documents.
train_term_doc = word_vec.fit_transform(train_df['text_split'])
test_term_doc = word_vec.transform(test_df['text_split'])

In [None]:
# Macro F1 score used for offline (local) validation
from sklearn.metrics import f1_score
#[1,2,3,2,1,3]
#[1,2,3,1,1,3]
def cal_macro_f1(y_true,y_pred):
    """Return the macro-averaged F1 score of `y_pred` measured against `y_true`.

    Thin wrapper around `f1_score(..., average='macro')`.
    """
    return f1_score(y_true, y_pred, average='macro')

# Hold out 20% of the TF-IDF rows (shuffled with a fixed seed) for evaluation.
# Note that the cross-validation loop later in this notebook assigns to these
# same four names on every fold.
X_train, X_eval, y_train, y_eval = train_test_split(
    train_term_doc,
    train_df['label'],
    test_size=0.2,
    shuffle=True,
    random_state=2019,
)

In [None]:
# Cross-validation setup: the fold splitter plus the buffers filled by the
# training loop.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=666)

# Width of the probability buffers, i.e. the number of probability columns
# stored per document.
n_classes = 14

# Out-of-fold class probabilities: one row per training document.
train_matrix = np.zeros((train_df.shape[0], n_classes))

# Test-set class probabilities of every fold, stacked along axis 0.
# One layer per fold: the buffer used to have 10 layers for only 5 folds, so
# averaging over axis 0 mixed in all-zero layers and scaled the averaged
# probabilities down.
test_pre_matrix = np.zeros((kf.get_n_splits(), test_df.shape[0], n_classes))

# Validation macro-F1 of each fold.
cv_scores = []

In [None]:
!mkdir model
import pickle
def dump_obj(data,name):
    """Pickle `data` into the file ``model/<name>`` (relative to the working dir).

    The ``model`` directory is created when it does not exist yet, so the
    function no longer fails with FileNotFoundError if the directory is missing
    (for example after the working directory was cleaned).

    Args:
        data: any picklable object, e.g. a fitted model.
        name: file name inside the ``model`` directory.
    """
    os.makedirs("model", exist_ok=True)
    with open("model/"+name,'wb') as f:
        pickle.dump(data,f)
def load(name):
    """Read back and return the object pickled in ``model/<name>``.

    Only use this on files written by this notebook: unpickling executes
    whatever the file instructs, so it is unsafe for untrusted files.
    """
    path = "model/" + name
    with open(path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [None]:
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
# K-fold cross-validation: for every fold, train one GPU LightGBM classifier,
# store its out-of-fold probabilities in `train_matrix`, its test-set
# probabilities in `test_pre_matrix[i]`, and its validation macro-F1 in
# `cv_scores`.
#
# Every fold is always run. The loop used to `break` as soon as a fold scored
# at least 0.0005 better than the worst fold so far, which left the rows of the
# remaining folds in `train_matrix` at zero and thereby corrupted the overall
# out-of-fold score computed from that matrix.
n_classes = train_matrix.shape[1]  # number of probability columns allocated for the buffers
for i,(train_index,eval_index) in enumerate(kf.split(train_term_doc)):
    print("第%d轮训练"%i)
    print(len(train_index),len(eval_index))

    print("加载训练集和验证集")
    # Training part of the fold. KFold yields positional indices, so the labels
    # are selected with .iloc rather than by index label.
    X_train = train_term_doc[train_index]
    y_train = train_df['label'].iloc[train_index]

    # Validation part of the fold.
    X_eval = train_term_doc[eval_index]
    y_eval = train_df['label'].iloc[eval_index]

    print("训练lgbm分类器")
    model = lgb.LGBMClassifier(
        device_type='gpu',
        boosting_type='gbdt',
        num_leaves=2**5,
        max_depth=6,
        learning_rate=0.1,
        n_estimators=500,       # upper bound on boosting rounds; early stopping may end sooner
        objective='multiclass',
        num_class=n_classes,    # was 19, which contradicted the 14-column probability buffers
        # NOTE(review): LGBMClassifier's subsample_freq defaults to 0, in which
        # case LightGBM performs no bagging and `subsample` has no effect; set
        # subsample_freq > 0 if row subsampling is actually intended.
        subsample=0.7,
        colsample_bytree=0.5,
        reg_lambda=10,          # L2 regularisation
        # One thread setting only: n_jobs=16 and its alias num_threads=4 used to
        # be passed together, which is contradictory.
        n_jobs=4,
        min_child_weight=1.5,
        metric='multi_logloss',
        random_state=2019,
        # `colsample_bylevel` was dropped: it is not a LightGBM parameter.
        verbose=-1,             # quiet library logging (replaces the deprecated `silent=True`)
    )
    # Early stopping on the validation fold, expressed as a callback, which is
    # the form supported by both LightGBM 3.x and 4.x.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_eval, y_eval)],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )
    dump_obj(model,'lgbm'+str(i))

    # Out-of-fold prediction for the validation rows of this fold.
    eval_prob = model.predict_proba(X_eval)
    train_matrix[eval_index] = eval_prob

    # NOTE(review): argmax yields a column index; this assumes the labels are the
    # integers 0..n_classes-1 so that column index == label — confirm.
    eval_pred = np.argmax(eval_prob,axis=1)
    score = cal_macro_f1(y_eval,eval_pred)
    cv_scores.append(score)
    print("validation score is",score)

    # Test-set prediction of this fold's model.
    test_prob = model.predict_proba(test_term_doc)
    test_pre_matrix[i,:,:] = test_prob

In [None]:
# Overall out-of-fold score: take the most probable class in every row of
# `train_matrix` and compare it with the true training labels.
all_pred = train_matrix.argmax(axis=1)
score = cal_macro_f1(y_true=train_df['label'], y_pred=all_pred)
print("all validation score is",score)

In [None]:
# Average the per-fold test probabilities, pick the most probable class for
# every test row, and write the predicted labels as a one-column CSV file.
test_pred = np.argmax(test_pre_matrix.mean(axis=0), axis=1)
test_df['label'] = test_pred
test_df['label'].to_csv(
    "submission_tfidf_lightGBM_10fold.csv",
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# Delete the cloned LightGBM source tree now that the package is installed
# (presumably to keep it out of the saved working-directory output).
!rm -rf LightGBM