## Doc2Vec & LightGBM

### Import modules

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Import the json module
import json
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm
from collections import Counter
import re

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import random
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors

### Create a TaggedDocument from a JSON file

In [None]:
def create_taggedDocument_from_json(dataInd,fileId):
    """Build a gensim ``TaggedDocument`` from one publication JSON file.

    The file is read from
    ``/kaggle/input/coleridgeinitiative-show-us-the-data/<dataInd>/<fileId>.json``.
    It is iterated as a sequence of section objects; the ``text`` value of
    every section is concatenated, cleaned, tokenized with NLTK, filtered
    against the English stop-word list and stemmed with the module-level
    ``snowball`` stemmer.

    Args:
        dataInd: Sub-directory to read from (callers pass "train" or "test").
        fileId: File name without the ``.json`` extension; also used as the
            document tag.

    Returns:
        A ``TaggedDocument`` whose ``words`` are the stemmed, lower-cased
        tokens and whose ``tags`` is ``[fileId]``.
    """
    filename = "/kaggle/input/coleridgeinitiative-show-us-the-data/" + dataInd + "/" + fileId + ".json"

    # The context manager closes the file even when JSON parsing fails.
    with open(filename, mode='r', encoding='utf-8') as fd:
        data = json.load(fd)

    # Each section is prefixed with a space, as before. A section whose
    # "text" is missing or None contributes an empty string instead of
    # raising TypeError.
    json_text = ''.join(' ' + (section.get('text') or '') for section in data)

    # Neutralise escape sequences that survived as literal backslash text.
    # "\u" is first turned into the "!!!" marker so that the marker plus its
    # four hex digits can be removed in one regex pass.
    json_text = json_text.replace('\\n',' ').replace('\\f',' ').replace('\\u','!!!').replace('\\b',' ').replace('\\t',' ').replace('\\',' ')
    json_text = re.sub('!{3}[A-Za-z0-9]{4}',' ',json_text)
    # Replace every punctuation character with a space. The raw-string prefix
    # belongs outside the quotes; inside them the pattern matched a literal
    # "r" followed by a punctuation character.
    json_text = re.sub(r'[^\w\s]',' ',json_text)

    textWordlist = nltk.word_tokenize(json_text)

    # English stop words plus punctuation tokens, kept in a set for O(1)
    # membership tests.
    stopWords = set(stopwords.words('english'))
    stopWords.update(['"','{', '}', '[', ']', '(',')',  ',', ':', '``', "''", ';', '.'])

    # Lower-case each token once, drop stop words, stem the remainder.
    lowered_tokens = (token.lower() for token in textWordlist)
    wordlist = [snowball.stem(token) for token in lowered_tokens if token not in stopWords]
    return TaggedDocument(words=wordlist, tags=[fileId])

### Read the CSV files

In [None]:
# Load the training table and the sample submission (the latter supplies the
# Ids of the test documents).
train_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv",
        "/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv",
    )
)

### Train Doc2Vec Model

In [None]:
# Shared English Snowball stemmer; create_taggedDocument_from_json reads this
# module-level name when stemming tokens.
snowball = SnowballStemmer('english')

In [None]:
# Keep a single row per document Id so each training file is read only once.
distinct_train_df = train_df.drop_duplicates(subset=["Id"])

# Doc2Vec corpus: every training document first, then every test document.
training_docs = [
    create_taggedDocument_from_json("train", doc_id)
    for doc_id in distinct_train_df["Id"]
]
training_docs.extend(
    create_taggedDocument_from_json("test", doc_id)
    for doc_id in sample_submission_df["Id"]
)

# Train the model (the hyper-parameters below can be tuned).
#   documents   : training corpus (list of TaggedDocument)
#   min_count=1 : keep every word that occurs at least once
#   dm=1        : distributed-memory (DM) training; dm=0 would select DBOW
model = Doc2Vec(
    documents=training_docs,
    vector_size=200,
    alpha=0.0025,
    min_alpha=0.000001,
    window=15,
    min_count=1,
    dm=1,
)

### Extract document vector

In [None]:
# Extract the Doc2Vec document vectors to use as classifier features.
def _docvec_frame(vectors, ids):
    """Return a DataFrame with one row per unique id and one column per vector dimension.

    ``vectors`` is indexed by document tag. Rows keep first-seen id order and
    the index is named ``Id``. The matrix is stacked in a single call rather
    than inserted column by column, which fragments the frame and is slow for
    thousands of documents.
    """
    # A dict keeps first-seen order and collapses repeated ids, matching the
    # effect of assigning the same column twice.
    by_id = {doc_id: vectors[doc_id] for doc_id in ids}
    index = pd.Index(list(by_id), name='Id')
    if not by_id:
        return pd.DataFrame(index=index)
    return pd.DataFrame(np.vstack(list(by_id.values())), index=index)


# gensim >= 4 exposes the document vectors as `dv`; older releases only
# provide `docvecs`.
_doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

# Training features: ordered by Id so the rows line up positionally with
# label_df (also ordered by Id); the Id column itself is then dropped.
train_docvecs_df = _docvec_frame(_doc_vectors, distinct_train_df["Id"])
train_docvecs_df = train_docvecs_df.reset_index()
train_docvecs_df = train_docvecs_df.sort_values('Id')
train_docvecs_df = train_docvecs_df.drop("Id", axis=1)

# Submission features: keep the Id column, in sample-submission order.
submit_docvecs_df = _docvec_frame(_doc_vectors, sample_submission_df["Id"])
submit_docvecs_df = submit_docvecs_df.reset_index()

### Create label

In [None]:
# Build the multi-hot label matrix: one row per document Id, one 0/1 column
# per distinct cleaned_label.
work_df = pd.get_dummies(train_df['cleaned_label'], dtype=int)
label_list = list(work_df.columns)
label_df = pd.concat([train_df[['Id']], work_df], axis=1)
# max() rather than sum(): if an Id carries the same cleaned_label on more
# than one row, the indicator must stay 1 (a count of 2+ is not a valid
# binary target).
label_df = label_df.groupby(by=['Id']).max()
# Order by Id so the rows line up with the Id-sorted feature matrix.
label_df = label_df.sort_index()

### Initialize the data for submission

In [None]:
# Submission skeleton: one row per test document Id with an (initially empty)
# prediction string that the training loop appends to.
my_submission = submit_docvecs_df[['Id']].copy()
my_submission['PredictionString'] = ''

### Train LGB Model & Predict

In [None]:
# Train one binary LightGBM classifier per label and append every positively
# predicted label to the submission as "|<label>".

# Loop-invariant inputs, built once instead of on every iteration.
train_features = train_docvecs_df.values
submit_features = submit_docvecs_df.drop('Id', axis=1).values

# LightGBM hyper-parameters
params = {
    # binary classification
    'objective': 'binary',
    # evaluate with AUC
    'metric': 'auc',
    # only print fatal messages
    'verbosity': -1,
}

for label in label_list:
    print(label)

    # Force a strict 0/1 target even if label_df holds counts or booleans.
    target = (label_df[label].values > 0).astype(int)

    # Oversample the minority class.
    # NOTE(review): resampling happens before the train/validation split, so
    # duplicated rows can land on both sides and the validation AUC (and the
    # early-stopping point) is likely optimistic. Splitting first would need
    # extra handling for labels with very few positives - confirm before
    # changing.
    ros = RandomOverSampler(random_state=71)
    X_res, y_res = ros.fit_resample(train_features, target)

    # Split into training and validation data.
    train_X, val_X, train_y, val_y = train_test_split(X_res, y_res, test_size = 0.3, random_state=71)

    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train)

    # Train with the parameters above. The result is bound to `booster` so
    # the Doc2Vec `model` is not overwritten.
    # NOTE(review): newer LightGBM releases reportedly replace
    # `verbose_eval` / `early_stopping_rounds` with callbacks - confirm
    # against the installed version.
    booster = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                        verbose_eval=50,  # log every 50 iterations
                        num_boost_round=1000,  # maximum number of iterations
                        early_stopping_rounds=100
                       )

    # Score the validation split.
    y_pred = booster.predict(val_X, num_iteration=booster.best_iteration)
    fpr, tpr, thresholds = metrics.roc_curve(val_y, y_pred)
    auc = metrics.auc(fpr, tpr)
    print(auc)

    # Predict the submission documents and append "|<label>" where the
    # rounded probability is positive.
    predicted = np.round(booster.predict(submit_features, num_iteration=booster.best_iteration))
    predicted_list = ['|' + label if flag > 0 else '' for flag in predicted]
    my_submission['PredictionString'] = my_submission['PredictionString'] + pd.Series(
        predicted_list, index=my_submission.index, dtype=object
    )

### Submit

In [None]:
# Each appended label was prefixed with '|', so dropping the first character
# removes the leading separator (empty strings stay empty). Any output
# filename would work; 'submission.csv' is used here.
prediction_strings = my_submission['PredictionString']
my_submission['PredictionString'] = prediction_strings.str.slice(start=1)
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first five rows of the submission.
my_submission.head(n=5)