In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#jsonモジュールのインポート
import json
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Shared English Snowball stemmer, reused by the tokenising helper below.
snowball = SnowballStemmer('english')

In [None]:
def create_wordlist_from_json(dataInd,fileId):
    """Read one publication JSON file and return its cleaned, stemmed tokens.

    The file is expected to hold a sequence of dict-like sections; the 'text'
    value of every section is concatenated, cleaned of escape residue and
    punctuation, tokenised with NLTK, lower-cased, filtered against the
    English stop-word list (plus a few punctuation/digit tokens) and stemmed
    with the module-level ``snowball`` stemmer.

    Args:
        dataInd: Sub-directory of the competition input folder that is joined
            into the path (the notebook passes "train" or "test").
        fileId: File name without the ".json" extension.

    Returns:
        list of str: stemmed tokens in document order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    filename = "/kaggle/input/coleridgeinitiative-show-us-the-data/" + dataInd + "/" + fileId + ".json"

    # Context manager closes the handle even when json.load raises.
    with open(filename, mode='r', encoding='utf-8') as fd:
        data = json.load(fd)

    # Every section contributes ' ' + text; a missing/None text counts as empty
    # instead of raising TypeError. join() avoids quadratic concatenation.
    json_text = ''.join(' ' + (section.get('text') or '') for section in data)

    # Blank out literal backslash sequences (backslash followed by n, f, b, t).
    # A literal backslash-u is first marked with '!!!' so that the marker plus
    # the four alphanumerics after it can be removed together below.
    json_text = json_text.replace('\\n',' ').replace('\\f',' ').replace('\\u','!!!').replace('\\b',' ').replace('\\t',' ').replace('\\',' ')
    json_text = re.sub('!{3}[A-Za-z0-9]{4}',' ',json_text)
    # Replace every character that is neither a word character nor whitespace.
    # (The previous pattern had the raw-string prefix inside the quotes, so it
    # only matched the letter "r" followed by a punctuation character.)
    json_text = re.sub(r'[^\w\s]', ' ', json_text)

    textWordlist = nltk.word_tokenize(json_text)

    # Variant without stop-word filtering (kept for reference):
    #return TaggedDocument(words=textWordlist, tags=[fileId])

    # Variant with stop-word filtering; a set gives O(1) membership tests.
    stopWords = set(stopwords.words('english')) | {
        '"','{', '}', '[', ']', '(',')',  ',', ':', '``', "''", ';', '.',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '%'}

    wordlist = []
    for word in textWordlist:
        lowered = word.lower()
        if lowered not in stopWords:
            wordlist.append(snowball.stem(lowered))
    return wordlist

In [None]:
# SWEM-max document embedding
def get_doc_swem_max_vector(words, model):
    """Return the element-wise maximum of the word vectors of ``words``.

    Args:
        words: Sequence of tokens of one document.
        model: Mapping from token to vector that raises ``KeyError`` for
            unknown tokens. Its dimensionality is taken from a
            ``vector_size`` attribute when present, otherwise from
            ``len(model[0])``.

    Returns:
        numpy.ndarray of shape ``(vector_size,)``. Tokens missing from
        ``model`` contribute an all-zero vector to the maximum. An empty
        ``words`` yields an all-zero vector instead of raising.
    """
    vector_size = getattr(model, 'vector_size', None)
    if vector_size is None:
        # Fallback for plain mappings without the attribute.
        vector_size = len(model[0])

    # np.max over a zero-row array raises ValueError, so handle it up front.
    if len(words) == 0:
        return np.zeros(vector_size)

    doc_vector = np.zeros((len(words), vector_size))
    for i, word in enumerate(words):
        try:
            word_vector = model[word]
        except KeyError:
            # Out-of-vocabulary token: its row stays all zeros.
            continue
        doc_vector[i, :] = word_vector

    return np.max(doc_vector, axis=0)

In [None]:
# Competition inputs: the labelled training table and the submission template
# (the two reads are independent of each other).
train_df = pd.read_csv(
    "/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv"
)
sample_submission_df = pd.read_csv(
    "/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv"
)

In [None]:
# Fix the labels: derive a single cleaned_label per dataset_title.
# Count rows per (dataset_title, dataset_label, cleaned_label) and order the
# label variants of every title from most to least frequent.
temp_df = train_df.drop('pub_title',axis=1).groupby(by=["dataset_title","dataset_label","cleaned_label"]).count().reset_index()
temp_df = temp_df.sort_values(['dataset_title', 'Id'], ascending=[True, False])

# Collect plain records and build the frame once: DataFrame.append in a loop
# is quadratic and no longer exists in pandas 2.x.
fix_label_records = []
# groupby (instead of a string-built query) also copes with titles that
# contain quote characters; sort=False keeps the order established above.
for dataset_title, title_rows in temp_df.groupby('dataset_title', sort=False):
    variants = title_rows['cleaned_label'].values
    # The most frequent variant is the base label.
    cleaned_label_fix = variants[0]
    if len(variants) > 1:
        # Extend the base with the words of the second most frequent variant
        # that it does not already contain.
        fix_words = cleaned_label_fix.split()
        add_words = [word for word in variants[1].split() if word not in fix_words]
        # Only append when something new exists, so no trailing space is added.
        if add_words:
            cleaned_label_fix = cleaned_label_fix + ' ' + ' '.join(add_words)
    fix_label_records.append({'dataset_title': dataset_title, 'cleaned_label_fix': cleaned_label_fix})

fix_label_df = pd.DataFrame(fix_label_records, columns=['dataset_title','cleaned_label_fix'])

# Swap the original cleaned_label for the per-title fixed label.
new_train_df = pd.merge(train_df, fix_label_df, on='dataset_title')
new_train_df = new_train_df.drop('cleaned_label', axis=1).rename(columns={'cleaned_label_fix': 'cleaned_label'})

In [None]:
# Pre-trained word vectors, read from the text (non-binary) word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/fasttext-pretrainedvectors-english-text/cc.en.300.vec',
    binary=False,
)

In [None]:
# One row per publication Id, ordered by Id.
distinct_train_df = new_train_df.drop_duplicates(subset=["Id"]).sort_values('Id')

# Token lists of all documents: the training publications first ...
training_docs = [
    create_wordlist_from_json("train", train_id)
    for train_id in distinct_train_df["Id"]
]

# ... followed by the test publications listed in the sample submission.
training_docs.extend(
    create_wordlist_from_json("test", test_id)
    for test_id in sample_submission_df["Id"]
)

In [None]:
# Embed every document with SWEM-max. training_docs holds the training
# documents first and the test documents after them, so the first n_train
# entries fill X and the remainder fills X_submit.
n_train = len(distinct_train_df)
# Use the vector dimensionality exposed by KeyedVectors instead of fetching a
# vector through an integer key, which not every gensim release supports.
embedding_dim = model.vector_size

X = np.zeros((n_train, embedding_dim))
X_submit = np.zeros((len(training_docs) - n_train, embedding_dim))

for i, doc in enumerate(training_docs):
    if i < n_train:
        X[i, :] = get_doc_swem_max_vector(doc, model)
    else:
        X_submit[i - n_train, :] = get_doc_swem_max_vector(doc, model)

In [None]:
# Build the label matrix: one indicator column per distinct cleaned_label.
work_df = pd.get_dummies(new_train_df['cleaned_label'])
label_list = work_df.columns.tolist()

# Collapse to one row per Id; taking the max keeps an indicator set when any
# row of that Id carries the label. Rows end up ordered by Id.
label_df = (
    pd.concat([new_train_df[['Id']], work_df], axis=1)
    .groupby(by=['Id'])
    .max()
    .sort_values('Id')
)

In [None]:
# Submission frame: the Ids of the sample submission with an empty
# PredictionString column that the per-label loop fills in.
my_submission = sample_submission_df[['Id']].assign(PredictionString='')

In [None]:
# Train one binary LightGBM classifier per label and append every predicted
# label to the submission as '|<label>'.

# LightGBM hyper-parameters; identical for every label, so built once.
params = {
    # Binary classification
    'objective': 'binary',
    # Evaluate (and early-stop) on AUC
    'metric': 'auc',
    # Only fatal messages from LightGBM itself
    'verbosity': -1,
}

OVERSAMPLE_RATIO = 0.5      # target minority/majority ratio after oversampling
DECISION_THRESHOLD = 0.5    # a label is emitted when its probability exceeds this

# The keyword arguments verbose_eval / early_stopping_rounds are not accepted
# by newer LightGBM releases; callbacks work across versions. Older releases
# name the logging callback print_evaluation.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# X rows and label_df rows are both ordered by Id; they must line up.
assert len(label_df) == X.shape[0], "feature rows and label rows differ"

# Start from a clean column so that re-running this cell does not append the
# same labels a second time.
my_submission['PredictionString'] = ''

for label in label_list:
    print(label)
    y = label_df[label].to_numpy().astype(int)

    # Oversampling.
    # RandomOverSampler raises ValueError when the requested ratio would not
    # add any minority samples, so only resample when it actually would.
    class_counts = np.bincount(y, minlength=2)
    minority_count, majority_count = class_counts.min(), class_counts.max()
    if int(majority_count * OVERSAMPLE_RATIO - minority_count) > 0:
        ros = RandomOverSampler(sampling_strategy=OVERSAMPLE_RATIO, random_state=71)
        X_res, y_res = ros.fit_resample(X, y)
    else:
        X_res, y_res = X, y

    # Split into training and validation data.
    # NOTE(review): the split happens after oversampling, so duplicated
    # minority rows can land in both parts and inflate the validation AUC.
    # Splitting first would change the trained models, so it is left as is.
    train_X, val_X, train_y, val_y = train_test_split(X_res, y_res, test_size = 0.3, random_state=71)

    # Build the LightGBM datasets.
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train)

    # Train with the parameters above.
    lgbModel = lgb.train(dict(params), lgb_train, valid_sets=[lgb_eval],
                         num_boost_round=1000,  # maximum number of iterations
                         callbacks=[
                             lgb.early_stopping(stopping_rounds=100),
                             _log_evaluation(period=50),  # log every 50 iterations
                         ])

    # Score the validation split and report its AUC.
    y_pred = lgbModel.predict(val_X, num_iteration=lgbModel.best_iteration)

    fpr, tpr, thresholds = metrics.roc_curve(val_y, y_pred)
    auc = metrics.auc(fpr, tpr)
    print(auc)

    # Predict the submission documents and append the label where predicted.
    predicted = lgbModel.predict(X_submit, num_iteration=lgbModel.best_iteration)
    label_suffix = pd.Series(
        ['|' + label if p > DECISION_THRESHOLD else '' for p in predicted],
        index=my_submission.index,
    )
    my_submission['PredictionString'] = my_submission['PredictionString'] + label_suffix

In [None]:
# Every predicted label was appended as '|<label>', so non-empty strings start
# with a separator. lstrip removes it and, unlike slicing off the first
# character, leaves the column unchanged when this cell is run again.
my_submission['PredictionString'] = my_submission['PredictionString'].str.lstrip('|')
# Any filename could be used; 'submission.csv' is chosen here.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission.
my_submission.head(n=5)