In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Import the json module
import json
import matplotlib.pyplot as plt
import requests
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import SnowballStemmer
import string
import re

In [None]:
# Module-level English Snowball stemmer, built once and shared by the
# document-preparation code below.
snowball = SnowballStemmer('english')

In [None]:
def create_taggedDocument_from_json(dataInd, fileId,
                                    base_dir="/kaggle/input/coleridgeinitiative-show-us-the-data"):
    """Load one publication JSON file and convert it into a ``TaggedDocument``.

    The file ``<base_dir>/<dataInd>/<fileId>.json`` is expected to hold a list
    of section objects, each offering its body under the ``'text'`` key. All
    section texts are concatenated, normalised (punctuation removed,
    lower-cased, non-alphanumerics collapsed to single spaces), stripped of
    gensim's stopwords, tokenised with NLTK and stemmed with the module-level
    ``snowball`` stemmer.

    Args:
        dataInd: Sub-directory holding the file (the notebook uses "train"
            and "test").
        fileId: File name without the ``.json`` extension; also used as the
            single tag of the returned document.
        base_dir: Root directory of the dataset. Defaults to the Kaggle input
            path the notebook was written for.

    Returns:
        TaggedDocument whose ``words`` are the stemmed tokens and whose
        ``tags`` is ``[fileId]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    filename = base_dir + "/" + dataInd + "/" + fileId + ".json"

    # The context manager closes the handle even when parsing fails, and the
    # explicit encoding avoids depending on the platform's locale default.
    with open(filename, mode='r', encoding='utf-8') as fd:
        data = json.load(fd)

    # Join once instead of concatenating in a loop; a section with a missing
    # or null 'text' contributes nothing rather than raising TypeError.
    json_text = ' '.join(section.get('text') or '' for section in data)

    # Punctuation is deleted outright (so "data-set" becomes "dataset")
    # before any remaining non-alphanumeric run is collapsed to one space.
    json_text = json_text.translate(str.maketrans('', '', string.punctuation))
    json_text = re.sub('[^A-Za-z0-9]+', ' ', json_text.lower()).strip()

    # Stopwords are removed here, before tokenising and stemming.
    json_text = remove_stopwords(json_text)

    textWordlist = nltk.word_tokenize(json_text)
    wordlist = [snowball.stem(word) for word in textWordlist]
    return TaggedDocument(words=wordlist, tags=[fileId])

In [None]:
# Competition inputs: the labelled training rows, and the submission template
# whose "Id" column lists the test documents.
train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")
sample_submission_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv")

In [None]:
# Keep one row per document Id so each training file is parsed only once.
distinct_train_df = train_df.drop_duplicates(subset=["Id"])

# Corpus fed to Doc2Vec: every training document first ...
training_docs = [
    create_taggedDocument_from_json("train", doc_id)
    for doc_id in distinct_train_df["Id"]
]

# ... followed by every test document, so the test Ids get vectors as well.
training_docs.extend(
    create_taggedDocument_from_json("test", doc_id)
    for doc_id in sample_submission_df["Id"]
)

In [None]:
def countWords(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

In [None]:
# Train the Doc2Vec model (every hyper-parameter below can be tuned).
# NOTE(review): with dm=0 and dbow_words=0 the `window` value may have no
# effect on training - confirm against the gensim documentation.
model = Doc2Vec(
    documents=training_docs,  # training corpus: list of TaggedDocument
    # --- architecture ---
    dm=0,                     # DBOW training (gensim's default dm=1 is DM)
    dbow_words=0,
    vector_size=250,
    # window = word count of the longest cleaned label in the training data
    window=train_df['cleaned_label'].apply(countWords).max(),
    # --- vocabulary / sampling ---
    min_count=5,              # only words occurring at least 5 times are used
    sample=0.001,
    negative=5,
    ns_exponent=0.75,
    # --- optimisation schedule ---
    alpha=0.0025,
    min_alpha=0.000001,
    epochs=50,
)

In [None]:
# For every test document, look up its most similar document vector and
# collect the cleaned labels that train_df records for that neighbour.
#
# NOTE(review): the test documents were themselves added to the Doc2Vec
# corpus, so the single nearest neighbour may be another test document that
# has no rows in train_df; such a test Id ends up with an empty prediction.
# Raising `topn` would widen the search - confirm the intended behaviour.
match_records = []
for test_id in sample_submission_df["Id"]:
    for neighbour_id, _similarity in model.dv.most_similar(test_id, topn=1):
        # A boolean mask avoids building a query string by hand, which would
        # break on Ids containing quote characters.
        neighbour_labels = train_df.loc[train_df["Id"] == neighbour_id, "cleaned_label"]
        match_records.extend(
            {"test_Id": test_id, "cleaned_label": label}
            for label in neighbour_labels
        )

if match_records:
    # Build the frame once from the collected records (DataFrame.append was
    # removed in pandas 2.0 and growing a frame inside a loop is quadratic).
    label_counts = (
        pd.DataFrame(match_records)
        .groupby(["test_Id", "cleaned_label"])
        .size()
        .reset_index(name="label_count")
        # Most frequent label first; ties keep the alphabetical label order
        # produced by the groupby above.
        .sort_values(["test_Id", "label_count"], ascending=[True, False])
    )
    # One row per test document: its labels joined with '|'.
    predict_df = (
        label_counts.groupby("test_Id")["cleaned_label"]
        .agg("|".join)
        .reset_index()
    )
else:
    # No neighbour matched any training row: keep the expected columns so
    # the merge below still works and yields empty predictions.
    predict_df = pd.DataFrame(columns=["test_Id", "cleaned_label"])

# Left merge keeps every submission row; documents without a prediction get
# an empty string instead of NaN.
submit_df = pd.merge(sample_submission_df, predict_df, how='left', left_on='Id', right_on='test_Id')
submit_df = submit_df.fillna("")
submit_df.head()

In [None]:
# Keep only the two columns the submission needs, renaming the predicted
# labels to the required "PredictionString" header.
my_submission = submit_df[['Id', 'cleaned_label']].rename(
    columns={'cleaned_label': 'PredictionString'}
)
# Any filename would work; 'submission.csv' is the one chosen here.
my_submission.to_csv('submission.csv', index=False)