## literal matching

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Import the json module
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import gc

In [None]:
def create_text_from_json(dataInd, fileId,
                          base_dir="/kaggle/input/coleridgeinitiative-show-us-the-data"):
    """Load one publication JSON file and return its section texts as one string.

    Args:
        dataInd: Sub-directory of ``base_dir`` that holds the file (this
            notebook passes "train" or "test").
        fileId: File name without the ``.json`` extension.
        base_dir: Root directory of the dataset. Defaults to the Kaggle input
            path used throughout this notebook.

    Returns:
        The ``text`` value of every section, each prefixed with a single
        space, concatenated in file order. A section without a ``text``
        value contributes an empty string.
    """
    filename = f"{base_dir}/{dataInd}/{fileId}.json"

    # The context manager closes the file even when json.load raises.
    with open(filename, mode='r', encoding='utf-8') as fd:
        data = json.load(fd)

    # str.join avoids the quadratic cost of repeated concatenation;
    # `or ''` keeps a missing/None text from raising TypeError.
    return ''.join(' ' + (sections.get('text') or '') for sections in data)

In [None]:
def text_cleaning(text):
    """Normalise text to lowercase ASCII letters and digits separated by single spaces.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The lowercased text in which every run of characters outside
        ``[A-Za-z0-9]`` is replaced by one space, with leading and trailing
        whitespace stripped.
    """
    # One regex pass is sufficient: every character in string.punctuation is
    # already outside [A-Za-z0-9], so the former per-character Python loop
    # produced the same result while being slow on whole documents.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and collapse every non-alphanumeric run into one space.

    The value is converted with ``str()`` first; the result has no leading
    or trailing whitespace.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load the sample submission and the training labels from the competition
# input directory.
sample_submission_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv")
train_df = pd.read_csv("/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv")

In [None]:
# Load the full text of the publication behind every row of the training data.
training_text = [create_text_from_json("train", Id) for Id in train_df["Id"]]

train_df['text'] = training_text

In [None]:
# Normalise the training texts (lowercase, alphanumerics separated by spaces).
train_df['text'] = train_df['text'].apply(text_cleaning)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Load the full text of every publication listed in the sample submission
# (read from the "test" directory).
submit_text = [create_text_from_json("test", Id) for Id in sample_submission_df["Id"]]

sample_submission_df['text'] = submit_text

In [None]:
# Preview the first rows of the submission frame.
sample_submission_df.head()

In [None]:
# Every known way a dataset is referred to in the training labels, lowercased.
existing_labels = set()
for label_column in ('dataset_label', 'dataset_title', 'cleaned_label'):
    existing_labels.update(x.lower() for x in train_df[label_column].unique())

id_list = []
lables_list = []
for index, row in sample_submission_df.iterrows():
    sample_text = row['text']
    row_id = row['Id']
    # Lowercase the document once instead of once per candidate label.
    lowered_text = sample_text.lower()
    # Labels of training rows whose cleaned text is identical to this document.
    temp_df = train_df[train_df['text'] == text_cleaning(sample_text)]
    cleaned_labels = {clean_text(x) for x in temp_df['cleaned_label']}
    # Labels whose literal string occurs anywhere in the document.
    cleaned_labels.update(
        clean_text(known_label)
        for known_label in existing_labels
        if known_label in lowered_text
    )
    # Sort so the prediction string is deterministic across runs
    # (iteration order of a set of strings is not).
    lables_list.append('|'.join(sorted(cleaned_labels)))
    id_list.append(row_id)

In [None]:
# Build the submission frame holding the literal-matching predictions.
my_submission = pd.DataFrame({'Id': id_list, 'PredictionString1': lables_list})

In [None]:
# Preview the literal-matching predictions.
my_submission.head()

In [None]:
# Drop the intermediate objects of the literal-matching stage and run a
# garbage-collection pass.
del training_text, submit_text, id_list, lables_list, sample_text
gc.collect()

## simple transformers

In [None]:
!pip install '../input/simpletransformers0272/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '../input/tokenizers-070/tokenizers-0.7.0-cp37-cp37m-manylinux1_x86_64.whl' -q
!pip install '../input/simpletransformers-0323-pypi/transformers-2.11.0-py3-none-any.whl' -q
!pip install '../input/simpletransformers-0323-pypi/simpletransformers-0.32.3-py3-none-any.whl' -q

In [None]:
from simpletransformers.classification import MultiLabelClassificationModel
import logging

In [None]:
# Logging configuration: INFO for the root logger, WARNING for the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# One-hot encode the cleaned labels; only the resulting column names (the
# distinct labels) are kept as the list of target classes.
work_df = pd.get_dummies(train_df['cleaned_label'])
label_list = work_df.columns.tolist()

In [None]:
# The one-hot frame was only needed for its column names.
del work_df
gc.collect()

In [None]:
def create_textdf_from_json(dataInd, fileId,
                            base_dir="/kaggle/input/coleridgeinitiative-show-us-the-data"):
    """Load one publication JSON file into a frame with one row per section.

    Args:
        dataInd: Sub-directory of ``base_dir`` that holds the file (this
            notebook passes "train" or "test").
        fileId: File name without the ``.json`` extension.
        base_dir: Root directory of the dataset. Defaults to the Kaggle input
            path used throughout this notebook.

    Returns:
        A ``pandas.DataFrame`` with columns ``Id`` (always ``fileId``) and
        ``text`` (the section's ``text`` value, or an empty string when the
        section has none), in file order.
    """
    filename = f"{base_dir}/{dataInd}/{fileId}.json"

    # The context manager closes the file even when json.load raises.
    with open(filename, mode='r', encoding='utf-8') as fd:
        data = json.load(fd)

    # Missing/None texts become '' so later string cleaning never sees None.
    text_list = [sections.get('text') or '' for sections in data]
    return pd.DataFrame(
        {'Id': [fileId] * len(text_list), 'text': text_list},
        columns=['Id', 'text'],
    )

In [None]:
# One row per distinct publication Id found in the training labels.
distinct_train_df = train_df[['Id']].drop_duplicates(subset=["Id"])

In [None]:
# Collect the per-section frames of every training publication and
# concatenate them once. DataFrame.append copied the whole frame on every
# iteration and no longer exists in pandas 2.0.
section_frames = [
    create_textdf_from_json('train', fileId)
    for fileId in distinct_train_df['Id']
]
if section_frames:
    text_df = pd.concat(section_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected empty schema instead.
    text_df = pd.DataFrame(columns=['Id', 'text'])
del section_frames

In [None]:
# Attach the section texts to each Id. No `on` is given, so pandas joins on
# the columns the two frames share.
distinct_train_df = pd.merge(distinct_train_df, text_df)

In [None]:
# Normalise every section text (lowercase, alphanumerics separated by spaces).
distinct_train_df['text'] = distinct_train_df['text'].apply(text_cleaning)

In [None]:
# Multi-hot targets: one 0/1 column per label, set to 1 when the label string
# occurs inside the section text.
work_label_df = pd.DataFrame()
for label in label_list:
    match_list = [
        int(label in section_text)
        for section_text in distinct_train_df['text']
    ]
    work_label_df[label] = match_list

# Store each row's flags as a single list in the 'label' column.
distinct_train_df['label'] = work_label_df.values.tolist()

In [None]:
# Number of labels matched in each section. The column holds one list of 0/1
# flags per row, so the count has to be taken per row: Series.sum() would add
# (i.e. concatenate) the lists of all rows into one long list instead.
distinct_train_df['match_count'] = distinct_train_df['label'].apply(sum)
# Keep only the sections that mention at least one label.
distinct_train_df = distinct_train_df.query('match_count > 0')

In [None]:
# Drop the intermediates of the label-building stage and run a
# garbage-collection pass.
del match_list, work_label_df, text_df
gc.collect()

In [None]:
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
# Distinct words that occur in any known label (after cleaning).
stopword_list = list(set(clean_text(' '.join(existing_labels)).split()))

In [None]:
def text_preprocessing(json_text):
    """Strip stop words, known label strings and label words from a text.

    Args:
        json_text: Text to process; the notebook passes text that already
            went through ``text_cleaning``.

    Returns:
        The remaining words joined by single spaces (an empty string when
        nothing is left).
    """
    json_text = remove_stopwords(json_text)

    # str.replace returns a new string; the result was previously discarded,
    # so no label was ever removed.
    for label in label_list:
        json_text = json_text.replace(label, '')

    # Remove label words as whole tokens only, so that unrelated words that
    # merely contain one of them as a substring are left intact.
    label_words = set(stopword_list)
    kept_words = [word for word in json_text.split() if word not in label_words]

    return ' '.join(kept_words)

In [None]:
# Apply text_preprocessing to every training section.
distinct_train_df['text'] = distinct_train_df['text'].apply(text_preprocessing)

In [None]:
# Drop sections whose text is empty after preprocessing.
distinct_train_df = distinct_train_df.query('text != ""')

In [None]:
# Display the prepared training frame.
distinct_train_df

In [None]:
# match_count was only needed for filtering; remove it.
distinct_train_df = distinct_train_df.drop('match_count', axis=1)
gc.collect()

In [None]:
# Per-section frames for every publication in the sample submission. These
# Ids are read from the "test" directory (as the literal-matching stage
# does), not from "train". The frames are concatenated once because
# DataFrame.append copied the frame per iteration and is gone in pandas 2.0.
submission_frames = [
    create_textdf_from_json('test', fileId)
    for fileId in sample_submission_df['Id']
]
if submission_frames:
    sub_text_df = pd.concat(submission_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected empty schema instead.
    sub_text_df = pd.DataFrame(columns=['Id', 'text'])
del submission_frames

In [None]:
# Run the same two-step normalisation used for the training sections:
# text_cleaning first, then text_preprocessing.
sub_text_df['text'] = (
    sub_text_df['text']
    .apply(text_cleaning)
    .apply(text_preprocessing)
)

In [None]:
# Drop sections whose text is empty after preprocessing.
sub_text_df = sub_text_df.query('text != ""')

In [None]:
# text_preprocessing reads stopword_list, so it cannot be called after this
# point.
del stopword_list
del existing_labels
gc.collect()

In [None]:
def get_tail_label(df):
    """
    Return the names of the tail (under-represented) label columns.

    For every column the number of rows equal to 1 is counted; a column's
    imbalance ratio is the largest such count divided by its own count. A
    column is a tail label when its ratio exceeds the mean ratio.

    args
    df: pandas.DataFrame, target label df whose tail labels are wanted

    return
    tail_label: list, column names of all tail labels
    """
    names = df.columns
    positives = np.zeros(len(names))
    for position, name in enumerate(names):
        positives[position] = df[name].value_counts()[1]
    imbalance = max(positives) / positives
    threshold = np.average(imbalance)
    return [name for name, ratio in zip(names, imbalance) if ratio > threshold]

def get_index(df):
    """
    Return the index values of all rows that carry at least one tail label.

    args
    df: pandas.DataFrame, target label df to search for tail-label rows

    return
    index: list, index values of every row where a tail-label column is 1
    """
    rows = set()
    for name in get_tail_label(df):
        has_label = df[name] == 1
        rows = rows.union(set(df.index[has_label]))
    return list(rows)

def get_minority_instace(X, y):
    """
    Select the rows of X and y that carry at least one tail label.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the selected feature rows, index reset
    y_sub: pandas.DataFrame, the selected target rows, index reset
    """
    tail_rows = get_index(y)
    keep_X = X.index.isin(tail_rows)
    keep_y = y.index.isin(tail_rows)
    return X[keep_X].reset_index(drop=True), y[keep_y].reset_index(drop=True)

def nearest_neighbour(X):
    """
    Give index of the 5 nearest neighbours (Euclidean) of every instance

    args
    X: np.array or pandas.DataFrame of numeric features, one row per instance

    return
    indices: np.ndarray of shape (n_rows, 5); row i holds the positions of
        the 5 points closest to point i, nearest first (a point is its own
        nearest neighbour unless duplicates exist)

    raises
    ValueError: if X has fewer than 5 rows
    """
    # `NearestNeighbors` was never imported anywhere in this notebook, so the
    # original call raised NameError. A SciPy KD-tree answers the same
    # Euclidean k-NN query; it is imported locally to keep this block
    # self-contained.
    from scipy.spatial import cKDTree

    n_neighbors = 5
    points = np.asarray(X, dtype=float)
    if len(points) < n_neighbors:
        # The KD-tree would otherwise pad the result with an out-of-range
        # index, which would break the positional look-ups done by callers.
        raise ValueError(
            "nearest_neighbour needs at least %d rows, got %d"
            % (n_neighbors, len(points))
        )
    _, indices = cKDTree(points).query(points, k=n_neighbors)
    return indices

def MLSMOTE(X,y, n_sample):
    """
    Give the augmented data using MLSMOTE algorithm

    args
    X: pandas.DataFrame, numeric input vector DataFrame
    y: pandas.DataFrame, 0/1 target vector dataframe
    n_sample: int, number of newly generated sample

    return
    new_X: pandas.DataFrame, X followed by the n_sample synthetic feature rows
    target: pandas.DataFrame, y followed by the n_sample synthetic target rows

    NOTE(review): neighbour positions are used as index labels (X.loc,
    y.index.isin), so X and y are assumed to share a 0..n-1 index — confirm
    callers reset the index first.
    """
    # `random` is not imported at the top of the notebook; importing it here
    # keeps the function from raising NameError.
    import random

    indices2 = nearest_neighbour(X)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0,n-1)
        # Column 0 is skipped when picking the neighbour to interpolate with.
        neighbour = random.choice(indices2[reference,1:])
        all_point = indices2[reference]
        nn_df = y[y.index.isin(all_point)]
        ser = nn_df.sum(axis = 0, skipna = True)
        # A label is set when more than 2 of the 5 neighbourhood rows have it.
        target[i] = np.array([1 if val>2 else 0 for val in ser])
        ratio = random.random()
        # Interpolate from the reference TOWARDS the neighbour. The gap used
        # to be reference - neighbour, which placed the synthetic point on
        # the far side of the reference (extrapolation, not interpolation).
        gap = X.loc[neighbour,:] - X.loc[reference,:]
        new_X[i] = np.array(X.loc[reference,:] + ratio * gap)
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    new_X = pd.concat([X, new_X], axis=0)
    target = pd.concat([y, target], axis=0)
    return new_X, target

In [None]:
# Request synthetic rows amounting to one tenth of the current frame.
# NOTE(review): MLSMOTE does arithmetic on the rows of X and reads
# y.shape[1]; here X is the whole frame (including the 'Id' and 'text'
# columns) and y is the single 'label' column holding lists, so this call
# looks unable to run as written — confirm which numeric feature/target
# frames were intended.
distinct_train_df_add,_ = MLSMOTE(distinct_train_df,distinct_train_df['label'],len(distinct_train_df) // 10)

In [None]:
# Add the synthetic rows to the training frame. pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.0.
distinct_train_df = pd.concat([distinct_train_df, distinct_train_df_add], ignore_index=True)

In [None]:
# Shuffle the rows (sample the full frame without a fixed seed).
distinct_train_df = distinct_train_df.sample(frac=1)

In [None]:
# Create the model: a multi-label ALBERT classifier with one output per entry
# of label_list, loaded from a local pretrained checkpoint and run on CPU.
model = MultiLabelClassificationModel('albert', '../input/pretrained-albert-pytorch/albert-base-v2', 
                                      num_labels=len(label_list),
                                      use_cuda=False, 
                                      args={'reprocess_input_data': False, 
                                            'overwrite_output_dir': True, 
                                            'train_batch_size': 8, 
                                            'num_train_epochs': 1})

# Train on everything except the 'Id' column.
model.train_model(distinct_train_df.drop('Id',axis=1))

In [None]:
# Predict the labels of every submission section in one batch and turn each
# row of 0/1 flags into a '|'-joined string of label names.
Prediction_list = []
if len(sub_text_df) > 0:
    predictions, raw_outputs = model.predict(sub_text_df['text'].tolist())
    label_array = np.array(label_list)
    # predict() is expected to return one sequence of 0/1 flags per input
    # text. The mask therefore has to be built per row: comparing the whole
    # result to 1 (as before) never produced a usable per-label mask, so no
    # label was ever selected.
    for row_prediction in predictions:
        selected = np.asarray(row_prediction) == 1
        Prediction_list.append('|'.join(label_array[selected].tolist()))

sub_text_df['PredictionString2'] = Prediction_list

In [None]:
# Collapse the section-level predictions to one '|'-joined string per
# publication Id (section strings are sorted before joining).
sub_text_df = (
    sub_text_df.groupby('Id')['PredictionString2']
    .apply(list)
    .apply(lambda section_strings: '|'.join(sorted(section_strings)))
)

In [None]:
# Turn the Id index produced by the groupby back into a regular column.
sub_text_df = sub_text_df.reset_index()

In [None]:
def splitAndJoin(text):
    """Deduplicate the '|'-separated labels in ``text``.

    Args:
        text: Labels joined by '|'; empty entries (e.g. from "a||b" or a
            leading/trailing '|') are allowed.

    Returns:
        The distinct non-empty labels, sorted and joined by '|' (an empty
        string when there are none).
    """
    # Dropping empty tokens up front also removes separators in the middle
    # of the string, which the former leading/trailing-'|' regexes missed.
    # Sorting makes the output deterministic (set order is not).
    unique_labels = {token for token in text.split('|') if token}
    return '|'.join(sorted(unique_labels))

In [None]:
# Deduplicate the labels inside every publication-level prediction string.
sub_text_df['PredictionString2'] = sub_text_df['PredictionString2'].apply(splitAndJoin)

In [None]:
# Left-join the classifier predictions onto the literal-matching results.
my_submission = pd.merge(my_submission, sub_text_df, how='left')
# fillna returns a new frame; assign it back so publications without a
# classifier prediction get '' instead of NaN (the result was previously
# discarded).
my_submission = my_submission.fillna('')

In [None]:
# Preview the merged predictions.
my_submission.head()

In [None]:
# Merge the results of the two models: use the literal-matching prediction
# when it is non-empty, otherwise fall back to the classifier prediction.
my_submission['PredictionString'] = np.where(
    my_submission['PredictionString1'] == '',
    my_submission['PredictionString2'],
    my_submission['PredictionString1'],
)
# Only the merged column is kept for the submission.
my_submission = my_submission.drop(['PredictionString1', 'PredictionString2'], axis=1)

In [None]:
# Preview the final submission frame.
my_submission.head()

In [None]:
# Write the submission to 'submission.csv' without the index column.
my_submission.to_csv('submission.csv', index=False)