こちらの方のほぼ写経 // this notebook is copied and modified from the link below.

https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline/comments#Pre-processing-excerpt-%E2%9C%82%EF%B8%8F

# Set up

In [None]:
# The installation of pycaret 2.3.2, the latest version came up with an error.
# -------
# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
# pdpbox 0.2.1 requires matplotlib==3.1.1, but you have matplotlib 3.4.1 which is incompatible.
# -------
# Therefore, I decided to install version 2.2.1 which happened to work.

#!python -m pip download pycaret==2.2.1 -d ./pycaret-2.2.1
!python -m pip install --find-links=../input/mylibraries pycaret==2.2.1
!python -m pip install --find-links=../input/mylibraries/textstat-0.7.1 textstat==0.7.1

# In case internet access is allowed, pip install is the easiest way.
#!pip install pycaret
#!pip install textstat

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import time

import pandas_profiling as pdp
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk import pos_tag
from wordcloud import WordCloud,STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
import textstat

from pycaret.regression import *

# Load Data

In [None]:
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

# Profile data

In [None]:
train_df.head()

In [None]:
train_df.excerpt[0]

excerpt (抜粋) から、人によってつけられた文章の読みやすさスコア（target）を当てる

スコアは複数人でつけるため、人による採点ばらつき（standard error）がある

In [None]:
fig, ax = plt.subplots(1,3,figsize=(18,6))

sns.histplot(train_df['target'], ax=ax[0], kde=True, alpha=0.2)
sns.histplot(train_df['standard_error'], ax=ax[1], kde=True, alpha=0.2)

sns.histplot(
    train_df, x="target", y="standard_error",
    bins=30, discrete=(False, False), log_scale=(False, False),
    cbar=True, cbar_kws=dict(shrink=.75), 
    ax=ax[2]
)
plt.show()

In [None]:
test_df.head()

In [None]:
profile = pdp.ProfileReport(train_df)
profile

url, license は、訓練データの 70% が欠損している

In [None]:
pdp.ProfileReport(test_df)

In [None]:
for idx,row in train_df.sort_values(by=['target'], ascending=False).head(3).iterrows():
    print('index:' + str(idx) + ', target ' + str(row.target))
    print(row.excerpt + '\n')

In [None]:
for idx,row in train_df.sort_values(by=['target'], ascending=False).tail(3).iterrows():
    print('index:' + str(idx) + ', target ' + str(row.target))
    print(row.excerpt + '\n')

# Preprocess Data
以下の流れで処理
1. 文章内のアルファベット以外を空白に変換
2. stopwords の削除
3. 単語のレンマ化

In [None]:
excerpt1 = train_df['excerpt'].min()
print("Before preprocessing: \n")
print(excerpt1)

In [None]:
e = re.sub("[^a-zA-Z]", " ", excerpt1) # アルファベット以外は空白に変換
e = e.lower() # 小文字に変換
e = nltk.word_tokenize(e) # tokenizer を使って単語に分割
e[:8]

In [None]:
e = [word for word in e if not word in set(stopwords.words("english"))] # stopwords に登録されている単語は除外する
stopwords.words("english")[:10] # ちなみに stopwords に登録されている単語はこんなの

In [None]:
lemma = nltk.WordNetLemmatizer()
e = [lemma.lemmatize(word) for word in e] # lemmatizer を使ってlemmatizeする
nltk.WordNetLemmatizer().lemmatize("dogs") # lemmatize の例 dogs -> dog

In [None]:
e=" ".join(e)
print("After preprocessing: \n")
print(e)

In [None]:
def preprocess(data):
    """Clean every excerpt in ``data`` and return the cleaned strings.

    Each excerpt is processed as follows: every character that is not an
    ASCII letter is replaced by a space, the text is lower-cased, tokenized
    with NLTK, English stop words are dropped, the remaining tokens are
    lemmatized with WordNet, and the result is re-joined with single spaces.

    Args:
        data: object whose ``'excerpt'`` entry is an iterable of strings
            (the notebook passes a DataFrame).

    Returns:
        list of str: one cleaned excerpt per input excerpt, in input order.
    """
    lemma = nltk.WordNetLemmatizer()
    # Build the stop-word set once per call; rebuilding it for every token
    # is what made this step slow.
    stop_words = set(stopwords.words("english"))
    non_letters = re.compile("[^a-zA-Z]")

    excerpt_processed = []
    for e in data['excerpt']:
        # Non-letters -> spaces, lower-case, then split into word tokens.
        tokens = nltk.word_tokenize(non_letters.sub(" ", e).lower())
        # Drop stop words first, then lemmatize the words that are kept.
        kept = [lemma.lemmatize(word) for word in tokens if word not in stop_words]
        excerpt_processed.append(" ".join(kept))
    return excerpt_processed

In [None]:
train_df['preprocessed_excerpt'] = preprocess(train_df)
test_df['preprocessed_excerpt'] = preprocess(test_df)

# 時間がかかるので、保存しておく
#train_df.to_csv("train_excerpt_preprocessed.csv")
#test_df.to_csv("test_excerpt_preprocessed.csv")

In [None]:
plt.figure(figsize=(16, 8))
sns.countplot(y="license",data=train_df,linewidth=3)
plt.title("License Distribution")
plt.show()

頻出単語(unigram), 頻出bigram, trigram を抽出する

In [None]:
print('train data の数:', len(train_df['preprocessed_excerpt']))
vec = CV(ngram_range=(1, 1)).fit(train_df['preprocessed_excerpt'])
print('vocabulary の種類:', len(vec.vocabulary_))
print('vocabulary の例:', list(vec.vocabulary_.keys())[:3])

bow = vec.transform(train_df['preprocessed_excerpt'])
print('bag of words の shape', bow.shape)

sum_words = bow.sum(axis=0)
print('sum of words の shape', sum_words.shape)
print('sum of words', sum_words)

words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
print('頻出単語の例', words_freq[:5])

# 登場回数で並べ替え
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
print('頻出単語上位', words_freq[:5])

In [None]:
def _top_n_ngrams(corpus, ngram_range, n=None):
    """Count n-grams over ``corpus`` and return the ``n`` most frequent.

    Args:
        corpus: iterable of text documents.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
        n: number of entries to return; ``None`` returns all of them.

    Returns:
        list of ``(ngram, total_count)`` tuples sorted by count, descending.
    """
    vec = CV(ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise sum = total occurrences of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words as ``(word, count)`` tuples."""
    return _top_n_ngrams(corpus, (1, 1), n)


def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams as ``(bigram, count)`` tuples."""
    return _top_n_ngrams(corpus, (2, 2), n)


def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams as ``(trigram, count)`` tuples."""
    return _top_n_ngrams(corpus, (3, 3), n)

In [None]:
def _plot_top_ngrams(ngram_counts, title):
    """Draw a horizontal bar chart of n-gram frequencies.

    Args:
        ngram_counts: list of ``(ngram, frequency)`` tuples, in plotting order.
        title: title of the figure.

    Returns:
        tuple: the DataFrame built from ``ngram_counts`` and the Axes drawn on.
    """
    counts_df = pd.DataFrame(ngram_counts, columns=['word', 'freq'])
    plt.figure(figsize=(16, 6))
    ax = sns.barplot(x='freq', y='word', data=counts_df,
                     facecolor=(0, 0, 0, 0), linewidth=3,
                     edgecolor=sns.color_palette("ch:start=3, rot=.1", 20))
    plt.title(title, font='Serif')
    plt.xlabel("Frequency", fontsize=14)
    plt.yticks(fontsize=13)
    plt.xticks(rotation=45, fontsize=13)
    plt.ylabel("")
    return counts_df, ax


# Top 20 unigrams / bigrams / trigrams of the preprocessed training excerpts.
common_words = get_top_n_words(train_df['preprocessed_excerpt'], 20)
common_words_df1, ax = _plot_top_ngrams(common_words, "Top 20 unigrams")

common_words = get_top_n_bigram(train_df['preprocessed_excerpt'], 20)
common_words_df2, ax = _plot_top_ngrams(common_words, "Top 20 bigrams")

# The trigram table gets its own name instead of overwriting the bigram one.
common_words = get_top_n_trigram(train_df['preprocessed_excerpt'], 20)
common_words_df3, ax = _plot_top_ngrams(common_words, "Top 20 trigrams")

trigram の２位に登場する "th" は stop words に加えたほうがよい？

In [None]:
plt.subplots(figsize=(16,16))
wc = WordCloud(stopwords=STOPWORDS,background_color="white", 
               contour_width=2, contour_color='blue', width=1600, height=800,
               max_words=150, max_font_size=256,random_state=42)
wc.generate(' '.join(train_df['preprocessed_excerpt']))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

stop words に 入れたほうがよいword:u 
動詞は過去形のものも残っている

In [None]:
def avg_word_len(df):
    """Return the mean word length (in characters) of each text.

    Args:
        df: pandas Series of strings (despite the name, not a DataFrame);
            each string is split on whitespace.

    Returns:
        pandas Series with the same index, holding the average number of
        characters per whitespace-separated word.
    """
    def _mean_length(words):
        # Length of every word, then their arithmetic mean.
        return np.mean([len(word) for word in words])

    return df.str.split().map(_mean_length)

In [None]:
train_df['excerpt'][0]

In [None]:
# 文字数 // number of characters in the text
text_len = train_df['excerpt'].str.len()
text_len_pre = train_df['preprocessed_excerpt'].str.len()
text_len_dif = train_df['excerpt'].str.len() - train_df['preprocessed_excerpt'].str.len()

# 単語の文字数の平均 // average of number of characters in words present in the text
avg_text = avg_word_len(train_df['excerpt'])
avg_text_pre = avg_word_len(train_df['preprocessed_excerpt'])

# 単語数 // number of words present in the text
lexicon_count = [] 
lexicon_count_pre = []
lexicon_count_dif = []
# 文章数 // number of sentences present in the text
sentence_count = []
# 文章あたりの単語数 // mean/max/min number of words in each sentence
mean_lexicon_count_per_st = []
max_lexicon_count_per_st = []
min_lexicon_count_per_st = []

for i in range(len(train_df)):
    lc = textstat.lexicon_count(train_df['excerpt'][i])
    lcp = textstat.lexicon_count(train_df['preprocessed_excerpt'][i])
    lcd = lc - lcp
    sc = textstat.sentence_count(train_df['excerpt'][i])
    lexicon_count.append(lc)
    lexicon_count_pre.append(lcp)
    lexicon_count_dif.append(lcd)
    sentence_count.append(sc)
    mean_lexicon_count_per_st.append(np.mean([textstat.lexicon_count(x) for x in train_df['excerpt'][i].split(".")]))
    max_lexicon_count_per_st.append(np.max([textstat.lexicon_count(x) for x in train_df['excerpt'][i].split(".")]))
    min_lexicon_count_per_st.append(np.min([textstat.lexicon_count(x) for x in train_df['excerpt'][i].split(".") if textstat.lexicon_count(x) > 1]))

In [None]:
text_props = train_df.copy()
text_props['text_len'] = text_len
text_props['text_len_pre'] = text_len_pre
text_props['text_len_dif'] = text_len_dif
text_props['lexicon_count'] = lexicon_count
text_props['lexicon_count_pre'] = lexicon_count_pre
text_props['lexicon_count_dif'] = lexicon_count_dif # preprocess と original の差
text_props['avg_text'] = avg_text
text_props['avg_text_pre'] = avg_text_pre
text_props['sentence_count'] = sentence_count
text_props['mean_lexicon_count_per_st'] = mean_lexicon_count_per_st
text_props['max_lexicon_count_per_st'] = max_lexicon_count_per_st
text_props['min_lexicon_count_per_st'] = min_lexicon_count_per_st
text_props.head(3)

In [None]:
def plot_distribution(col1,col2,title1,title2):
    """Compare two ``text_props`` columns: KDE on the left, scatter vs target on the right.

    Reads the notebook-level ``text_props`` DataFrame.

    Args:
        col1: column name computed on the raw excerpt.
        col2: column name computed on the preprocessed excerpt.
        title1: title of the density plot.
        title2: title of the scatter plot.
    """
    fig, ax = plt.subplots(1,2,figsize=(12,6))

    # Left panel: distribution of both columns on a shared axis.
    sns.kdeplot(data=text_props, x=col1,label="Excerpt",ax=ax[0])
    sns.kdeplot(data=text_props, x=col2,label="Excerpt preprocessed",ax=ax[0])
    ax[0].set_title(title1,font="Serif")
    ax[0].legend()

    # Right panel: each column against the target.
    # `marker` (singular) sets the glyph; `markers` only applies together
    # with a `style=` variable and was silently ignored here.
    sns.scatterplot(data=text_props,x=col1,y='target',label="Excerpt",ax=ax[1],marker='.')
    sns.scatterplot(data=text_props,x=col2,y='target',label="Excerpt preprocessed", ax=ax[1],marker='.', alpha=0.3)
    ax[1].set_title(title2,font="Serif")
    ax[1].legend()

    plt.show()

plot_distribution("text_len","text_len_pre","Character count distribution","Character count vs Target")
plot_distribution("lexicon_count","lexicon_count_pre","Word count distribution","Word count vs Target")
plot_distribution("avg_text","avg_text_pre", "Average word length distribution","Average word length vs Target")


def _plot_feature_vs_target(col, dist_title, xlabel, scatter_title):
    """Plot one ``text_props`` column: KDE on the left, scatter vs target on the right.

    Args:
        col: name of the column in the notebook-level ``text_props`` frame.
        dist_title: title of the density plot.
        xlabel: x-axis label of the density plot.
        scatter_title: title of the scatter plot.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    sns.kdeplot(data=text_props, x=col, label="Excerpt", ax=ax[0])
    ax[0].set_title(dist_title, font="Serif")
    ax[0].set_xlabel(xlabel)
    # `marker` (singular) sets the glyph; `markers` needs a `style=` variable.
    sns.scatterplot(data=text_props, x=col, y='target', ax=ax[1], marker='.')
    ax[1].set_title(scatter_title, font="Serif")
    plt.show()


# (column, density title, density x-label, scatter title) for each single-column figure.
single_feature_plots = [
    ('sentence_count',
     "Sentence count distribution",
     "sentence_count",
     "Sentence count vs Target"),
    ('text_len_dif',
     "Character count reduction distribution",
     "Character count reduction by preprocess",
     "Character count reduction by preprocess vs Target"),
    ('lexicon_count_dif',
     "Word count reduction distribution",
     "Word count reduction by preprocess",
     "Word count reduction by preprocess vs Target"),
    ('mean_lexicon_count_per_st',
     "Mean of word count per sentence",
     "Word count",
     "Mean of word count per sentence vs Target"),
    ('max_lexicon_count_per_st',
     "Max of word count per sentence",
     "Word count",
     "Max of word count per sentence vs Target"),
    ('min_lexicon_count_per_st',
     "Min of word count per sentence",
     "Word count",
     "Min of word count per sentence vs Target"),
]
for plot_args in single_feature_plots:
    _plot_feature_vs_target(*plot_args)

# Correlation between all numeric text properties and the target.
num_cols = ['text_len','text_len_pre','text_len_dif', 
            'lexicon_count','lexicon_count_pre','lexicon_count_dif', 
            'avg_text','avg_text_pre','sentence_count',
            'mean_lexicon_count_per_st', 'max_lexicon_count_per_st', 'min_lexicon_count_per_st',
            'target']
corr = text_props[num_cols].corr()

fig = plt.figure(figsize=(8,8),dpi=80)
# Hide the upper triangle (and diagonal) so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0, annot=True,
            square=True, linewidths=.5)
plt.title('Correlation of text properties', fontsize=15,font="Serif")
plt.show()

- avg_text (単語の文字数の平均)やtext_len（文字数）は target と負の相関→文字数が少ないほど読みやすい
- sentence_count（文章の数）は、target と弱い正の相関→文章数が多いほど読みやすい
- いずれも直観的な感覚と一致する

### 文章中に登場する単語の品詞をカウントする

In [None]:
text_props['pos_tags'] = text_props['preprocessed_excerpt'].str.split().map(pos_tag)

print(text_props['preprocessed_excerpt'][0][:50])
print(text_props['pos_tags'][0][:5])

In [None]:
def count_tags(pos_tags):
    """Count how often each POS tag occurs.

    Args:
        pos_tags: iterable of ``(word, tag)`` pairs.

    Returns:
        dict mapping each tag to its number of occurrences, keyed in order
        of first appearance.
    """
    tag_count = {}
    for _word, tag in pos_tags:
        # Start from 0 the first time a tag is seen.
        tag_count[tag] = tag_count.get(tag, 0) + 1
    return tag_count

text_props['tag_counts'] = text_props['pos_tags'].map(count_tags)
print(text_props['tag_counts'].head())

'JJ' とか 'NNS' とかが品詞の種類、後に続く value が登場回数を表す

定義はoriginal kernel 参照

https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline/comments?scriptVersionId=62607090&cellId=47

In [None]:
set_pos = set([tag for tags in text_props['tag_counts'] for tag in tags])
tag_cols = list(set_pos)

for tag in tag_cols:
    text_props[tag] = text_props['tag_counts'].map(lambda x: x.get(tag, 0))
text_props[tag_cols].head()

In [None]:
pos = text_props[tag_cols].sum().sort_values(ascending = False)
plt.figure(figsize=(16,8))
ax = sns.barplot(x=pos.index, y=pos.values)
plt.xticks(rotation = 50)
ax.set_yscale('log')
plt.title('POS tags frequency',fontsize=15,font="Serif")
plt.show()

- [original の kernel](https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline/comments?scriptVersionId=62607090&cellId=54)によると、textstat には読みやすさに関するスコアの算出方法が実装されているそうな


In [None]:
# One list per textstat readability score, filled row by row.
flesch_re = []
flesch_kg = []
fog_scale = []
automated_r = []
coleman = []
linsear = []
text_standard = []

for row in range(len(text_props)):
    excerpt = train_df['excerpt'][row]
    flesch_re.append(textstat.flesch_reading_ease(excerpt))
    flesch_kg.append(textstat.flesch_kincaid_grade(excerpt))
    fog_scale.append(textstat.gunning_fog(excerpt))
    automated_r.append(textstat.automated_readability_index(excerpt))
    coleman.append(textstat.coleman_liau_index(excerpt))
    linsear.append(textstat.linsear_write_formula(excerpt))
    text_standard.append(textstat.text_standard(excerpt))

# Attach the scores to text_props, one column per score.
for column_name, column_values in (
    ('flesch_re', flesch_re),
    ('flesch_kg', flesch_kg),
    ('fog_scale', fog_scale),
    ('automated_r', automated_r),
    ('coleman', coleman),
    ('linsear', linsear),
    ('text_standard', text_standard),
):
    text_props[column_name] = column_values

In [None]:
readability_cols = ['flesch_re','flesch_kg','fog_scale','automated_r','coleman','linsear','text_standard','target']

corr = text_props[readability_cols].corr()
fig = plt.figure(figsize=(8,8),dpi=80)
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='PuBuGn', robust=True, center=0,
            square=True, linewidths=.5,annot=True)
plt.title('Correlation of readability tests', fontsize=15,font="Serif")
plt.show()

Flesch Readability Ease の値と target には正の相関

In [None]:
fig, ax = plt.subplots(1,2,figsize=(12,6))
sns.kdeplot(data=text_props, x=flesch_re,ax=ax[0])
ax[0].set_title("Flesch Reading Ease Distribution",font="Serif")
ax[0].set_xlabel("Flesch Reading Ease Test Score")
sns.scatterplot(data=text_props,x='flesch_re',y='target',ax=ax[1],markers='.')
ax[1].set_title("Flesch Reading Ease Test Score vs Target",font="Serif")
plt.show()

ここまでの処理を関数化して、学習データ、テストデータを処理しておく

In [None]:
def count_character_lexicon_sentence(df):
    """Add character-, word- and sentence-count features to ``df``.

    The columns are written into ``df`` itself (in place) and the same
    object is returned.

    Args:
        df: DataFrame with string columns ``'excerpt'`` and
            ``'preprocessed_excerpt'``.

    Returns:
        ``df`` with these added columns: ``text_len``, ``text_len_pre``,
        ``text_len_dif``, ``lexicon_count``, ``lexicon_count_pre``,
        ``lexicon_count_dif``, ``avg_text``, ``avg_text_pre``,
        ``sentence_count``, ``mean_lexicon_count_per_st``,
        ``max_lexicon_count_per_st``, ``min_lexicon_count_per_st``.
    """
    # Number of characters in the raw / preprocessed text, and the difference.
    text_len = df['excerpt'].str.len()
    text_len_pre = df['preprocessed_excerpt'].str.len()
    text_len_dif = text_len - text_len_pre

    # Average number of characters per word in the raw / preprocessed text.
    avg_text = avg_word_len(df['excerpt'])
    avg_text_pre = avg_word_len(df['preprocessed_excerpt'])

    # Number of words in the raw / preprocessed text, and the difference.
    lexicon_count = []
    lexicon_count_pre = []
    lexicon_count_dif = []
    # Number of sentences in the raw text.
    sentence_count = []
    # Mean / max / min number of words per "."-separated segment of the raw text.
    mean_lexicon_count_per_st = []
    max_lexicon_count_per_st = []
    min_lexicon_count_per_st = []

    # Iterate over the values directly so the function also works when the
    # index of ``df`` is not 0..n-1.
    for raw_text, clean_text in zip(df['excerpt'], df['preprocessed_excerpt']):
        lc = textstat.lexicon_count(raw_text)
        lcp = textstat.lexicon_count(clean_text)
        lexicon_count.append(lc)
        lexicon_count_pre.append(lcp)
        lexicon_count_dif.append(lc - lcp)
        sentence_count.append(textstat.sentence_count(raw_text))

        # Split once and reuse the per-segment word counts for all three stats.
        words_per_segment = [textstat.lexicon_count(x) for x in raw_text.split(".")]
        mean_lexicon_count_per_st.append(np.mean(words_per_segment))
        max_lexicon_count_per_st.append(np.max(words_per_segment))
        # The minimum ignores segments with 0 or 1 words. If no segment has
        # more than one word, np.min([]) would raise, so fall back to the
        # largest count (0 or 1 in that case).
        multi_word_segments = [count for count in words_per_segment if count > 1]
        min_lexicon_count_per_st.append(
            np.min(multi_word_segments) if multi_word_segments else np.max(words_per_segment)
        )

    df['text_len'] = text_len
    df['text_len_pre'] = text_len_pre
    df['text_len_dif'] = text_len_dif
    df['lexicon_count'] = lexicon_count
    df['lexicon_count_pre'] = lexicon_count_pre
    df['lexicon_count_dif'] = lexicon_count_dif  # raw minus preprocessed
    df['avg_text'] = avg_text
    df['avg_text_pre'] = avg_text_pre
    df['sentence_count'] = sentence_count
    df['mean_lexicon_count_per_st'] = mean_lexicon_count_per_st
    df['max_lexicon_count_per_st'] = max_lexicon_count_per_st
    df['min_lexicon_count_per_st'] = min_lexicon_count_per_st
    return df


def count_pos(df1, df2):
    """Add POS-tag count columns to two DataFrames, using only shared tags.

    For each frame, ``'preprocessed_excerpt'`` is split on whitespace and
    tagged with ``pos_tag``; the tags and their per-row counts are stored in
    ``'pos_tags'`` and ``'tag_counts'``. One integer column is then added for
    every tag that occurs in both frames, so both end up with the same tag
    columns. Both frames are modified in place.

    Args:
        df1: first DataFrame (the notebook passes the training data).
        df2: second DataFrame (the notebook passes the test data).

    Returns:
        tuple: ``(df1, df2)``, the same objects that were passed in.
    """
    for df in (df1, df2):
        df['pos_tags'] = df['preprocessed_excerpt'].str.split().map(pos_tag)
        df['tag_counts'] = df['pos_tags'].map(count_tags)

    # Only tags that appear in both frames become feature columns.
    set_pos1 = {tag for tags in df1['tag_counts'] for tag in tags}
    set_pos2 = {tag for tags in df2['tag_counts'] for tag in tags}
    # Sort so the column order is the same on every run; plain set iteration
    # order for strings can change between interpreter sessions.
    tag_cols = sorted(set_pos1 & set_pos2)

    for tag in tag_cols:
        for df in (df1, df2):
            # Bind `tag` as a default so the lambda does not rely on the loop variable.
            df[tag] = df['tag_counts'].map(lambda counts, tag=tag: counts.get(tag, 0))
    return df1,df2

def extract_readability(df):
    """Add textstat readability scores of ``df['excerpt']`` as columns of ``df``.

    The columns ``flesch_re``, ``flesch_kg``, ``fog_scale``, ``automated_r``,
    ``coleman``, ``linsear`` and ``text_standard`` are written into ``df``
    itself, and the same object is returned.

    Args:
        df: DataFrame with an ``'excerpt'`` column that can be indexed by
            ``0 .. len(df) - 1``.

    Returns:
        ``df`` with the seven readability columns added.
    """
    # (output column, textstat scorer), in the order the columns are added.
    scorers = (
        ('flesch_re', textstat.flesch_reading_ease),
        ('flesch_kg', textstat.flesch_kincaid_grade),
        ('fog_scale', textstat.gunning_fog),
        ('automated_r', textstat.automated_readability_index),
        ('coleman', textstat.coleman_liau_index),
        ('linsear', textstat.linsear_write_formula),
        ('text_standard', textstat.text_standard),
    )
    scores = {column: [] for column, _scorer in scorers}

    for i in range(len(df)):
        excerpt = df['excerpt'][i]
        for column, scorer in scorers:
            scores[column].append(scorer(excerpt))

    for column, _scorer in scorers:
        df[column] = scores[column]

    return df

# Run the feature-extraction steps defined above on both data sets.

# 1. Character / word / sentence count features.
train_df = count_character_lexicon_sentence(train_df)
test_df = count_character_lexicon_sentence(test_df)

# 2. POS-tag count features (train and test are processed together so that
#    only tags present in both get a column).
train_df, test_df = count_pos(train_df, test_df)
    
# 3. textstat readability scores.
train_df = extract_readability(train_df)
test_df = extract_readability(test_df)

# Save the feature tables to CSV so they can be reloaded later.
train_df.to_csv("train_excerpt_preprocessed.csv")
test_df.to_csv("test_excerpt_preprocessed.csv")

# Uncomment to reload the saved feature tables instead of recomputing them.
#train_df = pd.read_csv("train_excerpt_preprocessed.csv", index_col=0)
#test_df = pd.read_csv("test_excerpt_preprocessed.csv", index_col=0)

In [None]:
train_df.info()

In [None]:
test_df.info()

KeyError: '[\'WP\', \'TO\', "\'\'", \'WP$\', \'NNPS\', \'POS\', \'WDT\', \'EX\', \'$\', \'PRP$\', \'SYM\', \'PRP\', \'PDT\', \'NNP\', \'RBS\'] not in index'set

In [None]:
reg = setup(data = train_df, 
             target = 'target',
             #numeric_imputation = 'mean',
             categorical_features = ['text_standard'], 
             ignore_features = ['id', 'url_legal', 'license', 'standard_error', 'excerpt', 'preprocessed_excerpt',
                               'pos_tags', 'tag_counts'],
             normalize = True,
             silent = True,
             session_id = 123)

In [None]:
start = time.time()
compare_models()
print('elapsed time : ', time.time() - start)

In [None]:
import time
start = time.time()
ridge = create_model('ridge')
print('elapsed time : ', time.time() - start)

In [None]:
start = time.time()
tuned = tune_model(ridge, optimize='RMSE')
print('elapsed time : ', time.time() - start)

In [None]:
# prediction
# Predict on the test set with the tuned model.
predictions = predict_model(tuned, data = test_df)

# Build the submission file: one row per test id, target initialised to 0
# and then overwritten with the model output.
submission_df = pd.DataFrame({'id': test_df.id, 'target': 0})
# NOTE(review): assumes predict_model returns its predictions in a 'Label'
# column (PyCaret 2.x behaviour) - confirm if the PyCaret version changes.
submission_df.target = predictions['Label']

submission_df.to_csv('./submission.csv', index=False)

# Display the submission table as the cell output.
submission_df

最後まで読んでいただきどうもありがとうございます。少しでもお役に立てそうと感じていただけたら、Upvoteよろしくお願いいたします。とても励みになります!!
Thank you for your interest. Please upvote if you think this notebook is helpful to you. It really encourages me.