In [None]:
!pip install textstat
!pip install rich

import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import re
import nltk
import textstat
import time
import wandb
import rich
import spacy

from pandas import DataFrame
from matplotlib.lines import Line2D
from rich.console import Console
from rich import print
from rich.theme import Theme
from collections import Counter
from wordcloud import WordCloud,STOPWORDS
from spacy import displacy
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse

# NLTK resources used by this notebook:
#   - 'stopwords'                  : kept from the original setup
#   - 'punkt'                      : required by nltk.word_tokenize
#   - 'averaged_perceptron_tagger' : required by nltk.pos_tag
# NOTE(review): recent NLTK releases may name these 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [None]:
# Global seaborn plotting context: "notebook" preset with fonts scaled by 1.5
# and a default line width of 2.5 for every figure drawn below.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

def custom_palette(custom_colors):
    """Install ``custom_colors`` as the seaborn default palette and preview it.

    Parameters
    ----------
    custom_colors : list
        Colours accepted by ``sns.color_palette`` (e.g. hex strings).

    Side effects
    ------------
    Sets the seaborn default palette and draws a swatch strip of the colours
    with tick marks and tick labels hidden.
    """
    colors = sns.color_palette(custom_colors)
    # set_palette returns None, so there is nothing worth binding to a name.
    sns.set_palette(colors)
    sns.palplot(colors, size=0.8)
    plt.tick_params(axis='both', labelsize=0, length=0)

# Categorical colours used for the plots in this notebook.
palette = [
    "#7209B7",
    "#3F88C5",
    "#136F63",
    "#F72585",
    "#FFBA08",
]
# 20-step diverging palette.
palette2 = sns.diverging_palette(120, 220, n=20)
# Make `palette` the seaborn default and show a swatch of it.
custom_palette(palette)

# Named rich styles referenced through `console.print(..., style=<name>)`.
custom_theme = Theme(dict(
    info="italic bold cyan",
    warning="italic bold magenta",
    danger="bold blue",
    notice="bold underline black",
))

console = Console(theme=custom_theme)

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key from the Kaggle secret store.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("wandb_api_key")

os.environ["WANDB_SILENT"] = "true"
# Log in through the Python API instead of `! wandb login $api_key`, so the
# secret is never interpolated into a shell command line.
wandb.login(key=api_key)
# wandb.init(project='commonlit', entity='mo-5')

In [None]:
# Load the three competition files (train, test, sample submission) in one go.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
)

In [None]:
def data_info(df):
    """Print a quick overview of ``df``.

    Shows, in order: the first rows, the shape, the number of unique values
    per column, and a missingno bar chart of non-null counts per column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. Every section reports on this argument; the
        previous version silently used the global ``train_df`` for the
        UNIQUE and NOT NULL sections.
    """
    console.print("HEAD", style="notice")
    print(df.head())
    console.print("SHAPE", style="notice")
    print(df.shape)
    console.print("UNIQUE", style="notice")
    print(df.nunique())
    console.print("NOT NULL", style="notice")
    msno.bar(df, color=palette[2], sort="ascending", figsize=(10,5), fontsize=12)
    plt.show()

In [None]:
# Overview (head, shape, unique counts, non-null chart) of the training data.
data_info(train_df)

In [None]:
# Rows with the highest and the lowest `target` value respectively.
most_readable = train_df.sort_values(by='target', ascending=False).iloc[0]
least_readable = train_df.sort_values(by='target').iloc[0]

# Print heading, target score and excerpt text for each of the two rows.
for heading, row in (("Most_readable", most_readable),
                     ("Least_readable", least_readable)):
    console.print(heading, style='notice')
    console.print(row["target"], style='info')
    console.print(row["excerpt"], style='warning')

# POS tagging

Reference for the NLTK part-of-speech tags and tagging examples: https://www.guru99.com/pos-tagging-chunking-nltk.html

In [None]:
def count_tag(excerpt):
    """Return the relative frequency of each POS tag in ``excerpt``.

    Non-letter characters are replaced by spaces and the text is lower-cased
    before tokenising and tagging with NLTK. The result is a DataFrame indexed
    by tag with a single ``word`` column whose values sum to 1.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", excerpt).lower()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(letters_only))

    per_tag = DataFrame(tagged_tokens, columns=["word", "tag"]).groupby("tag").count()
    # Normalise the counts so each column becomes a share of all tokens.
    return per_tag.apply(lambda col: col / col.sum())

def mean_tag_count(excerpts):
    """Average the per-excerpt POS tag frequencies over ``excerpts``.

    Each excerpt is turned into a tag-frequency frame with ``count_tag``; the
    frames are aligned on tag (outer join), tags missing from an excerpt count
    as 0, and the mean across excerpts is returned per tag.
    """
    per_excerpt = [count_tag(text) for text in excerpts]
    aligned = pd.concat(per_excerpt, axis=1, join="outer")
    return aligned.fillna(0).mean(axis=1)

def plot_tag_count(df,title,p):
    """Draw a horizontal, outline-only bar chart of ``df['mean']`` per ``df['tag']``.

    ``title`` is used as the figure title and ``p`` is a seaborn palette
    specification from which 20 edge colours are taken.
    """
    edge_colors = sns.color_palette(p, 20)

    plt.figure(figsize=(16, 8))
    sns.barplot(
        data=df,
        x='mean',
        y='tag',
        facecolor=(0, 0, 0, 0),  # fully transparent fill: only outlines show
        linewidth=3,
        edgecolor=edge_colors,
    )
    plt.title(title, font='Serif')
    plt.xlabel("Frequency", fontsize=14)
    plt.ylabel("")
    plt.xticks(rotation=45, fontsize=13)
    plt.yticks(fontsize=13)

In [None]:
# POS tags turned into feature columns by `count_tags`; the order of this list
# is the column order of the resulting feature frame.
# NOTE(review): presumably the Penn Treebank tags emitted by nltk.pos_tag - confirm.
tag_list = [
    'CC','CD','DT','IN',
    'JJ','JJS','MD','NN',
    'NNS','PRP','RB','TO',
    'VB','VBD','VBG','VBN',
    'VBP','VBZ','WP','WRB',
    'PDT','PRP$','RP','WDT',
    'RBR','EX','JJR','RBS',
    'WP$','UH','FW','NNP',
    'SYM','NNPS']

def count_tags(excerpt):
    """Build a ``{tag: frequency}`` dict for one row, covering every tag in ``tag_list``.

    ``excerpt`` is a row-like object with an ``"excerpt"`` text field. Tags that
    do not occur in the text get the value 0.
    """
    wide = count_tag(excerpt["excerpt"]).T.reset_index()
    features = {}
    for tag in tag_list:
        if tag in wide.columns:
            features[tag] = wide[tag][0]
        else:
            features[tag] = 0
    return features

In [None]:


# One row per training excerpt, one column per tag in `tag_list`
# (the dict returned by `count_tags` is expanded into columns).
tag_df = train_df[["excerpt"]].apply(count_tags, axis=1, result_type='expand')

In [None]:
def show_corr(corr, title='Correlation of Pos Tag'):
    """Plot the lower triangle of a correlation matrix as a heatmap.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square correlation matrix (e.g. the result of ``DataFrame.corr()``).
    title : str, optional
        Figure title. Defaults to the original hard-coded title so existing
        calls keep their output; pass a different title when plotting
        correlations of features other than POS tags.
    """
    plt.figure(figsize=(9,9),dpi=80)
    # Hide the upper triangle (diagonal included): it mirrors the lower one.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, cmap='BuPu', robust=True, center=0,
                square=True, linewidths=.5)
    plt.title(title, fontsize=15,font="Serif")
    plt.show()
   

In [None]:
# Correlation between the target and every POS tag frequency.
target_and_tags = pd.concat([train_df["target"], tag_df], axis=1)
corr = target_and_tags.corr()

show_corr(corr)
# Per-feature correlation with the target, lowest first.
corr["target"].sort_values()

In [None]:
# `train_df_tag` was never defined in the notebook (NameError on a fresh
# kernel); build it here from the target and the POS tag features.
train_df_tag = pd.concat([train_df["target"], tag_df], axis=1)

# One figure per group of tags: (title, [(tag, colour), ...]).
pos_scatter_panels = [
    ("POS vs Target (Negative correlation)", [("IN", palette[4]), ("VBN", palette[1])]),
    ("POS vs Target (Positive correlation)", [("PRP", palette[3]), ("VBD", palette[0])]),
]

for panel_title, tag_colors in pos_scatter_panels:
    plt.figure(figsize=(10,8))
    for tag, color in tag_colors:
        # `marker` (matplotlib kwarg) selects the glyph; seaborn's `markers`
        # only applies together with a `style` variable and was a no-op here.
        sns.scatterplot(data=train_df_tag, x=tag, y='target', color=color,
                        marker='.', alpha=0.5, label=tag)
    plt.legend(title="POS tag",bbox_to_anchor=(1.4, 1))
    plt.xlabel("POS tags count")
    plt.title(panel_title)
    plt.show()



# textstat

In [None]:
def text_stats(df):
    """Compute textstat metrics for one row.

    ``df`` is a row-like object with an ``"excerpt"`` text field. Returns a dict
    mapping metric name to the value produced by the textstat function of the
    same name.
    """
    text = df["excerpt"]

    # Metrics whose textstat function takes only the text.
    plain_metrics = (
        "flesch_reading_ease",
        "flesch_kincaid_grade",
        "gunning_fog",
        "smog_index",
        "automated_readability_index",
        "coleman_liau_index",
        "linsear_write_formula",
        "dale_chall_readability_score",
    )

    stats = {
        "syllable_count": textstat.syllable_count(text),
        "lexicon_count": textstat.lexicon_count(text, removepunct=True),
        "sentence_count": textstat.sentence_count(text),
    }
    for metric in plain_metrics:
        stats[metric] = getattr(textstat, metric)(text)
    stats["text_standard"] = textstat.text_standard(text, float_output=True)
    return stats


# One row per training excerpt, one column per metric returned by `text_stats`.
textstat_df = train_df[["excerpt"]].apply(text_stats, axis=1, result_type='expand')

In [None]:
# Correlation between the target and every textstat metric.
target_and_textstat = pd.concat([train_df["target"], textstat_df], axis=1)
corr = target_and_textstat.corr()

show_corr(corr)
# Per-feature correlation with the target, lowest first.
corr["target"].sort_values()

# train

In [None]:
def training(X, y, model, params, folds=9):
    """Fit one model per K-fold split and return the fitted models.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned with ``X``; selected positionally with ``.iloc``.
    model : callable
        Estimator factory, called as ``model(**params)``. The estimator's
        ``fit`` must accept ``eval_set``, ``early_stopping_rounds`` and
        ``verbose`` keyword arguments.
        NOTE(review): newer XGBoost releases take ``early_stopping_rounds`` in
        the constructor rather than in ``fit`` - confirm against the installed
        version.
    params : dict
        Keyword arguments forwarded to ``model``.
    folds : int, optional
        Number of K-fold splits (shuffled, fixed ``random_state=21``).

    Returns
    -------
    list
        The fitted models, one per fold, in fold order.
    """
    mdls = []
    kf = KFold(n_splits=folds, shuffle=True, random_state=21)
    for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print(f"Fold: {fold}")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        mdl = model(**params)
        # The held-out fold doubles as the early-stopping evaluation set.
        mdl.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=100,
                verbose=100)
        mdls.append(mdl)

        pred = mdl.predict(np.ascontiguousarray(X_test))
        # Square root of the MSE, i.e. RMSE (previously mislabelled "Log loss").
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        print(f" RMSE: {rmse}")
        print("-"*50)

    return mdls

In [None]:
# Target plus all engineered features (POS tag frequencies, textstat metrics).
temp = pd.concat([train_df["target"], tag_df, textstat_df], axis=1)
y = temp["target"]
X = temp.drop(columns="target")

In [None]:
# Default hyper-parameters for the regressor.
params = dict(
    n_estimators=3000,
    max_depth=5,
    learning_rate=0.01,
#     tree_method='gpu_hist',
)

# One fitted XGBRegressor per cross-validation fold.
models = training(X, y, XGBRegressor, params)

In [None]:
# Build the test features from the TEST excerpts. The previous version computed
# the textstat features from `train_df` and assembled `test_X` from the training
# frames, so the predictions were for training rows, not for the test set.
test_tag_df = test_df[["excerpt"]].apply(count_tags, axis=1, result_type='expand')
test_textstat_df = test_df[["excerpt"]].apply(text_stats, axis=1, result_type='expand')
test_X = pd.concat([test_tag_df, test_textstat_df], axis=1)

# The models are fed a bare array below, so column order must match training.
assert list(test_X.columns) == list(X.columns), "test features do not match training features"
assert len(test_X) == len(test_df)

# Average the predictions of the per-fold models.
test_features = np.ascontiguousarray(test_X)
y_targets = [model.predict(test_features) for model in models]
y_target = np.mean(y_targets, axis=0)

In [None]:
# Fill the sample submission's `target` column with the averaged predictions
# (on a copy of the template) and write it to the Kaggle output directory.
submission_df = sample_submission_df.assign(target=y_target)
submission_df.to_csv("/kaggle/working/submission.csv", index=False)
submission_df