# Data Preparation

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import KFold

train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')

# Split the data into a training part and a held-out validation part.
# KFold yields three (train, validation) index pairs; only one pair is needed,
# and the last one is used so the CSV files written below are the same as the
# ones the earlier loop-based version of this cell produced.
kf = KFold(n_splits=3, random_state=1001, shuffle=True)
train_index, val_index = list(kf.split(train))[-1]

# drop=True discards the old row labels directly instead of materialising
# them as an 'index' column that has to be removed again afterwards.
trn = train.iloc[train_index].reset_index(drop=True)
val = train.iloc[val_index].reset_index(drop=True)

val.to_csv('sub_val.csv', index=False)
trn.to_csv('sub_train.csv', index=False)

# Install Auto3ML package

In [None]:
# !rm -r ./autox
!git clone https://github.com/4paradigm/autox.git
!pip install ./autox

# Import

In [None]:
from autox.autox_nlp import NLP_feature
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [None]:
# Reload the two CSV files written by the data-preparation cell:
# 'sub_train.csv' becomes df_train and 'sub_val.csv' becomes df_test.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('sub_train.csv', 'sub_val.csv')
)

# Different split modes

## Use space to split

In [None]:
# Settings for this run: tokenizer switched off (the "split on spaces" variant
# of this section), Word2Vec embeddings, supervised task.
text_columns_name = ['excerpt']
use_Toknizer = False
emb_mode = 'Word2Vec'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
candidate_labels = None  # not passed to fit() in this cell
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## Use unsupervised tokenizer to split

In [None]:
# Same Word2Vec / supervised setup as the space-split run, but with the
# tokenizer switched on.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'Word2Vec'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

# Different embedding modes

## Use TFIDF to extract feature

In [None]:
# TFIDF embedding, tokenizer on, supervised task with `target` as the label.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'TFIDF'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## Use Glove to extract feature

In [None]:
# Glove embedding, tokenizer on, supervised task with `target` as the label.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'Glove'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## Use FastText to extract feature

In [None]:
# FastText embedding, tokenizer on, supervised task with `target` as the label.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'FastText'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## Use Bert to extract feature

In [None]:
# Bert embedding, tokenizer on, supervised task with `target` as the label.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'Bert'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## Use Zero-shot labeling to extract feature

In [None]:
# Zero-shot labelling: no target is supplied; instead each text column gets a
# list of hypothesis sentences, passed to fit() as the final argument.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'Bert'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'zero-shot-classification'  # used here in place of unsupervise / supervise
y = None
hypothesis = {
    'excerpt': [
        'this excerpt is too complex',
        'this excerpt is easy to understand',
    ],
}

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y, hypothesis)
# The frame shown below is the transform() result; it replaces fit()'s return value.
df = nlp.transform(df_train)
df.head(3)

# Different feature reduction modes

## supervise (target encoding)

In [None]:
# Supervised feature reduction: TFIDF features fitted with `target` as the label.
text_columns_name = ['excerpt']
use_Toknizer = True
emb_mode = 'TFIDF'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'supervise'  # options: unsupervise / supervise
y = df_train['target']

nlp = NLP_feature()
df = nlp.fit(df_train, text_columns_name, use_Toknizer, emb_mode, task, y)
df.head(3)

## unsupervise

In [None]:
# Unsupervised feature reduction: TFIDF features fitted without a target.
# The extractor must be bound to `nlp`, because fit() below and the later
# transform() cell both go through that name; binding it to any other name
# would silently reuse the extractor fitted in the previous (supervised) cell.
nlp = NLP_feature()

use_Toknizer = True
emb_mode = 'TFIDF'  # options: TFIDF / Word2Vec / Glove / FastText / Bert
task = 'unsupervise'  # options: unsupervise / supervise
text_columns_name = ['excerpt']

# No `y` argument is passed in this mode.
df = nlp.fit(df_train,
             text_columns_name,
             use_Toknizer,
             emb_mode,
             task)
df.head(20)

# Concat raw data with meta feature

In [None]:
# Copy every generated feature column from `df` onto the training frame, then
# remove the raw text column(s) listed in `text_columns_name`.
# Note: this rebinds `df_train`, so afterwards it no longer holds the raw text.
for feature_name in df.columns:
    df_train[feature_name] = df[feature_name]

df_train = df_train.drop(text_columns_name, axis=1)
df_train

# Transform test data

In [None]:
# Apply the extractor fitted above (`nlp`) to the held-out frame loaded from
# 'sub_val.csv', and display the result.
# NOTE(review): a later cell drops a 'target' column from the saved copy of
# this frame, so transform() is presumably expected to keep it - confirm.
test = nlp.transform(df_test)
test

# Write data to .CSV

In [None]:
# `encode_mode` is part of every output file name here and in the following
# cells, so it has to exist before the first file is written. It is set to the
# feature-reduction mode of the last fit (`task`: 'supervise' / 'unsupervise').
# NOTE(review): using `task` as the label is an assumption - confirm the
# intended naming.
encode_mode = task

df_train.to_csv(f'{emb_mode}_{encode_mode}_autox_trn.csv', index=False)
test.to_csv(f'{emb_mode}_{encode_mode}_autox_val.csv', index=False)

# Use Auto3ML General Interface to build model

In [None]:
# Build the AutoX test file: the saved validation features with the 'target'
# column removed.
val_features_file = f'{emb_mode}_{encode_mode}_autox_val.csv'
test_features_file = f'{emb_mode}_{encode_mode}_autox_tst.csv'

df_val = pd.read_csv(val_features_file)
df_val = df_val.drop(columns=['target'])
df_val.to_csv(test_features_file, index=False)

In [None]:
from autox import AutoX

# Point AutoX at the train/test feature files written by the previous cells
# and ask it for predictions of 'target'.
path = '.'
train_file = f'{emb_mode}_{encode_mode}_autox_trn.csv'
test_file = f'{emb_mode}_{encode_mode}_autox_tst.csv'

autox = AutoX(
    target='target',
    train_name=train_file,
    test_name=test_file,
    id=[],
    path=path,
)
sub = autox.get_submit()

In [None]:
from sklearn.metrics import mean_squared_error

# Compare the 'target' column of the AutoX output against the true targets of
# the held-out split; RMSE is the square root of the mean squared error.
val = pd.read_csv('sub_val.csv')
mse = mean_squared_error(val['target'], sub['target'])
RMSE = np.sqrt(mse)

In [None]:
# Display the RMSE computed in the previous cell.
RMSE