# CommonLit Random Forest Naive Baseline

- I call this a "naive" baseline because it makes no assumptions about the target distribution, the hyperparameter tuning is brief and simple, the algorithm was chosen for its ease of implementation and its power, etc.  

# Imports

In [None]:
import gc
import os
import re

import numpy as np
import pandas as pd
import sklearn
from scipy.stats import randint, truncnorm, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

# Load Data

In [None]:
# Read the training CSV into `df` and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../input/commonlitreadabilityprize/train.csv")
df.head(n=5)

# Train

In [None]:
%%time

tfidf = TfidfVectorizer(stop_words='english')
rfr = RandomForestRegressor(criterion='mae')
pipe = Pipeline([('tfidf', tfidf), ('rfr', rfr)])
distributions = {'tfidf__max_df': uniform(.9, .05),
                 'tfidf__min_df': uniform(0.02, .1),
                 'tfidf__max_features': randint(100,5000),
                 'tfidf__ngram_range': [(1,1),(1,2),(1,3)],
                 'rfr__n_estimators': randint(5,500),
                 'rfr__max_depth': randint(1,8)}
reg = RandomizedSearchCV(pipe, distributions, random_state=0, n_iter=40, cv=5, 
                         scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
search = reg.fit(df['excerpt'], df['target']) # best model is search.best_estimator_

# Inference

In [None]:
# Read the test CSV into `dft` and preview its first rows.
dft = pd.read_csv(filepath_or_buffer="../input/commonlitreadabilityprize/test.csv")
dft.head(n=5)

In [None]:
# Predict on the test excerpts with the best pipeline found by the search.
target = search.best_estimator_.predict(dft.loc[:, 'excerpt'])

In [None]:
# Store the predictions in a 'target' column of the test frame, then preview it.
dft["target"] = target
dft.head(n=5)

In [None]:
# Write only the id and target columns to the submission file, without the index.
dft.loc[:, ['id', 'target']].to_csv(path_or_buf="submission.csv", index=False)