In [None]:
######### IMPORT ########
import pickle
from itertools import cycle
from time import time
from tqdm.auto import tqdm
import shutil
from pathlib import Path

# Pandas, Numpy
import pandas as pd
import numpy as np
from numpy import interp
from matplotlib import pyplot as plt
pd.set_option("display.max_columns", None)

# Model evaluation
from sklearn.metrics import plot_confusion_matrix, roc_auc_score,  auc, \
    precision_recall_fscore_support, classification_report, roc_curve, plot_roc_curve

# Sklearn pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.pipeline import Pipeline
set_config(display = 'diagram')


from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.svm import SVR

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# All competition files sit in one directory; naming it once makes the notebook
# easy to repoint at another location.
INPUT_DIR = Path('../input/commonlitreadabilityprize')

data_train = pd.read_csv(INPUT_DIR / 'train.csv')
data_test = pd.read_csv(INPUT_DIR / 'test.csv')
data_sample_submission = pd.read_csv(INPUT_DIR / 'sample_submission.csv')

In [None]:
# Display the frame read from sample_submission.csv.
data_sample_submission

In [None]:
# Display the training frame read from train.csv.
data_train

In [None]:
# Display the test frame read from test.csv.
data_test

In [None]:
# Summary statistics with tail percentiles (1%, 5%, 95%, 99%) requested in
# addition to the 25% / 75% quartiles.
data_train.describe(percentiles=[0.01, 0.05, 0.25, 0.75, 0.95, 0.99])

In [None]:
# 30-bin histogram of the `target` column.
data_train['target'].hist(bins=30)

In [None]:
# 30-bin histogram of the `standard_error` column.
data_train['standard_error'].hist(bins=30)

In [None]:
# 30-bin histogram of `target` shifted down by its `standard_error`.
(data_train['target'] - data_train['standard_error']).hist(bins=30)

In [None]:
# 30-bin histogram of `target` shifted up by its `standard_error`.
(data_train['target'] + data_train['standard_error']).hist(bins=30)

In [None]:
# Display the test frame again; it has not been modified since it was loaded.
data_test

## Explore

In [None]:
# Frequency of each distinct value in the `license` column.
data_train['license'].value_counts()

In [None]:
# Show the `excerpt` text of the row with index label 0 as an example.
data_train['excerpt'][0]

## Build pipeline process

In [None]:
######## SUPPORTING CLASSES ########
class PipelineLogger:
    """Mixin that prints START / FINISH banners and records the elapsed time.

    ``log_start`` stores the current time in ``start_time``; ``log_finish``
    stores the seconds elapsed since then in ``duration``. Both banners carry
    the name of the concrete class that called them.
    """

    def __init__(self):
        """Create no state; ``start_time`` first appears when ``log_start`` runs."""

    def log_start(self):
        """Remember the current time and print the START banner."""
        self.start_time = time()
        label = type(self).__name__
        print(f'======== {label} - START ========')

    def log_finish(self):
        """Store the seconds since ``log_start`` and print the FINISH banner."""
        self.duration = time() - self.start_time
        label = type(self).__name__
        print(f'======== {label} - FINISH =======> Take: {self.duration:.6f}(s)')

class _TypedFrameHstackMixin:
    """Shared ``_hstack`` override that returns a DataFrame instead of an array.

    Mix this in *before* ``FeatureUnion`` / ``ColumnTransformer`` so that
    ``super()._hstack`` resolves to the scikit-learn implementation. The
    stacked result is wrapped in a DataFrame that carries the column names of
    the individual transformer outputs, and every column is cast back to the
    dtype it had before stacking.

    Every element of ``Xs`` must be DataFrame-like (expose ``.columns`` and
    ``.dtypes``); transformers that return plain arrays are not supported.
    """

    def _hstack(self, Xs, *args, **kwargs):
        """Stack ``Xs`` with the parent implementation and restore names/dtypes.

        Parameters
        ----------
        Xs : sequence of DataFrame
            Outputs of the individual transformers, in order.
        *args, **kwargs
            Forwarded untouched to the parent ``_hstack`` so the override keeps
            working if the installed scikit-learn calls it with extra arguments.

        Returns
        -------
        pandas.DataFrame
            Columns are the concatenation of the inputs' columns; the index is
            a fresh default index.
        """
        # Flatten with plain Python so column labels keep their original type
        # (np.hstack would coerce mixed labels to a single string dtype).
        cols = [col for X in Xs for col in X.columns]
        dtypes = [str(dtype) for X in Xs for dtype in X.dtypes]
        data = pd.DataFrame(super()._hstack(Xs, *args, **kwargs), columns=cols)
        print('====Converting columns types====')
        # The DataFrame is rebuilt from the parent's stacked output, so the
        # per-column dtypes recorded above are re-applied one by one.
        for col, dtype in tqdm(zip(cols, dtypes), total=len(cols)):
            data[col] = data[col].astype(dtype)
        return data


class featureUnion(_TypedFrameHstackMixin, FeatureUnion):
    """``FeatureUnion`` whose combined output is a typed, named DataFrame."""


class columnTransformer(_TypedFrameHstackMixin, ColumnTransformer):
    """``ColumnTransformer`` whose combined output is a typed, named DataFrame."""

class ExperimentBase(BaseEstimator):
    """Classification evaluation helpers layered on top of ``BaseEstimator``.

    The methods call ``self.predict`` and ``self.predict_proba`` and read
    ``self.classes_``. None of these are defined here, so a subclass has to
    provide them.
    """

    def evaluate(self, X_test, y_test):
        """Print a classification report, plot ROC curves and collect metrics.

        Parameters
        ----------
        X_test : features passed to ``predict`` / ``predict_proba``.
        y_test : true labels; must support ``== label`` followed by ``.astype``.

        Returns
        -------
        dict
            The AUC entries produced by ``auc_report`` plus ``precision``,
            ``recall``, ``f1_score`` and ``support`` as returned by
            ``precision_recall_fscore_support``.
        """
        print('Evaluating model')
        # Predict once and reuse the result for both the report and the
        # precision/recall computation.
        y_pred = self.predict(X_test)
        print(classification_report(y_true=y_test, y_pred=y_pred))
        metrics = self.auc_report(X_test, y_test)
        (metrics['precision'],
         metrics['recall'],
         metrics['f1_score'],
         metrics['support']) = precision_recall_fscore_support(y_test, y_pred)
        return metrics

    def auc_report(self, X, y_true):
        """Print per-class AUC, plot one-vs-rest ROC curves and return the AUCs.

        Column ``i`` of ``predict_proba(X)`` is scored against the indicator
        ``y_true == classes_[i]``.

        Returns
        -------
        dict
            ``'macro_auc'`` plus one ``'auc_<class>'`` entry per class.
        """
        classes = self.classes_
        y_pred_classes = self.predict_proba(X)
        n_classes = len(classes)

        lw = 2
        # One-vs-rest indicator of the true label for every class, built once.
        is_class = [(y_true == classes[i]).astype(int) for i in range(n_classes)]

        for i in range(n_classes):
            print(f"""{classes[i]}: {roc_auc_score(y_true=is_class[i], y_score=y_pred_classes[:,i])}""")

        # Compute ROC curve and ROC area for each class
        fpr = dict()
        tpr = dict()
        roc_auc = dict()

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_true=is_class[i], y_score=y_pred_classes[:,i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Union of every false-positive-rate grid point seen across classes
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

        # Interpolate each class's ROC curve onto the shared grid and sum
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])

        # Average over classes to obtain the macro curve, then its AUC
        mean_tpr /= n_classes

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # Plot all ROC curves
        plt.figure()

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]),
                 color='navy', linestyle=':', linewidth=4)

        # Three colours are cycled, so they repeat when there are more classes
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                     label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(classes[i], roc_auc[i]))

        # Diagonal reference line from (0, 0) to (1, 1)
        plt.plot([0, 1], [0, 1], 'k--', lw=lw)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Some extension of Receiver operating characteristic to multi-class')
        plt.legend(loc="lower right")
        plt.show()
        metrics = {
            'macro_auc': roc_auc["macro"]
        }
        for i in range(n_classes):
            metrics[f'auc_{classes[i]}'] = roc_auc[i]
        return metrics
    
class simpleImputer(SimpleImputer):
    """``SimpleImputer`` that returns a DataFrame with the fitted names and dtypes.

    ``fit`` must be given a DataFrame because the column names and dtypes are
    recorded from it. ``transform`` wraps the imputed array in a DataFrame,
    keeps the row index of its input and casts every column back to the dtype
    seen during ``fit``.
    """

    def fit(self, X, y=None):
        """Record the column names / dtypes of ``X`` and fit the parent imputer."""
        self._cols = X.columns.tolist()
        self._dtypes = [str(dtype) for dtype in X.dtypes]
        super().fit(X, y)
        return self

    def transform(self, X):
        """Impute ``X`` and return the result as a typed DataFrame.

        The output keeps ``X``'s index when it has one, so rows stay aligned
        with the caller's data; inputs without an index get a default one.
        """
        imputed = super().transform(X)
        frame = pd.DataFrame(imputed, columns=self._cols, index=getattr(X, 'index', None))
        # Restore the dtypes recorded during fit, one column at a time.
        for col, dtype in tqdm(zip(self._cols, self._dtypes), total=len(self._cols)):
            frame[col] = frame[col].astype(dtype)
        return frame
######## DONE SUPPORTING CLASSES ########

In [None]:
# Scratch cell: builds a (step name, imputer) tuple and displays it; the value
# is not assigned to anything.
('max_imputor', simpleImputer(strategy='constant', fill_value='unk'))

In [None]:
class TextLowerer(BaseEstimator, TransformerMixin, PipelineLogger):
    """Select a set of columns and lowercase every value in them.

    Parameters
    ----------
    columns : list
        Columns to select from the input frame; only these are returned.
    """

    def __init__(self, columns):
        super().__init__()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X[self.columns]`` with ``.lower()`` applied per value."""
        lowered = X[self.columns].copy()
        for name in lowered.columns:
            lowered[name] = lowered[name].apply(lambda text: text.lower())
        return lowered

class TextSpliter(BaseEstimator, TransformerMixin, PipelineLogger):
    """Replace separator substrings in selected text columns with a space.

    Parameters
    ----------
    columns : list
        Columns to select from the input frame; only these are returned.
    spliters : list of str or None
        Literal separator strings. Every occurrence of any of them is replaced
        by a single space. ``None`` falls back to ``':'``, ``'/'``, ``'_'`` and
        ``'-'``; an empty list leaves the text unchanged.
    """

    # Separators used when ``spliters`` is None.
    _DEFAULT_SPLITERS = (':', '/', '_', '-')

    def __init__(self, columns, spliters):
        super().__init__()
        self.spliters = spliters
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X[self.columns]`` with the separators replaced."""
        import re  # local import: only needed to escape the separators

        X_ = X[self.columns].copy()
        spliters = self._DEFAULT_SPLITERS if self.spliters is None else self.spliters
        # Drop empty strings (they would match everywhere) and try longer
        # separators first so e.g. '--' wins over '-'.
        literals = sorted((s for s in spliters if s), key=len, reverse=True)
        if not literals:
            return X_
        # Escape each separator so it is matched literally, not as regex syntax.
        pattern = '(' + '|'.join(re.escape(s) for s in literals) + ')'
        for col in self.columns:
            X_[col] = X_[col].str.replace(pat=pattern, repl=' ', regex=True)
        return X_
    
class PassThroughExcept(BaseEstimator, TransformerMixin, PipelineLogger):
    """Pass every column through except those chosen by a callable at fit time.

    Parameters
    ----------
    col_except_func : callable
        Called as ``col_except_func(X)`` during ``fit``; its result is the
        collection of column names to leave out in ``transform``.
    """

    def __init__(self, col_except_func):
        super().__init__()
        self.col_except_func = col_except_func

    def fit(self, X, y=None):
        """Evaluate ``col_except_func`` on ``X`` and remember the excluded columns."""
        self.except_cols = self.col_except_func(X)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the non-excluded columns, logging the timing."""
        self.log_start()
        kept = [name for name in X.columns if name not in self.except_cols]
        passed = X[kept]
        self.log_finish()
        return passed

In [None]:
# Display the training frame again for reference.
data_train

In [None]:
# Scratch check of the separator regex: ':', '/', '_' and '-' in `url_legal`
# are each replaced by a space. The result is displayed, not stored.
data_train['url_legal'].str.replace(pat='(:|/|_|-)',repl=' ', regex=True)

In [None]:
# Scratch cell: display an empty Series with object dtype.
pd.Series(dtype='object')

In [None]:
class TextCombinator(BaseEstimator, TransformerMixin, PipelineLogger):
    """Concatenate the columns seen during ``fit`` into one text Series.

    Each column is appended with a single space in front of it, so every
    combined string begins with a space.
    """

    def fit(self, X, y=None):
        """Remember the column names of ``X``; returns ``self``."""
        self.cols = X.columns.to_list()
        return self

    def transform(self, X):
        """Return a Series named ``'comb_text'`` built from the fitted columns."""
        frame = X.copy()
        frame['comb_text'] = ''
        for name in self.cols:
            # Values are read from the untouched input, not from the working copy.
            frame['comb_text'] = frame['comb_text'] + (' ' + X[name])
        return frame['comb_text']

In [None]:
# Scratch cell: display an SVR constructed with its default arguments.
SVR()

In [None]:
# Metadata columns whose separators are split; other columns pass through as-is.
metadata_columns = ['url_legal', 'license']

# Two parallel branches whose outputs are placed side by side.
branch_transformers = [
    ('text_spliting', TextSpliter(columns=metadata_columns, spliters=[':', '/', '_', '-'])),
    ('pass_through', PassThroughExcept(
        col_except_func=lambda X: [c for c in X.columns if c in ['url_legal', 'license']])),
]

# Despite its name, the pipeline ends in the SVR model: text clean-up, then
# bag-of-words counts, TF-IDF weighting and the regressor.
pipeline_steps = [
    # Fill missing values with the constant string 'unk'.
    ('unk_imputing', simpleImputer(strategy='constant', fill_value='unk')),
    # Keep only the three text columns and lowercase them.
    ('text_lowering', TextLowerer(columns=['url_legal', 'license', 'excerpt'])),
    ('feature_processing', featureUnion(transformer_list=branch_transformers)),
    # Merge all remaining columns into a single text field.
    ('combine_text', TextCombinator()),
    ('vect', CountVectorizer(ngram_range=(1, 1), max_df=0.9, max_features=None)),
    ('tfidf', TfidfTransformer()),
    ('clf', SVR(kernel="rbf", gamma='scale', C=2)),
]

pl_preprocess = Pipeline(steps=pipeline_steps)

In [None]:
# Display the pipeline; set_config(display='diagram') in the import cell
# selects the diagram representation.
pl_preprocess

In [None]:
# Features are every training column except the label and its standard error;
# the pipeline is fitted against `target`.
train_features = data_train.drop(columns=['target', 'standard_error'])
pl_preprocess.fit(train_features, data_train['target'])

In [None]:
# Display the training frame again for reference.
data_train

In [None]:
# Display the test frame again before predicting on it.
data_test

In [None]:
# Attach the pipeline's predictions to a new frame; `data_test` itself stays untouched.
data_test_copy = data_test.assign(target=pl_preprocess.predict(data_test))

In [None]:
# Display the test frame with the predicted `target` column added.
data_test_copy

In [None]:
# Write only the id and predicted target, without the index column.
submission = data_test_copy[['id', 'target']]
submission.to_csv('submission.csv', index=False)

In [None]:
# In-sample predictions: the pipeline scores the same rows it was fitted on.
data_train_copy = data_train.assign(
    pred=pl_preprocess.predict(data_train.drop(columns=['target', 'standard_error']))
)

In [None]:

# Display the training frame with the in-sample predictions in `pred`.
data_train_copy

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# RMSE of the in-sample predictions. The square root is taken explicitly
# because the `squared=False` keyword is deprecated in recent scikit-learn
# releases; sqrt(MSE) gives the same value on every version.
np.sqrt(mean_squared_error(data_train_copy['target'], data_train_copy['pred']))

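The training RMSE (~0.269) is much lower than the submission score (~0.750).

=> The model is probably overfitting. Note that the training RMSE above is computed on the same rows the pipeline was fitted on, so it is an optimistic estimate; a held-out validation split would give a fairer comparison.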