In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer

%matplotlib inline

In [None]:
# Load the training table, then order its rows by the created_utc column.
df = pd.read_csv('../input/reddit-upvote-prediction/Train_v2.csv')
df = df.sort_values(by='created_utc')

In [None]:
# Peek at five randomly drawn rows.
df.sample(n=5)

In [None]:
# Force the score column to numeric; entries that cannot be parsed become NaN
# and are then replaced by 0.
score_numeric = pd.to_numeric(df['score'], errors='coerce')
df['score'] = score_numeric.fillna(0)

In [None]:
# Histogram of the numeric score: 51 bins, log-scaled counts.
score_values = df['score']
score_values.hist(log=True, bins=51);

In [None]:
# Map the score column onto a normal-shaped distribution through its empirical
# quantiles (101 landmarks) and keep the fitted transformer so values can be
# mapped back later with score_quantile.inverse_transform.
# random_state pins the random row subsampling the transformer applies to
# large inputs, so score_q is identical after a kernel restart.
score_quantile = QuantileTransformer(
    n_quantiles=101,
    output_distribution='normal',
    random_state=0,
)
# fit_transform returns a 2-D (n_rows, 1) array; take its only column.
df['score_q'] = score_quantile.fit_transform(df[['score']])[:, 0]

In [None]:
# Histogram of the quantile-transformed score, 21 bins.
score_q_values = df['score_q']
score_q_values.hist(bins=21);

## ML: TF-IDF + LinearSVR with time-series cross-validation

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
# Visualise the TimeSeriesSplit layout over row positions: one horizontal lane
# per fold, training span in light blue and validation span in light gray.
x = np.arange(len(df))

n_folds = 5
# Cap each training window at 2/n_folds of the rows.
folds = TimeSeriesSplit(n_folds, max_train_size=2*len(x)//n_folds)

for i, (train, valid) in enumerate(folds.split(x), 1):
    # Fold numbering starts at 1, so the legend labels are attached to fold 1.
    # Labelling only one fold keeps the legend at a single entry per colour.
    is_first_fold = i == 1
    plt.broken_barh([[train[0], train[-1]-train[0]]], (i, 0.8), color='lightblue', label='train' if is_first_fold else None)
    plt.broken_barh([[valid[0], valid[-1]-valid[0]]], (i, 0.8), color='lightgray', label='valid' if is_first_fold else None)

# Put fold 1 at the top so the lanes read top-to-bottom.
plt.gca().invert_yaxis()
plt.xlabel('row position')
plt.ylabel('fold')
plt.legend();

In [None]:
%%time
df_text = df.drop(['score_q', 'score'], axis=1).apply(lambda x: ' '.join(map(str, x.values)), axis=1)
df_text.sample(10)

In [None]:
# Splitter used for model training: n_folds time-ordered splits, each training
# window capped at 2/n_folds of the rows.
max_train_rows = 2 * len(df) // n_folds
folds = TimeSeriesSplit(n_splits=n_folds, max_train_size=max_train_rows)


In [None]:
# Out-of-fold predictions from a TF-IDF + LinearSVR pipeline, fitted once per
# TimeSeriesSplit fold. Rows that no fold validates on keep NaN in y_oof.
y_oof = np.full(len(df), np.nan)

# Train on the quantile-normalised target. The diagnostic cell further down
# maps both y and y_oof back to raw scores with
# score_quantile.inverse_transform, which is only meaningful for values that
# live in the transformed space.
y = df['score_q'].values

for fold_num, (train, valid) in enumerate(tqdm(folds.split(df), total=n_folds)):
    # train / valid are positional row indices, hence .iloc on the text series.
    X_train = df_text.iloc[train]
    X_valid = df_text.iloc[valid]

    y_train = y[train]
    y_valid = y[valid]

    # A fresh pipeline per fold, so the vocabulary is learned from that fold's
    # training rows only.
    model = make_pipeline(
        TfidfVectorizer(min_df=10, max_df=0.5, max_features=20_000),
        # Fixed seed: the LinearSVR solver shuffles the data, so results would
        # otherwise differ between runs.
        LinearSVR(C=0.1, random_state=0)
    )

    model.fit(X_train, y_train)

    y_oof[valid] = model.predict(X_valid)

In [None]:
# Mask of positions that hold a usable out-of-fold prediction (neither NaN nor
# +/-inf).
y_is_finite = ~np.isnan(y_oof) & ~np.isinf(y_oof)

In [None]:
# Target vs. out-of-fold prediction for rows with a finite prediction,
# restricted to a +/-100 window on both axes.
plot_window = (-100, 100)
plt.scatter(x=y[y_is_finite], y=y_oof[y_is_finite], s=2, alpha=0.2)
plt.xlim(*plot_window)
plt.ylim(*plot_window);

In [None]:
# Same scatter after passing target and prediction through the fitted
# transformer's inverse mapping.
# NOTE(review): inverse_transform expects inputs in the quantile-transformed
# space -- confirm that y and y_oof are expressed in that space.
target_column = y[y_is_finite].reshape(-1, 1)
prediction_column = y_oof[y_is_finite].reshape(-1, 1)

target_restored = score_quantile.inverse_transform(target_column)
prediction_restored = score_quantile.inverse_transform(prediction_column)

plt.scatter(target_restored, prediction_restored, s=2, alpha=0.2)

plt.xlim(-100, 100)
plt.ylim(-100, 100);