In [None]:
# Demo notebook: linear and ridge regression models predicting journal CiteScore

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn import linear_model
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.metrics import (
    PredictionErrorDisplay,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import RepeatedKFold, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the raw journal-ranking table.
raw_dataset = pd.read_csv("../data/journal_ranking_data.csv")

# Row labels of journals whose CiteScore exceeds 100; those rows are removed first.
outlier_index = raw_dataset[raw_dataset["CiteScore"] > 100].index

# Then drop rows with any missing value, and keep only the first row for each
# (CiteScore, Cites/Doc. 2y) pair.
clean_dataset = (
    raw_dataset.drop(outlier_index)
    .dropna()
    .drop_duplicates(subset=["CiteScore", "Cites/Doc. 2y"])
)

In [None]:
# Distribution of the target variable (CiteScore) in the cleaned data.
fig = px.histogram(
    data_frame=clean_dataset,
    x="CiteScore",
    nbins=400,
)
fig.show()

In [None]:
# Distribution of the single predictor (Cites/Doc. 2y) in the cleaned data.
fig = px.histogram(
    data_frame=clean_dataset,
    x="Cites/Doc. 2y",
    nbins=400,
)
fig.show()

In [None]:
# Hold out 20% of the cleaned rows for testing. The fixed seed keeps the split,
# and every score computed from it, identical across kernel restarts.
train, test = train_test_split(clean_dataset, test_size=0.2, random_state=42)

In [None]:
# Export feature and target as NumPy column vectors. Selecting with a
# one-element list keeps the (n_samples, 1) shape without a reshape.
X = train[["Cites/Doc. 2y"]].to_numpy()
y = train[["CiteScore"]].to_numpy()
X_test = test[["Cites/Doc. 2y"]].to_numpy()
y_test = test[["CiteScore"]].to_numpy()

In [None]:
# Ordinary least-squares linear model for the single-feature fit.
reg = linear_model.LinearRegression()

In [None]:
# Fit on the training split (X: Cites/Doc. 2y, y: CiteScore).
reg.fit(X, y)

In [None]:
# Show the fitted coefficient(s) and intercept.
reg.coef_, reg.intercept_

In [None]:
# Evaluate the fitted line on the integer grid 0..99 for plotting.
xvals = np.arange(100)
# predict() expects a 2-D input, so add a trailing feature axis.
yvals = reg.predict(xvals[:, np.newaxis])
# The model was fit on a column-vector target, so the predictions are 2-D;
# show them as a flat sequence.
yvals[:, 0]

In [None]:
# Training points with the fitted regression line drawn on top.
fig = px.scatter(train, x="Cites/Doc. 2y", y="CiteScore")
fitted_line = go.Scatter(x=xvals, y=yvals[:, 0])
fig.add_trace(fitted_line)
fig.show()

In [None]:
# Raw training values of the feature/target pair as a NumPy array.
train[["Cites/Doc. 2y", "CiteScore"]].values

In [None]:
# Pairwise plots of feature vs. target with regression fits; KDE on the diagonal.
sns.pairplot(train[["Cites/Doc. 2y", "CiteScore"]], kind="reg", diag_kind="kde")

In [None]:
# Score both splits with the median absolute error.
# Note: despite the `mae_` prefix these hold *median* (not mean) absolute errors.
y_test_predict = reg.predict(X_test)
mae_test = median_absolute_error(y_test, y_test_predict)
mae_train = median_absolute_error(y, reg.predict(X))

In [None]:
# Median absolute error on the training split.
mae_train

In [None]:
# Median absolute error on the held-out test split.
mae_test

In [None]:
# Legend labels for the prediction-error plot, rounded to one decimal place.
scores = {
    "MedAE on training set": format(mae_train, ".1f"),
    "MedAE on testing set": format(mae_test, ".1f"),
}

In [None]:
# Actual-vs-predicted scatter for the single-feature linear model on the test split.
_, ax = plt.subplots(figsize=(5, 5))
# Bound to `error_display` rather than `display` so that IPython's built-in
# display() helper is not shadowed for the rest of the kernel session.
error_display = PredictionErrorDisplay.from_predictions(
    y_test,
    y_test_predict,
    kind="actual_vs_predicted",
    ax=ax,
    scatter_kwargs={"alpha": 0.5},
)
ax.set_title("Linear")
# Empty, invisible artists: they exist only so each score gets a legend entry.
for name, score in scores.items():
    ax.plot([], [], " ", label=f"{name}: {score}")
ax.legend(loc="upper left")
plt.tight_layout()

# Now include more features in the fit

In [None]:
# Preview the first rows of the cleaned dataset to see the available columns.
clean_dataset.head()

In [None]:
# Predictor columns for the multi-feature model; "CiteScore" is the target.
selected_columns = [
    "SJR-index",
    "H-index",
    "Total Docs.",
    "Total Docs. 3y",
    "Total Refs.",
    "Total Cites 3y",
    "Citable Docs. 3y",
    "Cites/Doc. 2y",
    "Refs./Doc.",
]
multiple_features = clean_dataset[selected_columns + ["CiteScore"]].copy()
# Drop rows that are exact duplicates across the selected columns.
multiple_features = multiple_features.drop_duplicates()
# Fixed seed so the 80/20 split, and every score derived from it, is
# reproducible across kernel restarts.
train, test = train_test_split(multiple_features, test_size=0.2, random_state=42)

In [None]:
# Summary of the training split: columns, dtypes and non-null counts.
train.info()

In [None]:
# Split each partition into its feature frame and its CiteScore target series.
X, y = train[selected_columns], train["CiteScore"]
X_test, y_test = test[selected_columns], test["CiteScore"]

In [None]:
# Pairwise relationships between all selected columns and the target.
# Observation: the CiteScore distribution has a long tail; taking its log
# could bring it closer to a normal distribution.
sns.pairplot(data=train, kind="reg", diag_kind="kde")
plt.show()

In [None]:
# Every selected feature is standardized; `scale_columns` is the same list
# object as `selected_columns`, kept under a name that states its role.
scale_columns = selected_columns

preprocessor = make_column_transformer((StandardScaler(), scale_columns))

In [None]:
# Ridge with a near-zero penalty (alpha=1e-10).
ridge = linear_model.Ridge(alpha=1e-10)

# Pipeline: scale the features, then fit the ridge regressor. No target
# transform is configured on the TransformedTargetRegressor here.
model = make_pipeline(preprocessor, TransformedTargetRegressor(regressor=ridge))
model.fit(X, y)

In [None]:
# The pipeline was already fitted in the previous cell; display it instead of
# refitting on the same data a second time.
model

In [None]:
# Median absolute error of the ridge pipeline on each split
# (the `mae_` prefix notwithstanding, these are medians, not means).
y_pred = model.predict(X_test)
mae_test = median_absolute_error(y_test, y_pred)
mae_train = median_absolute_error(y, model.predict(X))

# Legend labels for the prediction-error plot, rounded to two decimals.
scores = {
    "MedAE on training set": format(mae_train, ".2f") + " CiteScore",
    "MedAE on testing set": format(mae_test, ".2f") + " CiteScore",
}

In [None]:
# Show the formatted train/test scores.
print(scores)

In [None]:
# Actual-vs-predicted scatter for the ridge pipeline on the test split.
_, ax = plt.subplots(figsize=(5, 5))
# Bound to `error_display` rather than `display` so that IPython's built-in
# display() helper is not shadowed for the rest of the kernel session.
error_display = PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred,
    kind="actual_vs_predicted",
    ax=ax,
    scatter_kwargs={"alpha": 0.5},
)
ax.set_title("Ridge model, small regularization")
# Empty, invisible artists: they exist only so each score gets a legend entry.
for name, score in scores.items():
    ax.plot([], [], " ", label=f"{name}: {score}")
ax.legend(loc="upper left")
plt.tight_layout()

In [None]:
# Output feature names from the preprocessing part of the pipeline (all steps but the last).
feature_names = model[:-1].get_feature_names_out()

# Fitted regressor wrapped by the final TransformedTargetRegressor step.
fitted_ridge = model[-1].regressor_
coefs = pd.DataFrame(
    data=fitted_ridge.coef_,
    index=feature_names,
    columns=["Coefficients importance"],
)

# Horizontal bar chart of the coefficients, with a reference line at zero.
coefs.plot.barh(figsize=(9, 7))
plt.title("Ridge model, small regularization, normalized variables")
plt.xlabel("Raw coefficient values")
plt.axvline(x=0, color=".5")
plt.subplots_adjust(left=0.3)

In [None]:
# Repeated K-fold splitter: 5 folds x 5 repeats gives 25 fitted pipelines,
# with a fixed seed so the folds are reproducible.
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0)

# Refit the pipeline on every fold and keep the fitted estimators so their
# coefficients can be compared.
cv_model = cross_validate(
    model,
    X,
    y,
    cv=cv,
    return_estimator=True,
    n_jobs=2,
)

# One row per fitted pipeline, one column per (preprocessed) feature.
coefs = pd.DataFrame(
    [est[-1].regressor_.coef_ for est in cv_model["estimator"]], columns=feature_names
)

In [None]:
# Spread of each coefficient across the cross-validation fits:
# individual points (strip plot) overlaid with a box plot.
plt.figure(figsize=(9, 7))
shared_kwargs = {"data": coefs, "orient": "h"}
sns.stripplot(**shared_kwargs, palette="dark:k", alpha=0.5)
sns.boxplot(**shared_kwargs, color="cyan", saturation=0.5, whis=10)
plt.axvline(x=0, color=".5")  # reference line at zero
plt.title("Coefficient variability")
plt.subplots_adjust(left=0.3)
# Save before show() so the written file contains the rendered figure.
plt.savefig("coeff_variability.png")
plt.show()
            

Be careful when inferring feature importance from the coefficients!
Coefficients are only comparable across features once the features have been scaled to a common range (e.g. standardized).
Machine learning models are generally unable to infer causal effects, because unobserved confounding variables are likely to inflate or deflate a coefficient.