diff --git a/datasist/__init__.py b/datasist/__init__.py index 96ada3b0..0aa2cc56 100644 --- a/datasist/__init__.py +++ b/datasist/__init__.py @@ -4,4 +4,3 @@ from . import timeseries from . import visualizations from . import model -from . import nlp \ No newline at end of file diff --git a/datasist/model.py b/datasist/model.py index d4812c9e..fbd0f4b2 100644 --- a/datasist/model.py +++ b/datasist/model.py @@ -4,7 +4,7 @@ ''' import platform -from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, make_scorer +from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, make_scorer, mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error from sklearn.model_selection import KFold, cross_val_score import numpy as np import pandas as pd @@ -232,6 +232,55 @@ def get_classification_report(y_train=None, prediction=None, show_roc_plot=True, plt.savefig("roc_plot.png") +def get_regression_report(y_true=None, prediction=None, show_r2_plot=True, save_plot=False): + ''' + Generates performance report for a regression problem. + + Parameters: + ------------------ + y_true: Array, series, list. + + The truth/ground value from the train data set. + + prediction: Array, series, list. + + The predicted value by a trained model. + + show_r2_plot: Bool, default True. + + Show the r-squared curve. + + save_plot: Bool, default False. + + Save the plot to the current working directory. 
+ + ''' + mae = mean_absolute_error(y_true, prediction) + mse = mean_squared_error(y_true, prediction) + msle = mean_squared_log_error(y_true, prediction) + r2 = r2_score(y_true, prediction) + + print("Mean Absolute Error: ", round(mae, 5)) + print("Mean Squared Error: ", round(mse, 5)) + print("Mean Squared Log Error: ", round(msle, 5)) + print("R-squared Error: ", round(r2, 5)) + print("*" * 100) + + if show_r2_plot: + plt.scatter(y_true,prediction) + plt.xlabel('Truth values') + plt.ylabel('Predicted values') + plt.plot(np.unique(y_true), np.poly1d(np.polyfit(y_true, y_true, 1))(np.unique(y_true))) + plt.text(0.7, 0.2, 'R-squared = %0.2f' % r2) + plt.show() + + if save_plot: + plt.savefig("r2_plot.png") + + + + + def compare_model(models_list=None, x_train=None, y_train=None, scoring_metric=None, scoring_cv=3, silenced=True, plot=True): """ Train multiple user-defined model and display report based on defined metric. Enables user to pick the best base model for a problem. diff --git a/datasist/tests/test_model.py b/datasist/tests/test_model.py index d39ab811..360dbe3c 100644 --- a/datasist/tests/test_model.py +++ b/datasist/tests/test_model.py @@ -6,6 +6,7 @@ from datasist import model + def test_compare_model_classification(): x_train, y_train = make_classification( n_samples=50, @@ -24,6 +25,7 @@ def test_compare_model_classification(): assert type(model_scores) is list assert hasattr(fitted_model[0], "predict") + def test_compare_model_regression(): x_train, y_train = make_classification( n_samples=50, diff --git a/datasist/tests/test_nlp.py b/datasist/tests/test_nlp.py deleted file mode 100644 index 96464d5e..00000000 --- a/datasist/tests/test_nlp.py +++ /dev/null @@ -1,11 +0,0 @@ -from datasist import nlp - - -# Corpus for testing pre-processing -sent = u"14000000 8888 I've been meaning to write this down anyways (in case you're interested and for everyone else who comes across this issue) Going forward, we're actually thinking about encouraging the import syntax 
more, or even making it the default, recommended way of loading models." - - -def test_pre_process(): - expected = '[#####, ####, meaning, write, anyways, (, case, interested, comes, issue, ), Going, forward, ,, actually, thinking, encouraging, import, syntax, ,, making, default, ,, recommended, way, loading, models, .]' - output = nlp.pre_process(sent, True, False, True) - assert expected == output \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index f29b97b9..51c1d3f7 100644 --- a/docs/index.html +++ b/docs/index.html @@ -29,7 +29,7 @@

Module datasist

from . import timeseries from . import visualizations from . import model -from . import nlp +
@@ -43,10 +43,7 @@

Sub-modules

This module contains all functions relating to modeling using the sklearn library.

-
datasist.nlp
-
-

This module contains all functions relating to nlp

-
+
datasist.structdata

This module contains all functions relating to the cleaning and exploration of structured data sets; mostly in pandas format

@@ -80,7 +77,6 @@

logo