running nbqa on CI, formats notebooks (#280)

ploomber · Feb 9, 2023 · 9c0c894 · 9c0c894
1 parent 9202881
commit 9c0c894
Show file tree

Hide file tree

Showing 22 changed files with 379 additions and 266 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -26,8 +26,12 @@ jobs:
       shell: bash -l {0}
       run: |
         eval "$(conda shell.bash hook)"
+        # run flake8 on .py files
         pip install flake8
         flake8
+        # run flake8 on notebooks (.ipynb, .md, etc)
+        pip install jupytext nbqa
+        nbqa flake8 .
 
     - name: Run tests
       env:

diff --git a/docs/classification/basic.md b/docs/classification/basic.md
@@ -18,16 +18,14 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn import datasets
 from sklearn_evaluation import plot, table
-import matplotlib.pyplot as plt
 ```
 
 sklearn-evaluation has two main modules for evaluating classifiers: [sklearn_evaluation.plot](../api/plot.rst) and [sklearn_evaluation.table](../api/table.rst). Let’s see an example of how to use them.
 
 First, let’s load some data and split it into training and test sets.
 
 ```{code-cell} ipython3
-data = datasets.make_classification(200, 10, n_informative=5,
-                                    class_sep=0.65)
+data = datasets.make_classification(200, 10, n_informative=5, class_sep=0.65)
 ```
 
 ```{code-cell} ipython3

diff --git a/docs/classification/calibration.md b/docs/classification/calibration.md
@@ -42,13 +42,19 @@ from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
 
 X, y = make_classification(
-    n_samples=20000, n_features=4, n_informative=2, n_redundant=2, class_sep=0.7, random_state=0)
+    n_samples=20000,
+    n_features=4,
+    n_informative=2,
+    n_redundant=2,
+    class_sep=0.7,
+    random_state=0,
+)
 
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=0
 )
 
-svc = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
+svc = make_pipeline(StandardScaler(), SVC(gamma="auto", probability=True))
 svc_probas = svc.fit(X_train, y_train).predict_proba(X_test)
 
 lr = LogisticRegression()
@@ -90,18 +96,31 @@ Logistic Regression returns well calibrated predictions by default. In contrast,
 
 ```{code-cell} ipython3
 import matplotlib.pyplot as plt
+
 fig, axs = plt.subplots(2, 2, figsize=(14, 11))
-plot.scores_distribution(svc_probas[:, 1], n_bins=10, 
-                                          title="SVC", color="#F5D065", ax=axs[0,0])
+plot.scores_distribution(
+    svc_probas[:, 1], n_bins=10, title="SVC", color="#F5D065", ax=axs[0, 0]
+)
 
-plot.scores_distribution(rf_probas[:, 1], n_bins=10, 
-                                          title="Random Forest", color="#30AEE6", ax=axs[0,1])
+plot.scores_distribution(
+    rf_probas[:, 1], n_bins=10, title="Random Forest", color="#30AEE6", ax=axs[0, 1]
+)
 
-plot.scores_distribution(lr_probas[:, 1], n_bins=10, 
-                                          title="Logistic Regression", color="#E568EE",ax=axs[1,0])
+plot.scores_distribution(
+    lr_probas[:, 1],
+    n_bins=10,
+    title="Logistic Regression",
+    color="#E568EE",
+    ax=axs[1, 0],
+)
 
-plot.scores_distribution(nb_probas[:, 1], n_bins=10, 
-                                          title="Gaussian Naive Bayes", color="#5BAF69", ax=axs[1,1])
+plot.scores_distribution(
+    nb_probas[:, 1],
+    n_bins=10,
+    title="Gaussian Naive Bayes",
+    color="#5BAF69",
+    ax=axs[1, 1],
+)
 ```
 
 Gaussian Naive Bayes tends to push probabilities to 0 or 1 (note the counts in the histograms). There aren't many observations in the 0.2 to 0.8 region. Logistic regression has good support all along the 0.0 to 1.0 area. SVC and Random Forest also tend to push probabilities toward 0.0 and 1.0; however, their support in the 0.2 to 0.8 region is better than that of Gaussian Naive Bayes.

diff --git a/docs/classification/compare.md b/docs/classification/compare.md
@@ -53,9 +53,6 @@ tree_pred, forest_pred = [
     est.fit(X_train, y_train).predict(X_test)
     for est in [DecisionTreeClassifier(), RandomForestClassifier()]
 ]
-
-
-
 ```
 
 ### Decision tree confusion matrix
@@ -77,13 +74,14 @@ compare = tree_cm + forest_cm
 ```
 
 ```{code-cell} ipython3
-diff = forest_cm - tree_cm 
+diff = forest_cm - tree_cm
 ```
 
 ## ROC
 
 ```{code-cell} ipython3
 :tags: [remove-output]
+
 logistic_score, forest_score = [
     est.fit(X_train, y_train).predict_proba(X_test)
     for est in [LogisticRegression(), RandomForestClassifier()]
@@ -145,6 +143,7 @@ diff = forest_cr - tree_cr
 
 ```{code-cell} ipython3
 :tags: [remove-output]
+
 tree_score, forest_score = [
     est.fit(X_train, y_train).predict_proba(X_test)
     for est in [DecisionTreeClassifier(), RandomForestClassifier()]
@@ -154,18 +153,25 @@ tree_score, forest_score = [
 ### Decision tree PR
 
 ```{code-cell} ipython3
-tree_pr = plot.PrecisionRecall.from_raw_data(y_test, tree_score, label=["Decision Tree Class 1", "Decision Tree Class 2", "Decision Tree Class 3"])
+tree_pr = plot.PrecisionRecall.from_raw_data(
+    y_test,
+    tree_score,
+    label=["Decision Tree Class 1", "Decision Tree Class 2", "Decision Tree Class 3"],
+)
 ```
 
 ### Random forest PR
 
 ```{code-cell} ipython3
-forest_pr = plot.PrecisionRecall.from_raw_data(y_test, forest_score, label=["Random Forest Class 1", "Random Forest Class 2", "Random Forest Class 3"])
+forest_pr = plot.PrecisionRecall.from_raw_data(
+    y_test,
+    forest_score,
+    label=["Random Forest Class 1", "Random Forest Class 2", "Random Forest Class 3"],
+)
 ```
 
 ### Compare PR
 
 ```{code-cell} ipython3
 compare = tree_pr + forest_pr
 ```
-
diff --git a/docs/classification/evaluate.md b/docs/classification/evaluate.md
@@ -37,13 +37,17 @@ matplotlib.rcParams["font.size"] = 18
 
 ```{code-cell} ipython3
 # Generate a dataset with low class_sep value
-X, y = make_classification(n_samples=1000,
-                           n_features=20,
-                           n_informative=10,
-                           class_sep=0.8,
-                           n_classes=2,
-                           random_state=0)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
+X, y = make_classification(
+    n_samples=1000,
+    n_features=20,
+    n_informative=10,
+    class_sep=0.8,
+    n_classes=2,
+    random_state=0,
+)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=123
+)
 model = LogisticRegression(random_state=101)
 model = model.fit(X_train, y_train)
 ```
@@ -58,13 +62,15 @@ plt.show()
 
 ```{code-cell} ipython3
 X, y = load_data(return_X_y=True)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=123
+)
 
 # Create classifier instance
 lr = LogisticRegression(max_iter=2000)
 
 # Fit the model
-lr.fit(X_train,y_train)
+lr.fit(X_train, y_train)
 y_probas = lr.predict_proba(X_test)
 ```
 

diff --git a/docs/classification/grid_search.md b/docs/classification/grid_search.md
@@ -24,30 +24,36 @@ Here we are going to use the HeartDiseasesUCI dataset.
 
 ```{code-cell} ipython3
 import urllib.request
-import pandas as pd 
+import pandas as pd
 
-# download dataset. Reference: https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data
-urllib.request.urlretrieve('https://raw.githubusercontent.com/sharmaroshan/Heart-UCI-Dataset/master/heart.csv', filename='heart.csv')
+# download dataset
+# Reference: https://www.kaggle.com/datasets/redwankarimsony/heart-disease-data
+urllib.request.urlretrieve(
+    "https://raw.githubusercontent.com/sharmaroshan/Heart-UCI-Dataset/master/heart.csv",
+    filename="heart.csv",
+)
 
-data = pd.read_csv('heart.csv')
+data = pd.read_csv("heart.csv")
 
 data.head()
 ```
 
 ## Specify variables
 
 ```{code-cell} ipython3
-X = data.drop('target', axis = 1)
+X = data.drop("target", axis=1)
 
-y = data['target']
+y = data["target"]
 ```
 
 ## Split the data
 
 ```{code-cell} ipython3
 from sklearn.model_selection import train_test_split
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 2023)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=2023
+)
 ```
 
 ## Initialize the model
@@ -60,7 +66,7 @@ For this example we will use a tiny grid.
 ```{code-cell} ipython3
 from sklearn_evaluation.grid import RandomForestClassifierGrid
 
-model = RandomForestClassifierGrid(grid='tiny')
+model = RandomForestClassifierGrid(grid="tiny")
 ```
 
 ## Train all models
@@ -69,7 +75,6 @@ model = RandomForestClassifierGrid(grid='tiny')
 model.fit(X_train, y_train)
 ```
 
-
 # Evaluate
 
 `RandomForestClassifierGrid` uses `GridSearchCV` under the hood; we can easily access it via the `grid_search_cv_` attribute.
@@ -114,4 +119,4 @@ model.feature_importances()
 
 ```{code-cell} ipython3
 model.grid_search_results()
-```
+```
diff --git a/docs/classification/imbalance.md b/docs/classification/imbalance.md
@@ -21,12 +21,10 @@ It's essential to understand the class imbalance before implementing any resampl
 
 ```{code-cell} ipython3
 import matplotlib
-import matplotlib.pyplot as plt
 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
 
 from sklearn_evaluation import plot
-from sklearn import datasets
 ```
 
 ```{code-cell} ipython3
@@ -36,12 +34,17 @@ matplotlib.rcParams["font.size"] = 18
 
 ```{code-cell} ipython3
 X, y = make_classification(
-    n_samples=1000, n_features=5, n_informative=3, n_classes=2, 
+    n_samples=1000,
+    n_features=5,
+    n_informative=3,
+    n_classes=2,
     # Set label 0 for 85% and 1 for the remaining 15% of observations
-    weights=[0.85], 
+    weights=[0.85],
 )
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
 ```
 
 ##### Balance Mode