In [None]:
#@title Setting up the environment
!pip install evidently googletrans-py
!git clone https://github.com/nzmonzmp/dataset-ames.git
import io
import matplotlib.pyplot as plt
import nltk
import numpy
import pandas
import random
import requests
import scipy.stats
import seaborn
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
import warnings
import zipfile
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides pandas/sklearn deprecation and copy warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources up front.
# NOTE(review): presumably needed by evidently's text descriptors
# (e.g. OOVWordsPercentage) — confirm.
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess(train_file, test_file):
  """Load the train/test CSV files, impute missing values and encode ordinal columns.

  Both files are read with ``Id`` as index. ``SalePrice`` is removed from the
  training features and returned separately. Train and test features are then
  stacked so that the same imputation and encoding are applied to both;
  medians and modes used for imputation are computed on the training rows only.

  Args:
    train_file: path or file-like object of the training CSV (with ``SalePrice``).
    test_file: path or file-like object of the test CSV.

  Returns:
    A ``(train_features, train_target, test_features)`` tuple. The two feature
    frames are positional slices of the stacked frame.
  """
  train_X = pandas.read_csv(train_file, index_col="Id")
  test_X = pandas.read_csv(test_file, index_col="Id")
  train_y = train_X.pop("SalePrice")
  n_train = train_X.shape[0]

  features = pandas.concat([train_X, test_X])

  # Missing lot frontage -> median of the training set.
  median_cols = ["LotFrontage"]
  train_medians = train_X[median_cols].median()
  features[median_cols] = features[median_cols].fillna(train_medians)

  # Missing categorical values -> most frequent value of the training set.
  mode_cols = ["MSZoning", "Electrical", "KitchenQual", "Exterior1st",
               "Exterior2nd", "SaleType", "Utilities"]
  train_modes = train_X[mode_cols].mode().iloc[0, :]
  features[mode_cols] = features[mode_cols].fillna(train_modes)

  # Missing garage / basement / masonry measurements -> 0.
  zero_cols = ["GarageYrBlt", "GarageArea", "GarageCars", "BsmtFinSF1",
               "BsmtFinSF2", "BsmtFullBath", "BsmtHalfBath", "BsmtUnfSF",
               "MasVnrArea", "TotalBsmtSF"]
  features[zero_cols] = features[zero_cols].fillna(0)

  # Missing functionality rating -> "Typ".
  typical_cols = ["Functional"]
  features[typical_cols] = features[typical_cols].fillna("Typ")

  # Whatever is still missing becomes the literal "NA" label.
  features = features.fillna("NA")

  # MSSubClass is stored as a number in the CSV; turn it into a string label.
  label_cols = ['MSSubClass']
  features[label_cols] = features[label_cols].astype(str)

  # Ordinal scales shared by several columns.
  quality_scale = {"NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
  surface_scale = {"NA": 0, "Grvl": 1, "Pave": 2}
  basement_finish_scale = {"NA": 0, "Unf": 1, "LwQ": 2, "Rec": 3,
                           "BLQ": 4, "ALQ": 5, "GLQ": 6}
  quality_columns = ["BsmtCond", "BsmtQual", "ExterCond", "ExterQual",
                     "FireplaceQu", "GarageCond", "GarageQual", "HeatingQC",
                     "KitchenQual", "PoolQC"]

  # Per-column {label: rank} tables handed to DataFrame.replace.
  ordinal_encodings = {
      "Alley": surface_scale,
      "BsmtExposure": {"NA": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
      "BsmtFinType1": basement_finish_scale,
      "BsmtFinType2": basement_finish_scale,
      "Functional": {"Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
                     "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8},
      "LandSlope": {"Sev": 1, "Mod": 2, "Gtl": 3},
      "LotShape": {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
      "PavedDrive": {"NA": 0, "N": 1, "P": 2, "Y": 3},
      "Street": {"Grvl": 1, "Pave": 2},
      "Utilities": {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4},
  }
  ordinal_encodings.update(dict.fromkeys(quality_columns, quality_scale))

  features = features.replace(ordinal_encodings)

  # Sanity report on remaining nulls (the message is French for "Number of NAs").
  remaining_nulls = features.isnull().sum().sum()
  print(f"Nombre de NAs : {remaining_nulls}")

  train_features = features.iloc[:n_train, :]
  test_features = features.iloc[n_train:, :]
  return (train_features, train_y, test_features)


def download_medicine_reviews() -> pandas.DataFrame:
  """Download the Drugs.com review archive and return the reviews of its test file.

  Data source: https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29

  Citation:
    Felix Gräßer, Surya Kallumadi, Hagen Malberg, and Sebastian Zaunseder.
    2018.
    Aspect-Based Sentiment Analysis of Drug Reviews Applying Cross-Domain and Cross-Data Learning.
    In Proceedings of the 2018 International Conference on Digital Health (DH '18).
    ACM, New York, NY, USA, 121-125.

  Returns:
    A single dataframe holding the ``drugName``, ``condition``, ``review`` and
    ``rating`` columns read from ``drugsComTest_raw.tsv``.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    requests.Timeout: if the server does not answer within the timeout.
  """
  response = requests.get(
      "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip",
      # Without a timeout a stalled connection would hang the notebook forever.
      timeout=60,
  )
  # Fail with a clear HTTP error instead of a confusing BadZipFile further down
  # when the server returns an error page.
  response.raise_for_status()

  with zipfile.ZipFile(io.BytesIO(response.content)) as arc:
    # Close the archive member explicitly once it has been parsed.
    with arc.open("drugsComTest_raw.tsv") as tsv_file:
      raw_data = pandas.read_csv(tsv_file, sep="\t")
  return raw_data[["drugName", "condition", "review", "rating"]]


def filter_medicine_reviews(df: pandas.DataFrame, condition: str
                            ) -> pandas.DataFrame:
  """Keep the extreme-rating reviews of one condition and derive a binary label.

  Only rows whose ``condition`` equals `condition` and whose ``rating`` is
  1 or 10 are kept.

  Args:
    df: frame with at least the ``condition``, ``review`` and ``rating`` columns.
    condition: value of the ``condition`` column to select.

  Returns:
    A new frame with the ``review`` column and an ``is_positive`` column
    (0 for a rating of 1, 1 otherwise). The input frame is left untouched.
  """
  keep = (df["condition"] == condition) & df["rating"].isin([1, 10])
  selected = df.loc[keep, ["review", "rating"]]
  # `assign` builds a new frame, so no column is ever written onto a slice of
  # `df` (the original pattern triggered pandas' chained-assignment warning).
  # The comparison is vectorised and yields the same 0/1 values as a
  # row-by-row lambda.
  labelled = selected.assign(
      is_positive=(selected["rating"] != 1).astype(int))
  return labelled.drop(columns="rating")


def split_medicine_reviews(df: pandas.DataFrame
                           ) -> tuple[pandas.DataFrame, pandas.DataFrame]:
  """Shuffle the labelled reviews and split them 60 % / 40 %.

  Args:
    df: frame with the ``review`` and ``is_positive`` columns.

  Returns:
    ``(reference, valid)``: two frames with the ``review`` and ``is_positive``
    columns; ``valid`` holds the 40 % test share. The split is seeded
    (``random_state=42``).
  """
  def as_frame(reviews, labels):
    # Reassemble a two-column frame from the split series.
    return pandas.DataFrame({'review': reviews, 'is_positive': labels})

  reviews_ref, reviews_val, labels_ref, labels_val = (
      sklearn.model_selection.train_test_split(
          df['review'],
          df['is_positive'],
          test_size=0.4,
          shuffle=True,
          random_state=42))

  return as_frame(reviews_ref, labels_ref), as_frame(reviews_val, labels_val)

# Monitoring — Data and Concept Drift

## Getting started with the `evidently` library

### Loading the data

In [None]:
# Features of the AMES train file become the reference frame and those of the
# test file the current frame; the training target (second value) is discarded.
reference_df, _, current_df = preprocess("dataset-ames/train.csv", "dataset-ames/test.csv")

### Imports

In [None]:
from evidently import (
    BinaryClassification,
    Dataset,
    DataDefinition,
    Report,
)

from evidently.descriptors import (
    NonLetterCharacterPercentage,
    OOVWordsPercentage,
    SentenceCount,
    TextLength,
    WordCount,
)

from evidently.metrics import MissingValueCount, UniqueValueCount

from evidently.presets import (
    ClassificationPreset,
    DataDriftPreset,
    ValueStats,
)

### Creating a `Dataset` with a `DataDefinition`

Following the steps described in the [evidently documentation](https://docs.evidentlyai.com/docs/library/data_definition), create two datasets: one, `reference`, for `reference_df`, and the other, `current`, for `current_df`. You can use the default `DataDefinition`.

In [None]:
# Your code here

#### Solution

In [None]:
# Wrap both frames in evidently datasets with an empty (default) DataDefinition.
# NOTE(review): presumably evidently then infers the column types itself — confirm in the docs.
reference = Dataset.from_pandas(reference_df, data_definition=DataDefinition())
current = Dataset.from_pandas(current_df, data_definition=DataDefinition())

### Reports

Evidently can generate reports and test suites.

Reports are meant to be read and studied by humans, whereas test suites are more intended for automation, for example to trigger automatic retraining.

Start by creating a very first report that uses the [default parameters to detect data drift](https://docs.evidentlyai.com/metrics/preset_data_drift).

In [None]:
# Your code here

#### Solution

In [None]:
# Compare `current` against `reference` with the data-drift preset left at its defaults.
report = Report([DataDriftPreset()])
run = report.run(current_data=current, reference_data=reference)
# Bare expression on the last line so the notebook displays the result.
run

### Inspecting specific columns

In the dataset we use for these hands-on exercises (AMES), two columns are particularly important: `OverallQual` and `GrLivArea`.

Create a report on these two columns with the `UniqueValueCount` metric for the `OverallQual` column and the `ValueStats` preset for the `GrLivArea` column.

In [None]:
# Your code here

#### Solution

In [None]:
# One metric per column of interest: unique-value counts for OverallQual,
# the ValueStats preset for GrLivArea.
report = Report([UniqueValueCount(column="OverallQual"), ValueStats("GrLivArea")])
run = report.run(current_data=current, reference_data=reference)
run

### Saving a report

You can [save a report](https://docs.evidentlyai.com/docs/library/output_formats) in HTML format to read it directly or in JSON format to be consumed by other programs.

Save the data drift report in JSON format.

In [None]:
# Your code here

#### Solution

In [None]:
# Re-run the data-drift report and persist the result as JSON.
report = Report([DataDriftPreset()])
# Named `run` like in the other cells: the former name `eval` shadowed Python's
# builtin `eval` for the rest of the notebook session.
run = report.run(reference_data=reference, current_data=current)
run.save_json('data-drift-report.json')

### Test suites

[Test suites](https://docs.evidentlyai.com/docs/library/tests) are better suited than reports for an automated context such as CI/CD.

To start, create a first test suite that uses the `DataDriftPreset` preset that we used above to produce a report.

In [None]:
# Your code here

#### Solution

In [None]:
# Same preset as the drift report, this time with `include_tests=True` so that
# the run also carries test results (read back through `run.dict()["tests"]`
# in the analysis cell that follows).
report = Report([DataDriftPreset()], include_tests=True)
run = report.run(reference_data=reference, current_data=current)
run

### Analyzing results in Python

Since tests are more often used for automation, we regularly analyze their results in Python. Compute the percentage of successful tests from the latest test suite. You can use the `dict` method.

In [None]:
# Your code here

#### Solution

In [None]:
# Export the run as a plain dictionary and read the status of every test.
results = run.dict()
statuses = [test["status"] for test in results["tests"]]
# Share of tests whose status is "SUCCESS", expressed as a percentage.
successes = statuses.count("SUCCESS")
percentage = successes / len(statuses) * 100
percentage

## Applying the `evidently` library to text data

### Loading the data

The data we will use are reviews of medications used to treat several conditions. For now, we will focus on medications used to treat pain, identified by the value `Pain` in the `condition` column of the data.

In [None]:
# Download the reviews, keep the 1- and 10-rated "Pain" reviews with a binary
# label, then split them into a reference frame and a validation ("current") frame.
raw_data = download_medicine_reviews()
filtered_data = filter_medicine_reviews(raw_data, "Pain")
reference_df, current_df = split_medicine_reviews(filtered_data)

In [None]:
reference_df

### Training a classification model

In [None]:
# Text vectorisation: TF-IDF with sublinear term frequency, English stop words
# removed and terms present in more than half of the documents dropped.
tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words="english",
    max_df=0.5,
    sublinear_tf=True)

# Linear classifier trained by SGD (modified Huber loss, L1 penalty), seeded.
sgd_classifier = sklearn.linear_model.SGDClassifier(
    loss='modified_huber',
    penalty='l1',
    alpha=0.0001,
    max_iter=50,
    random_state=42)

pipeline = sklearn.pipeline.Pipeline(steps=[
    ("vectorization", tfidf_vectorizer),
    ("classification", sgd_classifier),
])

# Fit on the reference reviews only; the validation reviews stay unseen.
pipeline.fit(reference_df['review'].values, reference_df['is_positive'].values)

Create a new column in `reference_df` and `current_df` that contains the predictions:

In [None]:
# Score both frames with the fitted pipeline. `reference_df` is the data the
# pipeline was fitted on, so its predictions are in-sample.
reference_df['predictions'] = pipeline.predict(reference_df['review'].values)
current_df['predictions'] = pipeline.predict(current_df['review'].values)

In [None]:
reference_df

Define two `Dataset` objects (`reference` and `current`) using a `DataDefinition` adapted to the three columns `review`, `is_positive`, and `predictions` in our data.

In [None]:
# Your code here

#### Solution

In [None]:
# Describe the three columns: `review` is free text, `is_positive` is the
# binary target and `predictions` holds the predicted labels.
data_definition = DataDefinition(
    text_columns=["review"],
    classification=[BinaryClassification(target="is_positive",
                                         prediction_labels="predictions")]
)

# The same definition is shared by the reference and the validation dataset.
reference = Dataset.from_pandas(reference_df, data_definition=data_definition)
current = Dataset.from_pandas(current_df, data_definition=data_definition)

### Classification quality report

Create a [classification quality](https://docs.evidentlyai.com/metrics/preset_classification) report from the reference and validation data.

In [None]:
# Your code here

#### Solution

In [None]:
# Classification quality on the validation data (`current`) compared with the
# data the model was fitted on (`reference`).
report = Report([ClassificationPreset()])

run = report.run(reference_data=reference, current_data=current)
run

### Detecting a “technical” data drift

We are going to simulate a common event in a processing pipeline: a bug leads to poor-quality processing. Here, we will even simulate two:

- Bug in the cleaning of HTML tags during review preprocessing
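- Data ingestion bug causing the data to be in a different language than the training language (in the code below, the corresponding translation step is commented out; uncomment it to simulate this second bug)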

In [None]:
from googletrans import Translator
translator = Translator()

def translate_str(s):
  """Return the text of `s` translated to French (``dest='fr'``) by the module-level `translator`."""
  translation = translator.translate(s, dest='fr')
  return translation.text

# Snippets that get spliced into the reviews to mimic a broken HTML clean-up step.
# NOTE(review): the first entry '<body>, </body>' looks like a typo for two separate
# tags; it is kept as-is so that the generated data does not change — confirm intent.
random_html_tags = ('<body>, </body>', '<html><body>', '</body></html>', '<h1>', '</h1>',
                    '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 0 0" width="0" height="0" focusable="false" role="none" style="visibility: hidden; position: absolute; left: -9999px; overflow: hidden;"><defs><filter id="wp-duotone-magenta-yellow"><feColorMatrix color-interpolation-filters="sRGB" type="matrix" values=" .299 .587 .114 0 0 .299 .587 .114 0 0 .299 .587 .114 0 0 .299 .587 .114 0 0 "></feColorMatrix><feComponentTransfer color-interpolation-filters="sRGB"><feFuncR type="table" tableValues="0.78039215686275 1"></feFuncR><feFuncG type="table" tableValues="0 0.94901960784314"></feFuncG><feFuncB type="table" tableValues="0.35294117647059 0.47058823529412"></feFuncB><feFuncA type="table" tableValues="1 1"></feFuncA></feComponentTransfer><feComposite in2="SourceGraphic" operator="in"></feComposite></filter></defs></svg>')

def inject_random_html_tags(s, num_tags=25):
  """Splice `num_tags` snippets from `random_html_tags` into `s` at random positions.

  The i-th insertion uses a generator seeded with ``i``, so the result is
  deterministic for a given input string.

  Args:
    s: the text to corrupt.
    num_tags: number of snippets to insert (25 by default).

  Returns:
    The text with the snippets inserted. An empty `s` gets the snippets
    inserted at position 0 instead of raising ``IndexError``.
  """
  for i in range(num_tags):
    # A private generator seeded with the loop index draws exactly what
    # `random.seed(i)` followed by module-level calls would draw, but leaves the
    # global `random` state (seeded elsewhere in the notebook) untouched.
    rng = random.Random(i)
    # An empty string offers no position to draw from: insert at the start.
    pos = rng.choice(range(len(s))) if s else 0
    s = s[:pos] + rng.choice(random_html_tags) + s[pos:]

  return s

In [None]:
# Work on a copy of the validation reviews and labels so that `current_df`
# stays clean; the `predictions` column is left out on purpose and recomputed
# after the reviews have been corrupted.
current_disturbed_df = current_df[['review', 'is_positive']].copy()

In [None]:
# Draw (reproducibly) half of the validation rows as candidates for corruption.
disturbed_num = int(len(current_disturbed_df) * 0.5)
random.seed(42)
disturbed_ind = random.sample(list(current_disturbed_df.index), k=disturbed_num)

# The first tenth of the drawn rows gets HTML snippets injected into the review.
html_ind = disturbed_ind[:int(disturbed_num / 10)]
html_reviews = current_disturbed_df.loc[html_ind, 'review']
current_disturbed_df.loc[html_ind, 'review'] = html_reviews.apply(inject_random_html_tags)

# The remaining drawn rows are the ones meant to be translated; this step is
# left disabled.
# translated_ind = disturbed_ind[int(disturbed_num / 10):]
# current_disturbed_df.loc[translated_ind, 'review'] = \
# current_disturbed_df.loc[translated_ind, 'review'].apply(translate_str)

In [None]:
# Score the corrupted reviews with the unchanged pipeline, then wrap the frame
# with the same data definition as the clean datasets.
current_disturbed_df['predictions'] = pipeline.predict(current_disturbed_df['review'].values)
current_disturbed = Dataset.from_pandas(current_disturbed_df, data_definition=data_definition)

### Producing a new quality report

Reuse the previous code to analyze the model performance on this degraded data, comparing this time against the “clean” validation data.

In [None]:
# Your code here

#### Solution

In [None]:
# The clean validation set (`current`) now plays the role of the reference and
# the corrupted copy is the data under scrutiny.
report = Report([ClassificationPreset()])

run = report.run(reference_data=current, current_data=current_disturbed)
run

### Analyzing the model’s poor performance

Produce a data drift report that shows the drift of the `is_positive` and `predictions` columns, as well as the drift of text [descriptors](https://docs.evidentlyai.com/docs/library/descriptors) for the `review` column. You will first need to define a list of text descriptors to use and add them to the datasets already defined using the `add_descriptors` method.

In [None]:
# Your code here

#### Solution

In [None]:
# Text descriptors computed on the `review` column; each alias becomes the name
# of the resulting column (the inspection cells below filter on `text_length`
# and `oov`).
descriptors = [
    NonLetterCharacterPercentage("review", alias="non_letters"),
    OOVWordsPercentage("review", alias="oov"),
    SentenceCount("review", alias="sentence_count"),
    TextLength("review", alias="text_length"),
    WordCount("review", alias="word_count"),
]

# Attach the descriptors to every dataset that may be compared later on.
reference.add_descriptors(descriptors)
current.add_descriptors(descriptors)
current_disturbed.add_descriptors(descriptors)

report = Report([DataDriftPreset()])

# Drift of the corrupted data measured against the clean validation data.
run = report.run(reference_data=current, current_data=current_disturbed)
run

### Manual inspection of faulty examples

We can observe an increase in long texts and in the presence of out-of-vocabulary (*OOV*) words.

Use the export of descriptors as a dataframe to inspect faulty examples, for instance:

- Reviews with a length greater than 1000 characters
- Reviews with more than 30% out-of-vocabulary words. How does `evidently` define the vocabulary?

In [None]:
# Your code here

#### Solution

In [None]:
# Export the corrupted dataset as a pandas dataframe; the descriptor columns
# (e.g. `text_length`, `oov`) are used for filtering in the next cells.
descriptors_df = current_disturbed.as_dataframe()
descriptors_df

In [None]:
# Reviews longer than 1000 characters.
descriptors_df[descriptors_df['text_length'] > 1000]

In [None]:
# Reviews with more than 30 % out-of-vocabulary words (the descriptor is a percentage).
descriptors_df[descriptors_df['oov'] > 30]

### Data drift

Let’s now simulate data drift by looking at another part of our original data: reviews about medications used to treat depression.

In [None]:
# Same filtering as for "Pain", applied to the "Depression" reviews of the raw data.
new_content_df = filter_medicine_reviews(raw_data, "Depression")
new_content_df

In [None]:
# Score the depression reviews with the pipeline fitted on the pain reviews.
new_content_df["predictions"] = pipeline.predict(new_content_df.review.values)

### Classification quality report

As before, use `evidently` to quantify how the model performance evolves.

In [None]:
# Your code here

#### Solution

In [None]:
# Build the dataset with the same data definition and text descriptors as the
# other review datasets, so that it can also be used for the drift report.
new_content = Dataset.from_pandas(new_content_df,
                                  data_definition=data_definition,
                                  descriptors=descriptors)

report = Report([ClassificationPreset()])

# Model quality on the depression reviews versus the clean pain validation set.
run = report.run(reference_data=current, current_data=new_content)
run

### Detecting data drift

Unsurprisingly, performance is very degraded. Produce a data drift report. Would `evidently` have detected this drift in time to allow retraining?

In [None]:
# Your code here

#### Solution

In [None]:
# Data drift of the depression reviews measured against the clean pain
# validation set.
report = Report([DataDriftPreset()])

run = report.run(reference_data=current, current_data=new_content)
run