# Step 1 - Install the required dependencies and make sure the Python version is 3.10 or above

In [1]:
!pip install zeno-client
!pip install --upgrade pip
!pip install --upgrade bottleneck
!pip install langdetect


Collecting zeno-client
  Using cached zeno_client-0.1.16-py3-none-any.whl (7.3 kB)
Collecting outdated>=0.2.0
  Using cached outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting arrow-json<0.10.0,>=0.9.0
  Using cached arrow_json-0.9.0-cp37-abi3-win_amd64.whl (1.3 MB)
Collecting pydantic>=1.0
  Using cached pydantic-2.6.1-py3-none-any.whl (394 kB)
Installing collected packages: pydantic, outdated, arrow-json, zeno-client
Successfully installed arrow-json-0.9.0 outdated-0.2.2 pydantic-2.6.1 zeno-client-0.1.16



[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: C:\Users\sslin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting pip
  Using cached pip-24.0-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
Successfully installed pip-24.0



[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: C:\Users\sslin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting bottleneck
  Downloading Bottleneck-1.3.7-cp310-cp310-win_amd64.whl (109 kB)
     ---------------------------------------- 0.0/109.9 kB ? eta -:--:--
     -------------------------------------- 109.9/109.9 kB 6.2 MB/s eta 0:00:00
Installing collected packages: bottleneck
Successfully installed bottleneck-1.3.7
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     --------- ---------------------------- 235.5/981.5 kB 7.3 MB/s eta 0:00:01
     ------------------------------------  972.8/981.5 kB 12.3 MB/s eta 0:00:01
     ------------------------------------  972.8/981.5 kB 12.3 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 6.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'

In [2]:
!python --version

Python 3.10.11


In [3]:
import getpass
import os

import pandas as pd
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from zeno_client import ZenoClient, ZenoMetric

# Initialize a client without embedding the secret in the notebook: prefer the
# ZENO_API_KEY environment variable and fall back to a hidden interactive prompt.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# in the notebook history and should be revoked.
zeno_api_key = os.environ.get("ZENO_API_KEY") or getpass.getpass("Zeno API key: ")
client = ZenoClient(zeno_api_key)


  from .autonotebook import tqdm as notebook_tqdm


# Step 2 - Create a project

In [9]:
# Metrics attached to the project. Both are means over a per-instance column:
# "correct" is added to the system outputs and "avg_length_ratio" to the
# dataset in later cells of this notebook.
project_metrics = [
    ZenoMetric(name="accuracy", type="mean", columns=["correct"]),
    ZenoMetric(name="avg length ratio", type="mean", columns=["avg_length_ratio"]),
]

# Create the text-classification project that the later cells upload into.
project = client.create_project(
    name="Biased Movies Analysis 00",
    view="text-classification",
    metrics=project_metrics,
)

Successfully updated project.
Access your project at  https://hub.zenoml.com/project/cceab597-8dc6-4d95-9356-930196c1de51/Biased%20Movies%20Analysis%2000


# Step 3 - Create dataset

In [10]:
import pandas as pd
from langdetect import detect
from collections import Counter
import random

# Sample language detection function
def detect_language(text):
    """Return the language code langdetect reports for ``text``.

    Args:
        text: The value to classify. Anything that is not a non-blank string
            is reported as "Unknown" without calling langdetect.

    Returns:
        The code returned by ``langdetect.detect``, or the string "Unknown"
        when the input is unusable or langdetect raises LangDetectException.
    """
    # Guard explicitly instead of relying on a catch-all handler: non-string
    # values (None, NaN, numbers) and empty text cannot be classified.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"
    try:
        return detect(text)
    except LangDetectException:
        # Only langdetect's own failure is mapped to "Unknown"; unrelated
        # errors (and KeyboardInterrupt) are no longer silently swallowed.
        return "Unknown"

# Create a Pandas DataFrame for the biased dataset of movies
review_texts = [
    "I love this movie!",
    "¡Odio esta película!",
    "Ce film est ok.",
    "The acting was great!",
    "The plot was confusing.",
    "I really enjoyed it!",
    "Worst movie ever!",
    "Not bad, but could be better.",
    "A masterpiece!",
    "I didn't like it.",
]
# Derive the row count from the data instead of repeating the literal 10.
n_reviews = len(review_texts)

# Dedicated seeded generator so the randomly chosen years are identical on
# every Restart & Run All and the global `random` state is left untouched.
year_rng = random.Random(42)

df = pd.DataFrame(
    {
        "id": range(1, n_reviews + 1),  # 1-based ids, one per review
        "text": review_texts,
        "genre": ["action", "drama", "comedy", "action", "drama", "action", "drama", "comedy", "action", "drama"],
        "year": [year_rng.choice([2020, 2021, 2022]) for _ in range(n_reviews)],  # Randomly select year
        "label": ["positive", "negative", "neutral", "positive", "negative", "positive", "negative", "neutral", "positive", "negative"],
    }
)

# Introduce biases in genre
biased_genre = ["action", "action", "drama", "drama", "comedy", "comedy", "comedy", "comedy", "drama", "drama"]
df["biased_genre"] = biased_genre

# Introduce biases in year
biased_year = [2020, 2020, 2020, 2021, 2021, 2021, 2022, 2022, 2022, 2022]
df["biased_year"] = biased_year

# Add language detection to create a new column.
# langdetect is randomized unless its seed is pinned, and very short texts like
# these can otherwise be classified differently between runs.
DetectorFactory.seed = 0
df["detected_language"] = df["text"].apply(detect_language)

# Add additional columns for analysis
df["input_length"] = df["text"].str.len()

# Length of each text relative to the mean text length; this is the column the
# "avg length ratio" project metric averages.
df["avg_length_ratio"] = df["input_length"] / df["input_length"].mean()

# Sanity checks before the frame is uploaded.
assert df["id"].is_unique, "instance ids must be unique"
assert len(df) == n_reviews



# Step 4 - Upload the Base Dataset


In [11]:

# Upload the base dataset, naming the columns that hold the instance id,
# the raw text, and the ground-truth label.
project.upload_dataset(
    df,
    id_column="id",
    data_column="text",
    label_column="label",
)



  df.loc[:, id_column] = df[id_column].astype(str)
100%|██████████| 1/1 [00:00<00:00,  2.17it/s]

Successfully uploaded data





# Step 5 - Create the AI System Outputs


In [12]:
# For demonstration, hand-written predictions stand in for a real model.
predicted_labels = [
    "positive", "negative", "negative", "positive", "negative",
    "positive", "negative", "neutral", "positive", "negative",
]
df_system = pd.DataFrame({"output": predicted_labels})

# `id` mirrors the base dataset's 1-based ids (row position + 1). `correct` is a
# 0/1 flag comparing each prediction with the label on the same row index of
# `df`; the "accuracy" ZenoMetric averages this column.
df_system = df_system.assign(
    id=lambda frame: frame.index + 1,
    correct=lambda frame: (frame["output"] == df["label"]).astype(int),
)



# Step 6 - Upload the system outputs


In [13]:
# Upload the predictions as "System A"; rows are matched to the dataset through
# the id column, and the "output" column holds the predicted label.
project.upload_system(
    df_system,
    name="System A",
    id_column="id",
    output_column="output",
)

  df.loc[:, id_column] = df[id_column].astype(str)
100%|██████████| 1/1 [00:00<00:00,  2.06it/s]

Successfully uploaded system



