# Setup

In [1]:
import os
import pandas as pd

from pprint import pprint

from docx import Document

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [2]:
#See notes in INSTALL.md for how to set this, DO NOT HARD CODE YOUR API KEY HERE, 
# if your repository is public, then someone will steal your API key and make you pay for their shit 
SKLLMConfig.set_openai_key(os.environ.get('OPENAI_API_KEY'))

# Collect a Dataset

In [3]:
# Set the folder path where your Word documents are located
folder_path = 'data'

X = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Open the Word document
        doc = Document(file_path)
        
        # Loop over each paragraph in the document and print its text
        filetext=""
        for para in doc.paragraphs:
            filetext+=para.text
        X.append(filetext)

data = pd.DataFrame(X, columns=['WritingSample'])



# Understand Your Data

In [4]:
data.head()

Unnamed: 0,WritingSample
0,"Every year, about 1.35 million people are kill..."
1,"In a world driven by technology, some recent a..."
2,The modern world is changing at a concerningly...
3,Technology in today's world continues to take ...
4,Technological improvements are occurring world...


In [5]:
data.describe()

Unnamed: 0,WritingSample
count,5
unique,5
top,"Every year, about 1.35 million people are kill..."
freq,1


In [6]:
X = data["WritingSample"]

In [7]:
X

0    Every year, about 1.35 million people are kill...
1    In a world driven by technology, some recent a...
2    The modern world is changing at a concerningly...
3    Technology in today's world continues to take ...
4    Technological improvements are occurring world...
Name: WritingSample, dtype: object

# "Develop" a Model aka just use OpenAI's API

In [8]:
# Define candidate labels
candidate_labels = {
    "A": "Above grade-level performance. Thorough and skillful understanding with diverse, smoothly integrated evidence. Original ideas are well-elaborated and cited.",
    "B": "Within grade-level performance. Clear understanding with logical development. Adequate elaboration and relevant evidence are noted.",
    "C": "Approaching grade-level performance. Some understanding is evident, despite partial development. Recognition for any relevant evidence and attempt at elaboration.",
    "D": "Below grade-level performance. Acknowledge any attempt to develop ideas, even if lacking in clarity or understanding. Any evidence, however vague, is noted.",
    "F": "Nothing or almost nothing is written. Did not meet any of the requirements."
}

# Create and fit the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=2) 
clf.fit(None, [candidate_labels])

# Example test loop to evaluate classifier's response to different inputs
for label, description in candidate_labels.items():
    # Create a test input example for each label
    test_input = f"This is a test input that represents {label} level performance according to the description: {description}"
    print(f"Testing with input for label {label}")

    # Predict the label for the test input
    result = clf.predict([test_input])
    print(f"Result for label {label}: {result}")

Testing with input for label A


100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.76s/it]


Result for label A: [['A']]
Testing with input for label B


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.18it/s]


Result for label B: [['B']]
Testing with input for label C


100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.32s/it]


Result for label C: [['C']]
Testing with input for label D


100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.31s/it]


Result for label D: [['D']]
Testing with input for label F


100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.63s/it]

Result for label F: [['F']]





In [9]:
# Predict the labels
labels = clf.predict(X)

100%|█████████████████████████████████████████████| 5/5 [00:07<00:00,  1.56s/it]


In [10]:
# Add labels to the dataset and save
data['ChatGPTLabel'] = labels
data.to_csv('data/classified_tips.csv', index=False)

# Choose a measure of success, Choose an evaluation protocol / evaluate

In [11]:
data[["WritingSample","ChatGPTLabel"]]

Unnamed: 0,WritingSample,ChatGPTLabel
0,"Every year, about 1.35 million people are kill...","[B, C]"
1,"In a world driven by technology, some recent a...","[A, C]"
2,The modern world is changing at a concerningly...,"[A, C]"
3,Technology in today's world continues to take ...,"[A, C]"
4,Technological improvements are occurring world...,"[A, D]"


In [None]:
accuracy = accuracy_score(data[["HumanLabel"]], data[["ChatGPTLabel"]])
print(accuracy)

# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain