# Setup

In [1]:
import os
import pandas as pd

from pprint import pprint

from docx import Document

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [2]:
#See notes in INSTALL.md for how to set this, DO NOT HARD CODE YOUR API KEY HERE, 
# if your repository is public, then someone will steal your API key and make you pay for their shit 
SKLLMConfig.set_openai_key(os.environ.get('OPENAI_API_KEY'))

# Collect a Dataset

In [3]:
# Set the folder path where your Word documents are located
folder_path = 'data'

X = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Open the Word document
        doc = Document(file_path)
        
        # Loop over each paragraph in the document and print its text
        filetext=""
        for para in doc.paragraphs:
            filetext+=para.text
        X.append(filetext)

data = pd.DataFrame(X, columns=['WritingSample'])



# Understand Your Data

In [4]:
data.head()

Unnamed: 0,WritingSample
0,"Every year, about 1.35 million people are kill..."
1,"In a world driven by technology, some recent a..."
2,The modern world is changing at a concerningly...
3,Technology in today's world continues to take ...
4,Technological improvements are occurring world...


In [5]:
data.describe()

Unnamed: 0,WritingSample
count,5
unique,5
top,"Every year, about 1.35 million people are kill..."
freq,1


In [6]:
X = data["WritingSample"]

In [7]:
X

0    Every year, about 1.35 million people are kill...
1    In a world driven by technology, some recent a...
2    The modern world is changing at a concerningly...
3    Technology in today's world continues to take ...
4    Technological improvements are occurring world...
Name: WritingSample, dtype: object

# "Develop" a Model aka just use OpenAI's API

In [8]:
# Define candidate labels
candidate_labels = {
    "4": "This was written by a 9th grade student. A score of 4 is for well-written samples that make sense, have a good flow, and do not have many grammatical errors. Students are also required to cite sources and the citations and references must be relevant.",
    "3": "This was written by a 9th grade student. A score of 3 is for work that is good but no great. There is a lttle more leniancy for grammar and spelling, but students still need to make logical and well-written responses.",
    "2": "This was written by a 9th grade student. A score of 2 is for average work. There may be more spelling mistakes and grammatical errors that prevent a 2 from being scored as a 3. Stuents who earn a 2 should put some logical ideas together, but it may not flow very well.",
    "1": "This was written by a 9th grade student. A score of 1 is for sub-par work. Much of this work will not make sense. There will be more grammatical errors that in the higher scores. A 1 would equate to D- or F- work on the traditional grading scale.",
    "0": "This was written by a 9th grade student. A score of zero is for writing that is terrible and/or very incomplete. These samples will be a total disaster. They will not make sense and there will be very little, if anything, of value in the writing. This would earn a failing grade on just abou any grading scale."
}

# Create and fit the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=2) 
clf.fit(None, [candidate_labels])

In [9]:
# Predict the labels
labels = clf.predict(X)

100%|█████████████████████████████████████████████| 5/5 [00:11<00:00,  2.20s/it]


In [10]:
# Add labels to the dataset and save
data['ChatGPTLabel'] = labels
data.to_csv('data/classified_tips.csv', index=False)

# Choose a measure of success, Choose an evaluation protocol / evaluate

In [11]:
data[["WritingSample","ChatGPTLabel"]]

Unnamed: 0,WritingSample,ChatGPTLabel
0,"Every year, about 1.35 million people are kill...","[2, 3]"
1,"In a world driven by technology, some recent a...","[2, 3]"
2,The modern world is changing at a concerningly...,"[2, 3]"
3,Technology in today's world continues to take ...,[2]
4,Technological improvements are occurring world...,"[2, 3]"


In [19]:
accuracy = accuracy_score(data[["HumanLabel"]], data[["ChatGPTLabel"]])
print(accuracy)

KeyError: "None of [Index(['HumanLabel'], dtype='object')] are in the [columns]"

# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain