# Setup

In [1]:
import os
import pandas as pd

from pprint import pprint

from docx import Document

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [2]:
#See notes in INSTALL.md for how to set this, DO NOT HARD CODE YOUR API KEY HERE, 
# if your repository is public, then someone will steal your API key and make you pay for their shit 
SKLLMConfig.set_openai_key(os.environ.get('OPENAI_API_KEY'))

# Collect a Dataset

In [3]:
# Set the folder path where your Word documents are located
folder_path = 'data'

X = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Open the Word document
        doc = Document(file_path)
        
        # Loop over each paragraph in the document and print its text
        filetext=""
        for para in doc.paragraphs:
            filetext+=para.text
        X.append(filetext)

data = pd.DataFrame(X, columns=['WritingSample'])



# Understand Your Data

In [4]:
data.head()

Unnamed: 0,WritingSample
0,"Every year, about 1.35 million people are kill..."
1,"In a world driven by technology, some recent a..."
2,The modern world is changing at a concerningly...
3,Technology in today's world continues to take ...
4,Technological improvements are occurring world...


In [5]:
data.describe()

Unnamed: 0,WritingSample
count,5
unique,5
top,"Every year, about 1.35 million people are kill..."
freq,1


In [6]:
X = data["WritingSample"]

In [7]:
X

0    Every year, about 1.35 million people are kill...
1    In a world driven by technology, some recent a...
2    The modern world is changing at a concerningly...
3    Technology in today's world continues to take ...
4    Technological improvements are occurring world...
Name: WritingSample, dtype: object

# "Develop" a Model aka just use OpenAI's API

In [8]:
# Define candidate labels
candidate_labels = {
    "4": "Writing samples are from high school students. A score of A is for work that is above grade-level performance. The rubric states that the work needs exceptional focus and consistency in maintaining the central idea. The structure and transitions are purposeful, enhancing clarity and cohesion. Introduction and conclusion are notably effective. Please be very generous with the predictions for these samples. A score of 4 should be somewhat easy to attain as long as the writing is decent and some of the objectives are met.",
    "3": "Writing samples are from high school students. A score of B is for work that is within grade-level performance. Adequate focus on the task with a generally maintained central idea. Structure and transitions logically connect ideas. Introduction and conclusion contribute to the essay’s completeness.",
    "2": "Writing samples are from high school students. A score of C is for work that is approaching grade-level performance. There might be some clarity issues with the central idea. Structure may be inconsistent but attempts to organize ideas are evident. Transitions and conclusion are present but may be basic.",
    "1": "Writing samples are from high school students. A score of D is for work that is below grade-level performance. Central idea and structure may be unclear. However, any attempt at organizing and transitioning ideas, even if minimal or confusing, should be acknowledged. Introduction and conclusion might be lacking but any effort is recognized.",
    "0": "Nothing or almost nothing is written. Did not meet any of the requirements."
}

# Create and fit the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=2) 
clf.fit(None, [candidate_labels])

In [9]:
# Predict the labels
labels = clf.predict(X)

100%|█████████████████████████████████████████████| 5/5 [00:09<00:00,  1.83s/it]


In [10]:
# Add labels to the dataset and save
data['ChatGPTLabel'] = labels
data.to_csv('data/classified_tips.csv', index=False)

# Choose a measure of success, Choose an evaluation protocol / evaluate

In [11]:
data[["WritingSample","ChatGPTLabel"]]

Unnamed: 0,WritingSample,ChatGPTLabel
0,"Every year, about 1.35 million people are kill...","[B, C]"
1,"In a world driven by technology, some recent a...","[A, C]"
2,The modern world is changing at a concerningly...,"[A, C]"
3,Technology in today's world continues to take ...,"[A, C]"
4,Technological improvements are occurring world...,"[A, D]"


In [None]:
accuracy = accuracy_score(data[["HumanLabel"]], data[["ChatGPTLabel"]])
print(accuracy)

# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain