# Setup

In [1]:
import os
import pandas as pd

from pprint import pprint

from docx import Document

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [2]:
#See notes in INSTALL.md for how to set this, DO NOT HARD CODE YOUR API KEY HERE, 
# if your repository is public, then someone will steal your API key and make you pay for their shit 
SKLLMConfig.set_openai_key(os.environ.get('OPENAI_API_KEY'))

# Collect a Dataset

In [3]:
# Set the folder path where your Word documents are located
folder_path = 'data'

X = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Open the Word document
        doc = Document(file_path)
        
        # Loop over each paragraph in the document and print its text
        filetext=""
        for para in doc.paragraphs:
            filetext+=para.text
        X.append(filetext)

data = pd.DataFrame(X, columns=['WritingSample'])

# Step 1: Read the CSV file containing the text for 'HumanLabel'
csv_data = pd.read_csv('data/teacher_grades.csv')

# Step 2: Create a 'HumanLabel' column in your DataFrame and initialize it with empty strings
data['HumanLabel'] = ""

# Step 3: Iterate through the rows of your DataFrame
for index, row in data.iterrows():
    file_name = row['WritingSample']  # Assuming 'WritingSample' contains file names
    matching_row = csv_data[csv_data['StudentName'] == file_name]  # Correct the column name
    if not matching_row.empty:
        data.at[index, 'HumanLabel'] = matching_row['Text'].values[0]

# Now, the 'HumanLabel' column in your DataFrame should be populated correctly

# Understand Your Data

In [4]:
data.head()

Unnamed: 0,WritingSample,HumanLabel
0,"Every year, about 1.35 million people are kill...",
1,"In a world driven by technology, some recent a...",
2,The modern world is changing at a concerningly...,
3,Technology in today's world continues to take ...,
4,Technological improvements are occurring world...,


In [5]:
data.describe()

Unnamed: 0,WritingSample,HumanLabel
count,5,5.0
unique,5,1.0
top,"Every year, about 1.35 million people are kill...",
freq,1,5.0


In [6]:
X = data["WritingSample"]

In [7]:
X

0    Every year, about 1.35 million people are kill...
1    In a world driven by technology, some recent a...
2    The modern world is changing at a concerningly...
3    Technology in today's world continues to take ...
4    Technological improvements are occurring world...
Name: WritingSample, dtype: object

# "Develop" a Model aka just use OpenAI's API

In [8]:
# Define candidate labels
candidate_labels = [
    "A",
    "B",
    "C",
    "D",
    "F"
]

# Create and fit the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=2) 
clf.fit(None, [candidate_labels])

# Define classification prompt (without specifying the writing_sample)
classification_prompt = (
    "Please be very strict when assigning scores. Anything that earns an A should be almost perfect. Bs should be extremly good. Cs should be average. Ds should be okay. Fs shuld be poor.\n"
    "This section will be scored on development. This section focuses primarily on flow and how well a student develops the main ideas in the writing. \n"
    "Please assess the student's work based on the following criteria:\n\n"
    "Development of Main Ideas:\n"
    "1. Perfect development of main ideas (A: Perfect)\n"
    "2. Great development of main ideas (B: Great)\n"
    "3. Good development of main ideas (C: Good)\n"
    "4. Okay development of main ideas (D: Fair)\n"
    "5. Undeveloped (F: Poor)\n\n"
    "Quality of Work:\n"
    "6. Above average work (A: Perfect)\n"
    "7. Great work (B: Very good)\n"
    "8. Good work (C: Average)\n"
    "9. Okay work (D: Fair)\n"
    "10. Below average work (F: Poor)\n\n"
    "Flow:\n"
    "11. Writing flows perfectly (A: Perfect)\n"
    "12. Writing has a great flow (B: Great)\n"
    "13. Writing has a good flow (C: Good)\n"
    "14. The flow of this writing is confusing (D: Fair)\n"
    "15. The flow of this writing does not make sense (F: Poor)\n\n"
    "Use of Resources or Evidence:\n"
    "16. Great use of resources or evidence (A: Perfect)\n"
    "17. Used or referenced resources, but it could have been done better (C: Good)\n"
    "18. Inadequate use of resources or evidence (F: Poor)\n"
    "Tone:\n"
    "19. Tone strengthens the writing (A: Perfect)\n"
    "20. Tone is great (B: Great)\n"
    "21. Tone is appropriate but could be better (C: Good)\n"
    "22. Tone is inconsistent or confusing (D: Fair)\n"
    "23. Tone is inappropriate (F: Poor)\n\n"
    "Language: \n"
    "24. Exceptional use of grammar and language (A: Perfect)\n"
    "25. Great use of grammar and language (B: Great)\n"
    "26. Good use of grammar and language (C: Good)\n"
    "27. Sub-standard use of grammar and language (D: Fair)\n"
    "28. Poor use of grammar and language (F: Poor)\n\n"
    "Please assign an overall score of A to F based on the criteria above. An A is the best score and an F is the worst score."
)

# Iterate through the Writing Samples and classify them
for writing_sample in X:
    # Generate a classification prompt (including criteria and the writing sample)
    complete_prompt = classification_prompt + f"\n\n{writing_sample}"
    
    # Use the classifier to classify the writing sample
    predicted_labels = clf.predict([complete_prompt])[0]
    
    # Print the classification result
    print("Predicted Labels:", predicted_labels)
    print()

100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]


Predicted Labels: ['B', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]


Predicted Labels: ['A', 'B']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Predicted Labels: ['B', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]

Predicted Labels: ['A', 'C']






In [11]:
# Create an empty list to store final letter grades
final_letter_grades = []

# Define a mapping of letter grades to numerical values
numerical_values = {
    "A": 5,
    "B": 4,
    "C": 3,
    "D": 2,
    "F": 1
}

# Iterate through the Writing Samples and classify them
for i, writing_sample in enumerate(X):
    # Generate a classification prompt (including criteria and the writing sample)
    complete_prompt = classification_prompt + f"\n\n{writing_sample}"
    
    # Use the classifier to classify the writing sample
    predicted_labels = clf.predict([complete_prompt])[0]
    
    # Calculate the average score for this sample
    average_score = sum([numerical_values[label] for label in predicted_labels]) / len(predicted_labels)
    
    # Custom rounding logic: Round up for scores with a decimal part of 0.5 or greater
    rounded_grade = round(average_score) if average_score % 1 < 0.5 else int(average_score) + 1
    
    # Convert the rounded grade back to a letter grade
    final_letter_grade = next(label for label, value in numerical_values.items() if value == rounded_grade)
    
    # Append the final letter grade to the list
    final_letter_grades.append(final_letter_grade)
    
    # Print the results for this sample
    print(f"Sample {i + 1} - Average Score: {average_score}, Letter Grade: {final_letter_grade}")

# Now, final_letter_grades contains the letter grade for each writing sample


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.15it/s]


Sample 1 - Average Score: 3.5, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 2 - Average Score: 4.5, Letter Grade: A


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 3 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 4 - Average Score: 3.5, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]

Sample 5 - Average Score: 4.0, Letter Grade: B





In [12]:
# Add labels to the dataset and save
data['ChatGPTLabel'] = final_letter_grades
data.to_csv('data/classified_tips.csv', index=False)


# Choose a measure of success, Choose an evaluation protocol / evaluate

In [13]:
print(data.columns)

Index(['WritingSample', 'HumanLabel', 'ChatGPTLabel'], dtype='object')


In [14]:
selected_columns = data[["WritingSample", "ChatGPTLabel", "HumanLabel"]]
print(selected_columns)

                                       WritingSample ChatGPTLabel HumanLabel
0  Every year, about 1.35 million people are kill...            B           
1  In a world driven by technology, some recent a...            A           
2  The modern world is changing at a concerningly...            B           
3  Technology in today's world continues to take ...            B           
4  Technological improvements are occurring world...            B           


In [15]:
from sklearn.metrics import accuracy_score

# Check if the column 'HumanLabel' exists in your DataFrame
if 'HumanLabel' in data.columns:
    # Ensure the column 'HumanLabel' has compatible data types (e.g., both are strings)
    data['HumanLabel'] = data['HumanLabel'].astype(str)
    
    # Calculate accuracy
    accuracy = accuracy_score(data['HumanLabel'], data['ChatGPTLabel'])
    print("Accuracy:", accuracy)
else:
    print("Column 'HumanLabel' not found in the DataFrame.")



Accuracy: 0.0


# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain