# Setup

In [27]:
import os
import pandas as pd

from pprint import pprint

from docx import Document

from sklearn.metrics import accuracy_score

from skllm import MultiLabelZeroShotGPTClassifier
from skllm.config import SKLLMConfig

In [28]:
#See notes in INSTALL.md for how to set this, DO NOT HARD CODE YOUR API KEY HERE, 
# if your repository is public, then someone will steal your API key and make you pay for their shit 
SKLLMConfig.set_openai_key(os.environ.get('OPENAI_API_KEY'))

# Collect a Dataset

In [29]:
import os
import pandas as pd
from docx import Document

# Set the folder path where your Word documents are located
folder_path = 'data'

X = []

# Initialize an empty list to store student names
student_names = []

# Loop over each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.docx'):
        # Extract the student name from the file name
        student_name = os.path.splitext(filename)[0]  # Remove the file extension
        student_names.append(student_name)  # Append the student name to the list
        
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Open the Word document
        doc = Document(file_path)
        
        # Loop over each paragraph in the document and print its text
        filetext = ""
        for para in doc.paragraphs:
            filetext += para.text
        X.append((student_name, filetext))

# Create a DataFrame from the extracted text with student names
data = pd.DataFrame(X, columns=['StudentName', 'WritingSample'])

# Step 1: Read the CSV file containing the scores assigned by the teacher
csv_data = pd.read_csv('data/teacher_grades.csv')

# Initialize an empty list to store human labels (letter grades)
human_labels = []

# Iterate through the rows of the DataFrame
for index, row in data.iterrows():
    student_name = row['StudentName']
    matching_row = csv_data[csv_data['StudentName'] == student_name]  # Match based on 'StudentName'
    if not matching_row.empty:
        human_labels.append(matching_row['Score'].values[0])  # Append the letter grade to the list
    else:
        human_labels.append("")  # If no match found, insert an empty string

# Add the 'HumanLabel' column to the DataFrame
data['HumanLabel'] = human_labels

# Now, the 'StudentName' and 'HumanLabel' columns in your DataFrame should be populated correctly

# Understand Your Data

In [30]:
data.head()

Unnamed: 0,StudentName,WritingSample,HumanLabel
0,Student 12,\tWill robots be able to replace humans? In re...,A
1,Student 19,Most people think robots are taking over the w...,D
2,Student 3,"Every year, about 1.35 million people are kill...",C
3,Student 7,The recent advancements of robot technology ha...,C
4,Student 6,\tAdvancements in technology have led to groun...,A


In [31]:
data.describe()

Unnamed: 0,StudentName,WritingSample,HumanLabel
count,20,20,20
unique,20,20,5
top,Student 12,\tWill robots be able to replace humans? In re...,B
freq,1,1,8


In [32]:
X = data["WritingSample"]

In [33]:
X

0     \tWill robots be able to replace humans? In re...
1     Most people think robots are taking over the w...
2     Every year, about 1.35 million people are kill...
3     The recent advancements of robot technology ha...
4     \tAdvancements in technology have led to groun...
5     Technology has come a long way from when it fi...
6     \tRobot technology has advancements year aroun...
7     Technology has been  so much to prevent any ac...
8         In this new age, there have been rapid cha...
9     In a world driven by technology, some recent a...
10    The modern world is changing at a concerningly...
11    Imagine a world where robots are not only a pa...
12    AI and robot technology is becoming more and m...
13    Imagine you're just chilling in your dorm and ...
14    Technology in today's world continues to take ...
15    Recent advancements in robot technology have h...
16                                                     
17    Technological improvements are occurring w

# "Develop" a Model aka just use OpenAI's API

In [34]:
# Define candidate labels
candidate_labels = [
    "A",
    "B",
    "C",
    "D",
    "F"
]

# Create and fit the classifier
clf = MultiLabelZeroShotGPTClassifier(max_labels=2) 
clf.fit(None, [candidate_labels])

# Define classification prompt (without specifying the writing_sample)
classification_prompt = (
    "Please be very strict when assigning scores. Anything that earns an A should be almost perfect. Bs should be extremely good. Cs should be average. Ds should be okay. Fs should be for inadequate work.\n"
    "The writing sample is for a standardized test for high school students. Teachers are supposed to grade this with a 'critical eye'. Good writing should be rewarded with high scores. Anything that is less than stellar should be penalized. \n"
    "Any samples that are blank or almost blank result in a score of F. \n"
    "Excessive mistakes with spelling, punctuation, and grammar will reduce scores. \n"
    "Language is important. Students must use appropriate language when describing things. \n"
    "Bonus points for examples in which the writer did a good job of elaborating. \n"
    "Bonus points for when the writer did a good job of utilizing or referencing other documents. \n"
    "Engaging and interesting writing improves scores. \n"
    "Please assess the student's work based on the following criteria for grading:\n\n"
    "Development of Main Ideas:\n"
    "1. Perfect development of main ideas (A: Almost perfect, outstanding development)\n"
    "2. Great development of main ideas (B: Extremely good, very strong development)\n"
    "3. Good development of main ideas (C: Above average, solid development)\n"
    "4. Okay development of main ideas (D: Adequate, fair development)\n"
    "5. Undeveloped (F: Poor, no development)\n\n"
    "Quality of Work:\n"
    "6. Above average work (A: Exceptional quality, outstanding work)\n"
    "7. Great work (B: Very good, excellent quality)\n"
    "8. Good work (C: Above average, solid quality)\n"
    "9. Okay work (D: Adequate, fair quality)\n"
    "10. Below average work (F: Poor, unacceptable quality)\n\n"
    "Flow:\n"
    "11. Writing flows perfectly (A: Perfect flow, flawless coherence)\n"
    "12. Writing has a great flow (B: Great flow, very smooth and coherent)\n"
    "13. Writing has a good flow (C: Good flow, generally coherent)\n"
    "14. The flow of this writing is confusing (D: Fair flow, somewhat confusing)\n"
    "15. The flow of this writing does not make sense (F: Poor flow, completely confusing)\n\n"
    "Use of Resources or Evidence:\n"
    "16. Great use of resources or evidence (A: Perfect utilization, highly effective)\n"
    "17. Used or referenced resources, but it could have been done better (C: Good utilization, moderately effective)\n"
    "18. Inadequate use of resources or evidence (F: Poor utilization, ineffective)\n"
    "Tone:\n"
    "19. Tone strengthens the writing (A: Perfect tone, enhances the writing)\n"
    "20. Tone is great (B: Great tone, very appropriate)\n"
    "21. Tone is appropriate but could be better (C: Good tone, generally suitable)\n"
    "22. Tone is inconsistent or confusing (D: Fair tone, somewhat inconsistent)\n"
    "23. Tone is inappropriate (F: Poor tone, completely inappropriate)\n\n"
    "Language: \n"
    "24. Exceptional use of grammar and language (A: Perfect language, impeccable grammar)\n"
    "25. Great use of grammar and language (B: Great language, very strong grammar)\n"
    "26. Good use of grammar and language (C: Good language, generally correct grammar)\n"
    "27. Sub-standard use of grammar and language (D: Fair language, some grammar issues)\n"
    "28. Poor use of grammar and language (F: Poor language, numerous grammar issues)\n\n"
    "Please assign an overall score of A to F based on the criteria above. An A is the best score and an F is the worst score."
)

# Iterate through the Writing Samples and classify them
for writing_sample in X:
    # Generate a classification prompt (including criteria and the writing sample)
    complete_prompt = classification_prompt + f"\n\n{writing_sample}"
    
    # Use the classifier to classify the writing sample
    predicted_labels = clf.predict([complete_prompt])[0]
    
    # Print the classification result
    print("Predicted Labels:", predicted_labels)
    print()

100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.23it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.38it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Predicted Labels: []



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['C', 'D']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.09it/s]


Predicted Labels: ['A', 'B']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Predicted Labels: ['C', 'D']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.01it/s]


Predicted Labels: ['C', 'D']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Predicted Labels: ['B', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.32it/s]


Predicted Labels: ['B', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.45it/s]


Predicted Labels: ['C', 'D']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.44it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Predicted Labels: ['A', 'C']



100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.47it/s]

Predicted Labels: ['C', 'D']






In [35]:
# Create an empty list to store final letter grades
final_letter_grades = []

# Define a mapping of letter grades to numerical values
numerical_values = {
    "A": 5,
    "B": 4,
    "C": 3,
    "D": 2,
    "F": 1
}

# Iterate through the Writing Samples and classify them
for i, writing_sample in enumerate(X):
    # Generate a classification prompt (including criteria and the writing sample)
    complete_prompt = classification_prompt + f"\n\n{writing_sample}"
    
    # Use the classifier to classify the writing sample
    predicted_labels = clf.predict([complete_prompt])[0]
    
    # Calculate the average score for this sample
    average_score = sum([numerical_values[label] for label in predicted_labels]) / len(predicted_labels)
    
    # Custom rounding logic: Round up for scores with a decimal part of 0.5 or greater
    rounded_grade = round(average_score) if average_score % 1 < 0.5 else int(average_score) + 1
    
    # Convert the rounded grade back to a letter grade
    final_letter_grade = next(label for label, value in numerical_values.items() if value == rounded_grade)
    
    # Append the final letter grade to the list
    final_letter_grades.append(final_letter_grade)
    
    # Print the results for this sample
    print(f"Sample {i + 1} - Average Score: {average_score}, Letter Grade: {final_letter_grade}")

# Now, final_letter_grades contains the letter grade for each writing sample


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.43it/s]


Sample 1 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 2 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 3 - Average Score: 3.5, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 4 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 5 - Average Score: 2.5, Letter Grade: C


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.41it/s]


Sample 6 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.39it/s]


Sample 7 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]


Sample 8 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 9 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.53it/s]


Sample 10 - Average Score: 4.5, Letter Grade: A


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.29it/s]


Sample 11 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 12 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 13 - Average Score: 2.5, Letter Grade: C


100%|███████████████████████████████████████████| 1/1 [00:04<00:00,  4.30s/it]


Sample 14 - Average Score: 3.5, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.22it/s]


Sample 15 - Average Score: 3.5, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 16 - Average Score: 2.5, Letter Grade: C


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.48it/s]


Sample 17 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Sample 18 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.32it/s]


Sample 19 - Average Score: 4.0, Letter Grade: B


100%|███████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]

Sample 20 - Average Score: 2.5, Letter Grade: C





In [36]:
# Add labels to the dataset and save
data['ChatGPTLabel'] = final_letter_grades
data.to_csv('data/classified_tips.csv', index=False)


# Choose a measure of success, Choose an evaluation protocol / evaluate

In [37]:
print(data.columns)

Index(['StudentName', 'WritingSample', 'HumanLabel', 'ChatGPTLabel'], dtype='object')


In [38]:
selected_columns = data[["WritingSample", "ChatGPTLabel", "HumanLabel"]]
print(selected_columns)

                                        WritingSample ChatGPTLabel HumanLabel
0   \tWill robots be able to replace humans? In re...            B          A
1   Most people think robots are taking over the w...            B          D
2   Every year, about 1.35 million people are kill...            B          C
3   The recent advancements of robot technology ha...            B          C
4   \tAdvancements in technology have led to groun...            C          A
5   Technology has come a long way from when it fi...            B          C
6   \tRobot technology has advancements year aroun...            B          B
7   Technology has been  so much to prevent any ac...            B          B
8       In this new age, there have been rapid cha...            B          B
9   In a world driven by technology, some recent a...            A          B
10  The modern world is changing at a concerningly...            B          A
11  Imagine a world where robots are not only a pa...           

In [39]:
from sklearn.metrics import accuracy_score

# Check if the column 'HumanLabel' exists in your DataFrame
if 'HumanLabel' in data.columns:
    # Ensure the column 'HumanLabel' has compatible data types (e.g., both are strings)
    data['HumanLabel'] = data['HumanLabel'].astype(str)
    
    # Calculate accuracy
    accuracy = accuracy_score(data['HumanLabel'], data['ChatGPTLabel'])
    print("Accuracy:", accuracy)
else:
    print("Column 'HumanLabel' not found in the DataFrame.")



Accuracy: 0.35


# Skipped Steps
* Beat a baseline
* Overfit, regularize and tune
* Communicate with stakeholders
* Ship an inference model
* Monitor and maintain