In [12]:
# Fetches posts from the r/collegeresults subreddit.

from dotenv import load_dotenv
import os
import praw

# Load environment variables via python-dotenv so the os.getenv lookups
# below can find the Reddit API credentials.
load_dotenv()

# Reddit client; each credential is read from the environment
# (os.getenv yields None for any variable that is not set).
reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
)

# Listing of "hot" submissions from r/collegeresults, requested with a
# limit of 10000.
hot_listing = reddit.subreddit("collegeresults").hot(limit=10000)

# One record per submission: its id and its self-text, stored under the
# keys 'id' and 'body'.
posts = []
for submission in hot_listing:
    posts.append(dict(id=submission.id, body=submission.selftext))


Welcome to r/collegeresults!

This is a subreddit dedicated to compiling data about the undergraduate and transfer admissions processes. We intend to create a repository for information about past applicants and their college decisions, in order for current applicants to browse through examples of student profiles and potentially gauge their chances of admission to different schools and programs. We encourage all students who have received their decisions to contribute to our subreddit by creating a post using our [official templates](https://www.reddit.com/r/collegeresults/comments/gjgqu7/post_templates_and_flairs/?utm_source=share&utm_medium=web2x). To all current applicants, this subreddit is a great resource for you to compare your stats with those of other students, discover ideas on how to improve your extracurriculars and overall application, and discuss student profiles via comments sections. For your convenience, we are organizing both new and archived posts with flairs, accor

In [14]:
# Cleans the data and stores it in a CSV. We can use an open-source model like Llama instead for the actual training run later.

import os
import json
import pandas as pd
from openai import OpenAI

# OpenAI client. os.environ[...] raises KeyError immediately if
# OPENAI_API_KEY is not set, rather than failing later on the first request.
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY']
)

# Template of the fields the model is asked to extract. It is serialized
# into the prompt with json.dumps; the filled-in values ("Male/Female",
# "1400", "35") and the empty lists act as hints for the expected shape
# of each field.
expected_format = {
    "Gender": "Male/Female",
    "Race": "",
    "Residence": "",
    "Income Bracket": "",
    "Type of School": "",
    "Hooks": "",
    "Intended Major(s)": "",
    "GPA (UW/W)": "",
    "Rank (or percentile)": "",
    "# of Honors/AP/IB/Dual Enrollment/etc.": "",
    "Senior Year Course Load": "",
    "SAT": "1400",
    "ACT": "35",
    "AP": "",
    "Extra Curriculars": [],
    "Awards": [],
    "Letters of rec": [],
    "Interviews": "",
    "Essays": "",
    "Acceptances": [],
    "Waitlists": [],
    "Rejections": []
}

# Raw JSON strings returned by the model, one per post that passes the filter.
extracted_data = []

for post in posts:
    # Each entry of `posts` is a dict with 'id' and 'body' keys, so the text
    # has to be read from 'body'. Calling .lower() on the dict itself raises
    # AttributeError.
    body = post["body"]

    # Cheap keyword pre-filter: skip posts that mention neither "accepted"
    # nor "acceptance" so no API call is spent on them.
    lower_ed = body.lower()
    if "accepted" not in lower_ed and "acceptance" not in lower_ed:
        continue

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Extract the following details from the post and return in JSON format. If the data is not present, skip it."},
            # Send only the post text; interpolating the whole dict would put
            # its repr (id plus escaped newlines) into the prompt.
            {"role": "user", "content": f"""
            Post:
            {body}
            Extract the following details and return in JSON format:
            {json.dumps(expected_format, indent=2)}
            """}
        ],
        temperature=0,
        response_format={"type": "json_object"}
    )
    extracted_data.append(response.choices[0].message.content)

# Parse every response into a dict, build one DataFrame row per post, and
# write the table to extracted_data.csv without the index column.
extracted_data_dicts = [json.loads(data) for data in extracted_data]
df = pd.DataFrame(extracted_data_dicts)
df.to_csv('extracted_data.csv', index=False)

In [None]:
# Uses the CSV to create the model.

