# Required documents:
- wharton_data.xlsx: Dataset that contains the Wharton School's LinkedIn posts

In [1]:
pip install openai==0.28



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import json
import random
import openai
import time

# Load Data

In [3]:
# Read the spreadsheet and give its nine columns short snake_case names.
raw_posts = pd.read_excel("/content/wharton_data.xlsx")
raw_posts.columns = [
    "row_number",
    "post_date",
    "content",
    "like_count",
    "comments",
    "media_type",
    "image_url",
    "video_url",
    "article_url",
]

# Keep only the rows that have a value in every column.
post_df = raw_posts.dropna()
post_df.shape

(267, 9)

In [4]:
# Preview the first rows to check that the renamed columns line up with the data.
post_df.head()

Unnamed: 0,row_number,post_date,content,like_count,comments,media_type,image_url,video_url,article_url
0,1,"12/9/24, 11:01AM (EST)",What are the factors shaping the future of fin...,437,4,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/feed/hashtag/?keyword...
1,2,"1/22/25, 1:00PM (EST)",How can tax and spending reforms reduce the na...,13,0,Text,No images,No videos,https://www.linkedin.com/feed/hashtag/?keyword...
2,3,"1/17/25, 8:45AM (EST)","While leaders often succumb to ""not invented h...",34,0,Image,https://media.licdn.com/dms/image/v2/D4E10AQGn...,No videos,https://www.linkedin.com/feed/hashtag/?keyword...
3,4,"1/20/25, 9:00AM (EST)",Three professors weigh in on the impact of int...,3,0,Text,No images,No videos,https://www.linkedin.com/in/ACoAABbbHCIBpUmuGM...
4,5,"1/16/25, 3:21PM (EST)","While AI is transforming industries, its resou...",14,1,Text,No images,No videos,https://www.linkedin.com/feed/hashtag/?keyword...


In [5]:
# Median like count; the split into preferred / non-preferred posts below uses it as the cutoff.
post_df['like_count'].median()

20.0

In [6]:
# Split the posts at the median like count: posts at or above the median are
# "preferred", posts strictly below it are "nonpreferred".
likes = post_df['like_count']
like_median = likes.median()
nonpreferred = post_df[likes < like_median]
preferred = post_df[likes >= like_median]

# The first 10 preferred rows (in file order) feed the SFT set; the rest feed the DPO set.
preferred_SFT = preferred.iloc[:10]
preferred_DPO = preferred.iloc[10:]

print('nonpreferred count:', len(nonpreferred))
print('preferred count:', len(preferred))

nonpreferred count: 131
preferred count: 136


# SFT and DPO Dataset

In [13]:
# Write one chat-format training example per post in preferred_SFT.
with open("SFT_data.jsonl", 'w', encoding='utf-8') as jsonl_file:
    for post_text in preferred_SFT['content']:
        # Each example is: style instruction (system), empty user turn,
        # and the post itself as the assistant's reply.
        system_message = {
            "role": "system",
            "content": "Respond in the writing tone, style, voice, personality, perplexity, and burstiness of NYU’s Master’s program in Business, tailored for American social media sites.",
        }
        user_message = {"role": "user", "content": ""}
        assistant_message = {"role": "assistant", "content": f"{post_text}"}

        entry = {"messages": [system_message, user_message, assistant_message]}

        # One JSON object per line; keep non-ASCII characters unescaped.
        jsonl_file.write(json.dumps(entry, ensure_ascii=False) + '\n')

In [34]:
output_file = "DPO_data.jsonl"

# Instruction attached to every training example (used as the system message below).
prompt_text = ( "Respond in the writing tone, style, voice, personality, perplexity, and burstiness of NYU’s Master’s program in Business, tailored for American social media sites.")

# Preferred posts that were not taken for SFT, most-liked first.
preferred_DPO_sorted = preferred_DPO.sort_values(by="like_count", ascending=False).reset_index(drop=True)

# Least-liked non-preferred posts, truncated to the number of preferred rows.
# NOTE(review): this frame is only used for the length check below; none of its
# rows are written to the output file.
lowest_liked_first = nonpreferred.sort_values(by="like_count", ascending=True)
nonpreferred_DPO = lowest_liked_first.head(len(preferred_DPO_sorted)).reset_index(drop=True)

# Optional: warn if there's a mismatch
if len(nonpreferred_DPO) < len(preferred_DPO_sorted):
    print("Warning: Not enough nonpreferred entries to pair with all preferred DPO entries.")

# One chat-format example per preferred post: the instruction as the system
# message, an empty user turn, and the post as the assistant's reply.
# NOTE(review): despite the file name, no preferred/non-preferred pairs are
# emitted here; every entry has the same layout as the SFT examples.
dataset = [
    {
        "messages": [
            {"role": "system", "content": prompt_text},
            {"role": "user", "content": ""},
            {"role": "assistant", "content": preferred_content},
        ]
    }
    for preferred_content in preferred_DPO_sorted["content"]
]

# Write to JSONL: one JSON object per line, non-ASCII characters left unescaped.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in dataset)


In [35]:
# Inspect the preferred posts used for the DPO file, ordered by like_count (highest first).
preferred_DPO_sorted

Unnamed: 0,row_number,post_date,content,like_count,comments,media_type,image_url,video_url,article_url
0,22,"12/31/24, 11:05AM (EST)",Is AI really transforming the workplace? Not...,1195,39,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/feed/hashtag/?keyword...
1,34,"12/12/24, 1:07PM (EST)",AI should not be viewed as just a tech challen...,896,23,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/feed/hashtag/?keyword...
2,91,"10/1/24, 12:49PM (EST)","The U.S. has a national debt problem, and a cr...",723,50,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/feed/hashtag/?keyword...
3,157,"4/16/24, 2:45PM (EST)","To use AI for your personal finances, you must...",627,11,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/feed/hashtag/?keyword...
4,203,"5/29/24, 3:00PM (EST)",NEWS: We are excited to announce the Wharton...,588,12,Text,No images,No videos,https://www.linkedin.com/feed/hashtag/?keyword...
...,...,...,...,...,...,...,...,...,...
121,63,"11/14/24, 12:05PM (EST)",What lessons can be learned from the 2023 Sili...,21,0,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://www.linkedin.com/company/bis/; https:/...
122,72,"9/5/24, 6:00PM (EST)",What does it take for AI to win our trust? Man...,20,0,Text,No images,No videos,https://www.linkedin.com/in/ACoAAAMhzfoBDyftMy...
123,47,"11/20/24, 3:05PM (EST)","Why do meme stock phenomena happen, and what c...",20,0,Video,No images,https://dms.licdn.com/playlist/vid/dynamic/D56...,https://whr.tn/40U9nau; https://www.linkedin.c...
124,59,4mo • \n \n 4 months ago,The Wharton School Professor Ethan Mollick...,20,1,Text,No images,No videos,https://www.linkedin.com/company/time/; https:...


# Finetune Models

In [None]:
# Read the OpenAI API key from a private file so the key itself never appears in the notebook.
with open('/content/drive/MyDrive/Marketing Analytics Course Preparation/Sichen/NYU Stern Social Media/Model Creation/data/API_key.txt', 'r') as f:  # Replace with your file path
    # Strip surrounding whitespace: a trailing newline in the key file would
    # otherwise become part of the key that is sent with every request.
    API_key = f.read().strip()
openai.api_key = API_key

# Base model that both fine-tuning jobs start from and that serves as the baseline.
baseline_model = "gpt-3.5-turbo"

In [37]:
def upload_file(file_path, purpose):
    """Upload a local file through ``openai.File.create`` and return its file id.

    Args:
        file_path: Path of the file to upload.
        purpose: Value passed through as the ``purpose`` argument of the upload.

    Returns:
        The ``id`` field of the upload response, or ``None`` if anything went
        wrong (the error is printed rather than raised).
    """
    try:
        # Open the file in a context manager so the handle is closed even when
        # the upload raises; the original left the handle open.
        with open(file_path, "rb") as fh:
            response = openai.File.create(
                file=fh,
                purpose=purpose
            )
        print(f"Uploaded {file_path}: {response['id']}")
        return response['id']
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do with None.
        print(f"Error uploading {file_path}: {e}")
        return None

# Upload SFT dataset; sft_file_id is None if upload_file reported an error.
sft_file_path = "SFT_data.jsonl"
sft_file_id = upload_file(sft_file_path, purpose="fine-tune")

# Upload DPO dataset; dpo_file_id is None if upload_file reported an error.
dpo_file_path = "DPO_data.jsonl"
dpo_file_id = upload_file(dpo_file_path, purpose="fine-tune")

Uploaded SFT_data.jsonl: file-B6K6ACwsgGPQb8Qjfm9B3A
Uploaded DPO_data.jsonl: file-N8BLoGUSoHYakPDkYt6b7n


In [18]:
# Start a fine-tuning job on the uploaded SFT file, starting from the baseline model.
sft_hyperparameters = {
    "n_epochs": 4,
    "learning_rate_multiplier": 0.1,
    # "batch_size": 32,
}

try:
    fine_tune_response_sft = openai.FineTuningJob.create(
        training_file=sft_file_id,
        model=baseline_model,
        hyperparameters=sft_hyperparameters,
        suffix="SFT-model",
    )
    print(f"Started SFT fine-tuning: {fine_tune_response_sft['id']}")
except Exception as e:
    # Report the failure, then re-raise so the cell still stops with the error.
    print(f"Error during SFT fine-tuning: {e}")
    raise e


Started SFT fine-tuning: ftjob-Ww0jRhFFHX4DI78In8XiAPbK


In [19]:
job_id = fine_tune_response_sft['id']
fine_tuned_model_name = None

# Poll the SFT job until it reaches a terminal status.
while True:
    job_status = openai.FineTuningJob.retrieve(job_id)
    status = job_status["status"]
    print(f"Job status: {status}")

    if status not in ("succeeded", "failed", "cancelled"):
        # Still in progress: wait 60 seconds before asking again.
        time.sleep(60)
        continue

    if status != "succeeded":
        print(f"Fine-tuning job ended with status: {status}")
    else:
        # The model name is only read from the response once the job has succeeded.
        fine_tuned_model_name = job_status["fine_tuned_model"]
        print("Fine-tuning succeeded!")
        print("New model name:", fine_tuned_model_name)
    break

Job status: validating_files
Job status: validating_files
Job status: running
Job status: running
Job status: running
Job status: succeeded
Fine-tuning succeeded!
New model name: ft:gpt-3.5-turbo-0125:personal:sft-model:AvhkrpsD


In [38]:
# Start a fine-tuning job on the uploaded DPO file, starting from the baseline model.
dpo_hyperparameters = {
    "n_epochs": 4,
    "learning_rate_multiplier": 0.1,
}

try:
    fine_tune_response_dpo = openai.FineTuningJob.create(
        training_file=dpo_file_id,
        model=baseline_model,
        hyperparameters=dpo_hyperparameters,
        suffix="DPO-model",
    )
    print(f"Started DPO fine-tuning: {fine_tune_response_dpo['id']}")
except Exception as e:
    # Report the failure, then re-raise so the cell still stops with the error.
    print(f"Error during DPO fine-tuning: {e}")
    raise e

Started DPO fine-tuning: ftjob-nV0nBkf3fBD8A1Gn0H8YxIL5


In [39]:
job_id = fine_tune_response_dpo['id']
fine_tuned_model_name = None

# Poll the DPO job until it reaches a terminal status.
while True:
    job_status = openai.FineTuningJob.retrieve(job_id)
    status = job_status["status"]
    print(f"Job status: {status}")

    if status not in ("succeeded", "failed", "cancelled"):
        # Still in progress: wait 60 seconds before asking again.
        time.sleep(60)
        continue

    if status != "succeeded":
        print(f"Fine-tuning job ended with status: {status}")
    else:
        # The model name is only read from the response once the job has succeeded.
        fine_tuned_model_name = job_status["fine_tuned_model"]
        print("Fine-tuning succeeded!")
        print("New model name:", fine_tuned_model_name)
    break

Job status: validating_files
Job status: validating_files
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: running
Job status: succeeded
Fine-tuning succeeded!
New model name: ft:gpt-3.5-turbo-0125:personal:dpo-model:Avi9Q9Fk


In [23]:
#for debugging
# events = openai.FineTuningJob.list_events(job_id, stream=False)
# for event in events["data"]:
#     print(event["message"])

The job failed due to an invalid training file. Invalid file format. Input file file-SkrFN8ZuR7t7k2dja1DUpH is in the prompt-completion format, but the specified model gpt-3.5-turbo-0125 is a chat model and requires chat-formatted data. See https://platform.openai.com/docs/guides/fine-tuning#preparing-your-dataset for details.
Validating training file: file-SkrFN8ZuR7t7k2dja1DUpH
Created fine-tuning job: ftjob-TG4PxCGnzhhNjzromXBRzKZE


In [40]:
# Fine-tuned model ids, hard-coded from the "New model name" lines printed by the
# two polling cells (fine_tuned_model_name is overwritten by the second job, so
# it cannot supply both).
SFT_model = "ft:gpt-3.5-turbo-0125:personal:sft-model:AvhkrpsD"
DPO_model = "ft:gpt-3.5-turbo-0125:personal:dpo-model:Avi9Q9Fk"

## Baseline

In [28]:
# Make a completion call
response = openai.ChatCompletion.create(
    model=baseline_model,
    messages=[
        {"role": "user", "content": "Respond in the writing tone, style, voice, personality, perplexity, and burstiness of NYU’s Master’s program in Business, tailored for American social media sites."}
    ],
    temperature=0.2,        # Controls randomness; higher values produce more random responses.
    max_tokens=600,         # Limits the length of the response.
    top_p=0.9,              # Nucleus sampling; considers tokens with cumulative probability <= top_p.
    frequency_penalty=0.5,  # Discourages repetition of the same words.
    presence_penalty=0.6    # Encourages talking about new topics.
)

# Print the response
print(response['choices'][0]['message']['content'])

Hey there future business leaders! 🌟 Ready to take your career to the next level with NYU's Master's program in Business? 💼📈 Get ready to dive deep into cutting-edge strategies, network with industry experts, and gain the skills you need to succeed in today's fast-paced business world. 🚀 Don't just dream about success – make it happen at NYU! Apply now and join our community of driven, ambitious professionals. Let's do this! 💪 #NYUBusinessMasters #FutureLeaders #SuccessStartsHere 🎓🔥


## SFT

In [29]:
# Make a completion call
response = openai.ChatCompletion.create(
    model=SFT_model,
    messages=[
        {"role": "user", "content": "Respond in the writing tone, style, voice, personality, perplexity, and burstiness of NYU’s Master’s program in Business, tailored for American social media sites."}
    ],
    temperature=0.2,        # Controls randomness; higher values produce more random responses.
    max_tokens=600,         # Limits the length of the response.
    top_p=0.9,              # Nucleus sampling; considers tokens with cumulative probability <= top_p.
    frequency_penalty=0.5,  # Discourages repetition of the same words.
    presence_penalty=0.6    # Encourages talking about new topics.
)

# Print the response
print(response['choices'][0]['message']['content'])

Hey there, future business leaders! 🌟 Ready to take your career to the next level with NYU's Master's program in Business? Get ready to dive into cutting-edge coursework, collaborate with top industry professionals, and network like never before. Our program is designed to challenge you, inspire you, and push you to reach your full potential. So what are you waiting for? Apply now and get ready to #NYUBusinessYourFuture! 💼🎓 #GradSchoolGoals #BusinessMasters #NYU2022


## DPO

In [41]:
# Make a completion call
response = openai.ChatCompletion.create(
    model=DPO_model,
    messages=[
        {"role": "user", "content": "Respond in the writing tone, style, voice, personality, perplexity, and burstiness of NYU’s Master’s program in Business, tailored for American social media sites."}
    ],
    temperature=0.2,        # Controls randomness; higher values produce more random responses.
    max_tokens=600,         # Limits the length of the response.
    top_p=0.9,              # Nucleus sampling; considers tokens with cumulative probability <= top_p.
    frequency_penalty=0.5,  # Discourages repetition of the same words.
    presence_penalty=0.6    # Encourages talking about new topics.
)

# Print the response
print(response['choices'][0]['message']['content'])


The intersection of technology and business has always been a hot topic, but the rise of artificial intelligence (AI) is taking it to a whole new level. AI is transforming industries across the board, from healthcare to finance to retail. But what does this mean for business leaders? How can they harness the power of AI to drive growth and innovation in their organizations?

These are just some of the questions we'll be exploring in our upcoming webinar on "AI and Business: Navigating the Future." Join us as we dive into the latest research and insights on how AI is reshaping the business landscape, and learn how you can leverage this powerful technology to stay ahead of the curve.

Don't miss out on this opportunity to gain valuable knowledge and network with industry experts. Register now for our free webinar on AI and Business! #AI #business #innovation
