In [6]:
import pandas as pd
from tqdm import tqdm
import os
import openai
from glob import glob
import re

In [2]:
cous = pd.read_pickle('../data/processed/counts_of_unique_subjects.pickle')

In [3]:
cous.sum(axis=1).sort_values(ascending=False).iloc[:30]

pld
www.coursera.org               7520.0
www.edx.org                    7407.0
studylib.net                   5646.0
www.diziler.com                4683.0
www.physicsforums.com          4677.0
ja.coursera.org                3851.0
ru.coursera.org                3784.0
pt.coursera.org                3601.0
fr.coursera.org                3375.0
zh.coursera.org                3353.0
es.coursera.org                3336.0
de.coursera.org                3162.0
ko.coursera.org                3053.0
codegalaxy.io                  3026.0
zh-tw.coursera.org             3023.0
www.goconqr.com                2471.0
www.lynda.com                  2017.0
www.verboeducacional.com.br    1526.0
www.taskutark.ee               1143.0
drops.dagstuhl.de              1135.0
www.haberankara.com            1113.0
www.lumoslearning.com          1013.0
mille-sorties.com               895.0
quiwego.com                     848.0
redooc.com                      832.0
studylibpl.com                  744.0
www.habe

In [4]:
top_ed_domain_data_files = glob('../data/processed/pld_data/*.csv')

In [5]:
top_ed_data = [(f, pd.read_csv(f, index_col=0)) for f in top_ed_domain_data_files]

In [7]:
def get_name_year(path):
    filename = path.split('/')[-1]
    name = filename.replace('.csv', '')
    pld, year = re.findall('([a-z_]+)_([0-9]+)', name)[0]
    year = int(year)
    pld = pld.replace('_', '.')
    return pld, year

# [(get_name_year(f), df.shape[0]) for (f, df) in top_ed_data]

In [8]:
def summary_stats(filepath, df):
    pld, year = get_name_year(filepath)
    s_count = df.s.nunique()
    p_count = df.p.nunique()
    ps_per_s_count = df.groupby('s').p.nunique().mean()
    return pd.Series({
        'pld': pld,
        'year': year,
        'ps_per_s_count': ps_per_s_count,
        's_count': s_count,
        'p_count': p_count
    })

sum_stats = pd.DataFrame([summary_stats(f, df) for (f, df) in top_ed_data]).groupby('pld').s_count.apply(list)

In [10]:
def get_title_desc(df):
    o_str_exists = (df.o.notna()) & (df.o.str.strip().str.len() > 0)
    descriptions = df[(df.p.str.contains('description')) & o_str_exists].o
    title = df[df.p.str.contains('name') & o_str_exists].o
    if (descriptions.shape[0] > 0) and (title.shape[0] > 0):
        return pd.Series({'desc': descriptions.iloc[0],
                          'title': title.iloc[0]})
    else:
        return pd.Series({'desc': None, 'title': None})


def prompt_template(title, description):
    return f"""The following text is the title and description of a course. Give me a list of learning objectives of the course, the subject the course belogs to and what level of student it is appropriate for.

    Text: \"\"\"Title: {title} Description: {description}\"\"\"

    Learning objectives, subject and student level:"""
    
prompts = []
for ed_data in top_ed_data:
    domain, year = get_name_year(ed_data[0])
    if ed_data[1].shape[0] > 0:
        title_descs = ed_data[1].groupby('s').apply(get_title_desc)
        for _, row in title_descs.iterrows():
            prompts.append({'domain': domain, 'year': year, 'title' : row.title, 'desc': row.desc, 
                            'prompt': prompt_template(row.title, row.desc)})
    
prompts = pd.DataFrame(prompts).drop_duplicates(subset=['prompt'])
prompts = prompts[prompts.desc.notna() & prompts.title.notna()]
from hashlib import md5
prompts['hashed_prompt'] = prompts.prompt.apply(lambda p: str(md5(p.encode('utf-8')).hexdigest()))

In [11]:
completed = glob('../data/gpt_results/*.completion_temp_07')
completed = [c.split('/')[-1].split('.')[0] for c in completed]

In [12]:
prompts[prompts.hashed_prompt.isin()]

['11c20f4567d5ca193395ced206023663',
 '918e206172e9088a7e73accaf4af9571',
 '35940e783db1d6b4e8bb463aa855d3f9',
 'b657bfaec4859c081b17c7e679e4ecbd',
 '6c6b2b86301045c40ecb2d381e0d9377',
 'd45d0507087672fecf64cf0705ef649c',
 'ffeec8def26e2b03411361cd1ffdce2e',
 'e81de8016850221cec739eda215dcf1a',
 '614f4b90fb12c118da3c4f82ea064411',
 '5ca23561ff249ebc66e6f888d6b82d23',
 '229a6c815590e6436e0ad304c22ecf4c',
 '7c6480d91327c23a398b3ebe602d5f47',
 '00a437ac61145672b6eda50f6035f045',
 '6bcf96e2798e4e2b17d72f6b235cb922',
 'a4ae6bb1711503b5aaa0b132a1c777ed',
 '142ecb6fb81fc3efc1b22ecda6aa0692',
 'c3cc470eb17dc6f7409d3ff50183c136',
 '372908bf3efac1d48c478a95ff05be86',
 'fd279a7ccc2c74c2cec02b6245519ee5',
 '8d8e4931eb30dee52923c7d2f9aca85a',
 '2005f8ff4da82ac955cba82bdef9910a',
 '2c19eea10189003c000cfaf969f69520',
 'db762b6a535973fb9750f1934f2c37ef',
 'e764b8e9c84d4361b2b75c1d8711d30f',
 'ad1694c17159aab3b9c54ae8a14619c0',
 'e361269fedf9f45642da1b0497fcb098',
 'c4d6b4425d44445196131a1278a9996d',
 

In [105]:
sample_prompts = prompts.groupby('domain').sample(frac=0.33)

In [106]:
sample_prompts.head(2)

Unnamed: 0,domain,year,title,desc,prompt,hashed_prompt
13455,es.coursera.org,2021,Business English: Final Project,Ofrecido por Universidad del Estado de Arizona...,The following text is the title and descriptio...,5027a9f6729e9e622232ad2c5743f5c0
14032,es.coursera.org,2021,Managing Security in Google Cloud,Ofrecido por Google Cloud. This self-paced tra...,The following text is the title and descriptio...,2d7f967dff41b3da101f10f9535fa28f


In [108]:
# !mkdir ../data/gpt_results

In [103]:
gpt_model='text-davinci-003'

In [104]:
gpt_temperature=0.1

In [105]:
gpt_max_length = 256

In [106]:
# sample_prompts.to_pickle('../data/sample_prompts.pickle')

In [107]:
sample_prompts = pd.read_pickle('../data/sample_prompts.pickle')

In [None]:
def save_completion_to_file_if_not_exists(filename, prompt):
    if not os.path.exists(filename):
        completion = generate_text(prompt)
        with open(filename, 'w') as f:
            if completion is not None:
                f.write(completion)
            else:
                f.write("NO_COMPLETION")
            print(f"File '{filename}' created and contents saved successfully.")
    else:
        print(f"File '{filename}' already exists. Contents not saved.")
        
openai.api_key = os.environ["OPENAI_API_KEY"]  # Set your API key as an environment variable

def generate_text(prompt, model=gpt_model, temperature=gpt_temperature, max_tokens=gpt_max_length):
    try:
        response = openai.Completion.create(
            engine=model,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )

        if response.choices:
            return response.choices[0].text.strip()
        else:
            return "NO_COMPLETION"

    except openai.error.InvalidRequestError as e:
        print(f"Invalid request error: {e}")
        return "ERROR: InvalidRequest"

    except openai.error.AuthenticationError as e:
        print(f"Authentication error: {e}")
        return "ERROR: Authentication"

    except openai.error.APIConnectionError as e:
        print(f"API connection error: {e}")
        return "ERROR: APIConneection"

    except openai.error.OpenAIError as e:
        print(f"OpenAI error: {e}")
        return "ERROR: OpenAI"


# sample_prompts.shape
running_set = sample_prompts.sample(1000)
# running_set = sample_prompts[sample_prompts.hashed_prompt.isin(completed)]
for _, prompt in tqdm(running_set.iterrows(), total=running_set.shape[0]):
    save_completion_to_file_if_not_exists(f'../data/gpt_results/{prompt.hashed_prompt}.completion_temp_01', 
                               prompt.prompt)

  0%|                                 | 1/1000 [00:03<1:00:07,  3.61s/it]

File '../data/gpt_results/db762b6a535973fb9750f1934f2c37ef.completion_temp_01' created and contents saved successfully.


  0%|                                 | 2/1000 [00:08<1:13:03,  4.39s/it]

File '../data/gpt_results/abe98ae1c92bd7ad41fc649405b9a7fc.completion_temp_01' created and contents saved successfully.


  0%|                                 | 3/1000 [00:15<1:33:52,  5.65s/it]

File '../data/gpt_results/dcdbec43602ac3d81793c32d33e5a77b.completion_temp_01' created and contents saved successfully.


  0%|▏                                | 4/1000 [00:21<1:32:19,  5.56s/it]

File '../data/gpt_results/c1df44ca037a53b1d14ba17b6eb58483.completion_temp_01' created and contents saved successfully.


  0%|▏                                | 5/1000 [00:37<2:34:28,  9.32s/it]

File '../data/gpt_results/c26d6653423ac877850a589eef1b1b47.completion_temp_01' created and contents saved successfully.


In [80]:
completed = glob('../data/gpt_results/*.completion_temp_07')
completed = [c.split('/')[-1].split('.')[0] for c in completed]

In [82]:
# import shutil 

# for c in completed:
#     shutil.move(f'../data/gpt_results/{c}.completion', f'../data/gpt_results/{c}.completion_temp_07')

In [100]:
import re

overall_pattern_1 = re.compile('^Learning [Oo]bjectives:(.*)Subject: (.*)Student [Ll]evel: (.*)', flags=re.DOTALL)
overall_pattern_2 = re.compile('^(1\..*)Subject: (.*)Student [Ll]evel: (.*)', flags=re.DOTALL)
split_los_pattern = re.compile('[0-9]+\.', re.DOTALL)

def parse_completion(c):
    segments = overall_pattern_1.findall(c)
    if len(segments) == 0:
        segments = overall_pattern_2.findall(c)
    if len(segments) > 0:
        learning_objectives = [lo.strip() 
                               for lo 
                               in re.split(split_los_pattern, segments[0][0].strip()) 
                               if lo.strip()]
        subject = segments[0][1].strip()
        level = segments[0][2].strip()
        return learning_objectives, subject, level
    else:
        return None
        

In [101]:
with open('../data/gpt_results/e361269fedf9f45642da1b0497fcb098.completion_temp_07') as f:
    completion = f.read()
print(completion)
parse_completion(completion)

1. Describe the process of respiration in the human body
2. Explain the role of the lungs in the respiration process
3. Identify the role of oxygen and CO₂ in the body
4. Summarize the medical understanding of respiration

Subject: Human Biology
Student Level: Intermediate/Advanced


(['Describe the process of respiration in the human body',
  'Explain the role of the lungs in the respiration process',
  'Identify the role of oxygen and CO₂ in the body',
  'Summarize the medical understanding of respiration'],
 'Human Biology',
 'Intermediate/Advanced')

In [102]:
for c in completed:
    with open(f'../data/gpt_results/{c}.completion_temp_07') as f:
        completion = f.read()
    prompt = sample_prompts[sample_prompts.hashed_prompt == c]
    print("ID", c)
    print("Domain: ", prompt.domain.iloc[0])
    print("Year: ", prompt.year.iloc[0])
    print("Title: ", prompt.title.iloc[0])
    print("Description: ", prompt.desc.iloc[0])
    completion = parse_completion(completion)
    if completion:
        los, subject, level = completion
        print("Completion: ")
        print("-"*30)
        print("Learning Objectives:")
        for i, lo in enumerate(los):
            print(i, lo)
        print("Subject: ", subject)
        print("Level: ", level)
    else:
        print("UNPARSABLE COMPLETION:", completion)
    print("="*30)

ID 11c20f4567d5ca193395ced206023663
Domain:  ja.coursera.org
Year:  2021
Title:  Comparing Genes, Proteins, and Genomes (Bioinformatics III)
Description:  カリフォルニア大学サンディエゴ校による提供。Once we have sequenced genomes in the previous course, we would like to compare them to determine how species have evolved and ... 無料で登録。
Completion: 
------------------------------
Learning Objectives:
0 Understand the fundamentals of gene and protein comparison.
1 Learn the principles of genomic comparison.
2 Analyze data from genome sequences.
3 Compare different genomes and proteins to identify evolutionary relationships.
Subject:  Bioinformatics
Level:  Advanced
ID 918e206172e9088a7e73accaf4af9571
Domain:  es.coursera.org
Year:  2021
Title:  Los obstáculos y la conducción en las negociaciones
Description:  Ofrecido por Universidad Nacional Autónoma de México. En este curso se analizará para su aplicación la ubicación y señalamiento de los ... Inscríbete gratis.
Completion: 
------------------------------
Le

In [17]:
another_running_set.hashed_prompt

11473    4d392dbe11e2878db8748ef7e9f44660
29464    ed7b17233fc0204525fa1c5026aaf4f5
36749    8469308ef2a0f9c12d7d5eed9aecf1c5
11223    1abe305c2fbc960fba72285d1527c561
42760    0e96dce7ee44c2dddf0c401eb79930fd
37952    7bb7cad6c382d6da707c4aedb906eab5
35978    af27a045afa280b62f6c2f3de102be13
2396     f85c9217c6f1dc05fdfdcae11081feca
11986    3611ebe17e0d3330538e762f549f0219
38541    439da497dfce0444dd97868b07b523af
Name: hashed_prompt, dtype: object

In [96]:
prompt_texts = set()
prompt_lengths = []

for _, prompt in sample_prompts.iterrows():
    prompt_text = prompt.prompt
    prompt_texts.add(prompt_text)
    prompt_lengths.append(len(prompt_text.split(' ')))

len(set(prompt_texts)), sum(prompt_lengths)

(9722, 820651)

In [98]:
prompt_texts

{'The following text is the title and description of a course. Give me a list of learning objectives of the course, the subject the course belogs to and what level of student it is appropriate for.\n\n    Text: """Title: Yalan Dünya\'nın yıldızı saç davasını kaybetti! Description: O dava sonuçlandı!"""\n\n    Learning objectives, subject and student level:',
 'The following text is the title and description of a course. Give me a list of learning objectives of the course, the subject the course belogs to and what level of student it is appropriate for.\n\n    Text: """Title: Influencing: Storytelling, Change Management and Governance Description: The ability to influence without force or coercion is important in any organization and at all levels. Modern managers must master the art of persuasive communication, adapt to the needs of the audience, construct a good narrative and inspire the audience to act (Communicate with Impact). They must learn to design and negotiate both informal a

In [76]:
17 / 50

0.34

In [61]:
title_descs_sample.title.iloc[0]

'Grow Your Business with Goldman Sachs 10,000 Women'

In [49]:
[(f, df[df.p.str.contains('description')].o.value_counts()) 
 for (f, df) 
 in top_ed_data]

[('../data/processed/pld_data/ja_coursera_org_2020.csv',
  The Foundations of Teaching for Learning programme is for anyone who is teaching, or who would like to teach, in any subject and any context - be it at school, at home or in the workplace.                                                                                                                                                                              10
  Este curso es uno de los 10 cursos disponibles en la colección Goldman Sachs 10,000 Women, diseñado para emprendedoras listas para llevar su negocio al siguiente nivel.                                                                                                                                                                                                   9
  Prove to potential employers that you’re up to the task by becoming an Autodesk Certified Professional.                                                                                                        