In [1]:
import json
import time
import os
import numpy as np
import pandas as pd
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
# Initialize the OpenAI API client
client = OpenAI()  # API anahtarınızı ortam değişkeninden alın

# Define custom exception for rate limiting
class RateLimitException(Exception):
    pass

# Veri yükleme/kaydetme işlemleri için yardımcı fonksiyonlar
def load_progress():
    """Load existing progress and entity database from files."""
    try:
        if os.path.exists("processing_progress.json"):
            with open("processing_progress.json", "r", encoding="utf-8") as f:
                progress = json.load(f)
                print(f"Loaded progress: Processed {progress['processed_count']} theses")
                return progress
        return {
            "processed_count": 0,
            "last_processed_index": -1,
            "failed_indices": []
        }
    except Exception as e:
        print(f"Error loading progress: {e}")
        return {
            "processed_count": 0,
            "last_processed_index": -1,
            "failed_indices": []
        }

def load_entity_database():
    """Load existing entity database if available."""
    try:
        if os.path.exists("entity_database.json"):
            with open("entity_database.json", "r", encoding="utf-8") as f:
                database = json.load(f)
                entity_count = sum(len(entities) for entities in database.values())
                print(f"Loaded entity database with {entity_count} entities")
                return database
        return {
            "STAKEHOLDER": [],
            "PROBLEM_CHALLENGE": [],
            "SOLUTION_APPROACH": [],
            "FOCUS_AREA_THEME": []
        }
    except Exception as e:
        print(f"Error loading entity database: {e}")
        return {
            "STAKEHOLDER": [],
            "PROBLEM_CHALLENGE": [],
            "SOLUTION_APPROACH": [],
            "FOCUS_AREA_THEME": []
        }

def save_progress(progress):
    """Save current processing progress."""
    with open("processing_progress.json", "w", encoding="utf-8") as f:
        json.dump(progress, f, indent=2)

def save_entity_database(database):
    """Save current entity database."""
    with open("entity_database.json", "w", encoding="utf-8") as f:
        json.dump(database, f, indent=2)

def save_results(results):
    """Save processed thesis results."""
    with open("thesis_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

def get_system_prompt(existing_entities_json):
    """Generate the system prompt with the current entity database."""
    return f"""You are a specialized academic text analyzer who focuses on extracting entities and their relationships from distance education thesis abstracts. Your task is to extract four types of entities (stakeholders, problems/challenges, solutions/approaches, and focus areas/themes) and their relationships.

EXISTING ENTITIES DATABASE:
{json.dumps(existing_entities_json, indent=2)}

ENTITY EXTRACTION RULES:
1. STAKEHOLDER:
   - Extract people, groups, or institutions involved in or affected by distance education
   - Examples: student, teacher, researcher, university, school administrator, educational institution
   - Always use singular form and standardized terms (e.g., "student" not "students")
   - Balance specificity with generality - use specific terms (e.g., "physics teacher") only when the specialty is important to the context
   - Educational institutions should be extracted with their specific names when available (e.g., "National Defense University" rather than generic "university")
   - Include "thesis" itself as a STAKEHOLDER to enable proper EXAMINES relationships
   - FIRST CHECK if the stakeholder already exists in the database and USE THE EXISTING ENTITY NAME for consistency
   
2. PROBLEM_CHALLENGE:
   - Extract issues, barriers, difficulties, or challenges mentioned in distance education
   - Examples: internet access issue, student motivation problem, assessment difficulty, lack of interaction
   - Include both technical problems (infrastructure) and pedagogical challenges (engagement)
   - Use descriptive phrases that identify both the problem area and its specific manifestation (e.g., "low technological pedagogical content knowledge" rather than just "knowledge gap")
   - Focus on core problems, not symptoms
   - FIRST CHECK if the problem/challenge already exists in the database and USE THE EXISTING ENTITY NAME for consistency
   
3. SOLUTION_APPROACH:
   - Extract methods, strategies, models, frameworks, implementations, or technologies that address challenges
   - Examples: hybrid education model, in-service training, cooperative learning, flipped classroom, learning management system, video conferencing tool
   - Include both pedagogical approaches, technological tools, and organizational/administrative solutions
   - Technologies should be specific when possible (e.g., "learning management system" rather than general "technology")
   - Include data collection tools when they are significant to the solution (e.g., "mobile learning application", "student assessment platform")
   - Do not include research methodologies (statistical analysis, content analysis, etc.) in this category
   - Do not classify "distance education" itself as a solution approach, it should be a FOCUS_AREA_THEME
   - FIRST CHECK if the solution/approach already exists in the database and USE THE EXISTING ENTITY NAME for consistency
   
4. FOCUS_AREA_THEME:
   - Extract main research areas, contexts, or themes that the thesis examines
   - Examples: digital competency, teacher attitudes, student motivation, academic achievement
   - "Distance education" itself should typically be classified as a FOCUS_AREA_THEME, not a solution
   - When "technology integration" is mentioned, prefer the more specific term "technology integration in education" for clarity
   - Use a granular approach for educational contexts (e.g., "science education" rather than just "education")
   - Always identify at least one primary focus area for each thesis
   - Include educational levels (higher education, K-12) and subject domains when relevant
   - FIRST CHECK if the focus area/theme already exists in the database and USE THE EXISTING ENTITY NAME for consistency

RELATIONSHIP RULES:
1. USES:
   - Use when a stakeholder employs a solution/approach
   - Example: "faculty member" USES "learning management system"
   - Example: "classroom teacher" USES "video support"
   - This relationship typically connects STAKEHOLDER with SOLUTION_APPROACH
   - Note: All technological tools and platforms are now classified under SOLUTION_APPROACH

2. FACES:
   - Use when a stakeholder encounters a problem or challenge
   - Example: "UZEM manager" FACES "systemic problems"
   - Example: "student" FACES "transportation problem"
   - This relationship typically connects STAKEHOLDER with PROBLEM_CHALLENGE

3. REQUIRES:
   - Use when one entity necessitates another entity
   - Example: "distance education" REQUIRES "learning management system"
   - Example: "hybrid education system" REQUIRES "new generation learning technologies"
   - This relationship can connect various entity types where one depends on the other
   - Especially useful for connecting FOCUS_AREA_THEME with necessary SOLUTION_APPROACH entities

4. ADDRESSES:
   - Use when a solution targets a specific problem or when a focus area aims at a stakeholder
   - Example: "in-service training program" ADDRESSES "digital competency gap"
   - Example: "learning management system" ADDRESSES "lack of interaction"
   - This relationship typically connects SOLUTION_APPROACH with PROBLEM_CHALLENGE
   - When a technological solution addresses a problem, always create this relationship

5. ENHANCES:
   - Use when one entity positively affects another entity
   - Example: "technology integration in education" ENHANCES "course permanence"
   - Example: "video support" ENHANCES "student interest"
   - This relationship indicates positive impact between various entity types
   - Can connect SOLUTION_APPROACH with FOCUS_AREA_THEME or with STAKEHOLDER

6. HINDERS:
   - Use when one entity negatively affects another entity
   - Example: "technological inadequacy" HINDERS "distance education quality"
   - Example: "lack of parent support" HINDERS "student success"
   - This relationship indicates negative impact between various entity types

7. EXAMINES:
   - Use when a thesis or research investigates a specific topic or area
   - Example: "thesis" EXAMINES "digital competency levels"
   - Example: "research" EXAMINES "distance education management"
   - ALWAYS create at least one EXAMINES relationship for each thesis
   - This relationship typically connects "thesis" (which should be extracted as a STAKEHOLDER) with the main FOCUS_AREA_THEME

# SPECIFIC GUIDELINES

1. Extract each unique entity only once in its standardized form
2. All entities should be in English, singular form, and lowercase format (except for proper nouns)
3. Entity names should be concise but descriptive (2-5 words recommended)
4. Each relationship must use entities from your extracted list
5. For each entity category, extract 1-7 items depending on thesis content and complexity
6. Create at least 5 meaningful relationships that are explicitly stated or strongly implied in the text
7. Prioritize explicitly stated relationships; avoid speculation or reaching conclusions not supported by the text
8. ALWAYS check if an entity already exists in the database before creating a new one
9. When you find a match or similar concept in the existing database, USE THAT ENTITY NAME rather than creating a variant
10. Only create new entities when you are confident no suitable match exists in the database
11. For each thesis, ALWAYS create at least one EXAMINES relationship to capture the main research focus
12. Maintain categorical consistency: the same entity should not appear in multiple categories (e.g., "distance education" should be consistently classified as FOCUS_AREA_THEME)
13. Research methodologies (statistical analysis, content analysis, descriptive analysis) should NOT be extracted as entities in any category
14. Standardize similar entities: use consistent naming for similar concepts (e.g., choose either "teacher" or "instructor" consistently)
15. Use only the seven defined relationship types and maintain consistent directionality:
    - USES: stakeholder → solution/approach they employ
    - FACES: stakeholder → problem/challenge they encounter
    - ADDRESSES: solution/approach → problem/challenge it solves
    - ENHANCES: entity → entity it improves or strengthens
    - REQUIRES: entity → entity it needs to function effectively
    - HINDERS: entity → entity it impedes or negatively affects
    - EXAMINES: thesis → focus area/theme being researched
16. All technological tools, platforms, and educational technologies should be classified under SOLUTION_APPROACH when they are presented as solutions to educational challenges
17. Ensure that each technological solution classified under SOLUTION_APPROACH has at least one meaningful relationship showing either who USES it, what problem it ADDRESSES, or what it ENHANCES
18. Use descriptive phrases for problems that identify both the problem area and its specific manifestation (e.g., "low technological pedagogical content knowledge" rather than just "knowledge gap")
19. For educational institutions, extract with their specific names when available (e.g., "National Defense University" rather than just "university")
20. Always include "thesis" as a STAKEHOLDER to enable proper EXAMINES relationships
21. Aim for a balanced distribution of relationship types when the content supports it
22. Use ENHANCES only when clear positive impact is stated in the text
23. Use REQUIRES only when one entity is clearly dependent on another
24. Consider including HINDERS relationships when obstacles, barriers, or negative effects are mentioned

JSON output:
{{
  "thesis_id": "<string>",
  "year": "<string|number>",
  "entities": {{
    "STAKEHOLDER": [
      "<ENTITY_NAME_1>",
      "<ENTITY_NAME_2>"
    ],
    "PROBLEM_CHALLENGE": [
      // ...
    ],
    "SOLUTION_APPROACH": [
      // ...
    ],
    "FOCUS_AREA_THEME": [
      // ...
    ]
  }},
  "relations": [
    {{
      "source": "<ENTITY_NAME>",
      "target": "<ENTITY_NAME>",
      "relation": "<RELATION_TYPE>"
    }}
    // Relation objects...
  ]
}}
"""

def parse_json_from_response(response_text):
    """Extract JSON from the response text, handling different formats."""
    # Try to parse directly first
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        # Try to extract JSON from markdown code blocks
        if "```json" in response_text:
            json_text = response_text.split("```json")[1].split("```")[0].strip()
            return json.loads(json_text)
        elif "```" in response_text:
            json_text = response_text.split("```")[1].split("```")[0].strip()
            return json.loads(json_text)
        else:
            # Try to find JSON-like structure
            start_idx = response_text.find("{")
            end_idx = response_text.rfind("}") + 1
            if start_idx >= 0 and end_idx > 0:
                json_text = response_text[start_idx:end_idx]
                return json.loads(json_text)
            raise ValueError("Could not extract JSON from response")

@retry(
    retry=retry_if_exception_type(RateLimitException),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    reraise=True
)
def generate_entities_with_retry(input_data, existing_entities_json):
    """Generate entities from the input data using the OpenAI model with retry logic."""
    try:
        # Get the current system prompt with updated entity database
        system_prompt = get_system_prompt(existing_entities_json)
        
        # Format the input data for GPT
        thesis_text = f"""
        Thesis ID: {input_data['thesis_id']}
        Year: {input_data['year']}
        Title: {input_data['title']}
        Abstract: {input_data['abstract']}
        """
        
        # Generate content using the OpenAI model
        response = client.chat.completions.create(
            model="gpt-4.1",  # GPT-4o modeli kullanıyoruz (GPT-4 Omni)
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": thesis_text}
            ],
            temperature=0.3,
            response_format={"type": "json_object"},
            max_tokens=8192,
        )
        
        # Parse the response
        result = parse_json_from_response(response.choices[0].message.content)
        return result
        
    except Exception as e:
        error_msg = str(e).lower()
        # Check if it's a rate limit error
        if "rate limit" in error_msg or "quota" in error_msg or "too many requests" in error_msg:
            print(f"Rate limit exceeded. Retrying after backoff: {e}")
            raise RateLimitException(f"Rate limit error: {e}")
        else:
            print(f"Error generating content: {e}")
            raise

def update_entity_database(result, existing_entities_json):
    """Update the existing entity database with new entities from the result."""
    if not result or "entities" not in result:
        print("No valid entities found in result")
        return existing_entities_json
    
    try:
        entities = result['entities']
        
        # Extract entities from each category
        for category in ["STAKEHOLDER", "PROBLEM_CHALLENGE", "SOLUTION_APPROACH", "FOCUS_AREA_THEME"]:
            # Get new entities
            new_entities = entities.get(category, [])
            
            # Add new entities to existing ones (maintaining uniqueness)
            existing_entities_json[category] = list(set(existing_entities_json.get(category, []) + new_entities))
        
        return existing_entities_json
    except Exception as e:
        print(f"Error updating entity database: {e}")
        return existing_entities_json

def process_thesis(thesis_data, existing_entities_json):
    """Process a single thesis and update the entity database."""
    try:
        # Format the thesis data for processing
        input_data = {
            "thesis_id": str(thesis_data.get("tez_no", "")),
            "year": thesis_data.get("year", ""),
            "title": thesis_data.get("en_title", ""),
            "abstract": thesis_data.get("abstract_en", "")
        }
        
        # Generate entities from the thesis data
        result = generate_entities_with_retry(input_data, existing_entities_json)
        
        if result is None:
            print(f"Failed to process thesis ID: {input_data['thesis_id']}")
            return existing_entities_json, None
        
        # Add thesis_id and year to the result
        result["thesis_id"] = input_data["thesis_id"]
        result["year"] = input_data["year"]
        
        # Update the entity database
        updated_entities_json = update_entity_database(result, existing_entities_json)
        
        return updated_entities_json, result
    
    except Exception as e:
        print(f"Error processing thesis: {e}")
        return existing_entities_json, None

def process_dataframe(df, start_from_index=0, batch_size=None, retry_failed=True):
    """Process theses from a pandas DataFrame sequentially."""
    # Load existing progress and data
    progress = load_progress()
    entity_database = load_entity_database()
    
    # Load existing results if available
    all_results = []
    if os.path.exists("thesis_results.json"):
        try:
            with open("thesis_results.json", "r", encoding="utf-8") as f:
                all_results = json.load(f)
                print(f"Loaded {len(all_results)} previous results")
        except Exception as e:
            print(f"Error loading previous results: {e}")
    
    # Determine starting index - use the highest of provided index, saved index
    start_index = max(start_from_index, progress["last_processed_index"] + 1)
    
    # If we have failed indices and retry_failed is True, process failed ones first
    failed_indices = progress.get("failed_indices", [])
    if failed_indices and retry_failed:
        print(f"Processing {len(failed_indices)} previously failed theses first")
        for idx in failed_indices[:]:  # Create a copy to iterate over while modifying
            if idx < len(df):
                thesis = df.iloc[idx].to_dict()
                thesis_id = str(thesis.get('tez_no', 'unknown'))
                print(f"Retrying thesis ID: {thesis_id} (index {idx})")
                
                try:
                    # Process the thesis with the CURRENT entity database
                    entity_database, result = process_thesis(thesis, entity_database)
                    
                    if result:
                        # Add or update result in all_results
                        result_idx = next((i for i, r in enumerate(all_results) 
                                        if r.get('thesis_id') == thesis_id), None)
                        if result_idx is not None:
                            all_results[result_idx] = result
                        else:
                            all_results.append(result)
                        
                        # Remove from failed indices
                        failed_indices.remove(idx)
                        
                        # Update progress
                        progress["processed_count"] += 1
                    
                    # Save progress after each thesis
                    progress["failed_indices"] = failed_indices
                    save_progress(progress)
                    save_entity_database(entity_database)
                    save_results(all_results)
                    
                    # Add a small delay to avoid hitting rate limits
                    time.sleep(0.5)
                    
                except Exception as e:
                    print(f"Error retrying thesis at index {idx}: {e}")
    
    # Process the main batch
    print(f"Starting thesis processing from index {start_index}")
    
    # Determine end index
    end_index = len(df)
    if batch_size:
        end_index = min(start_index + batch_size, len(df))
    
    # Process each thesis in the range sequentially
    for i in range(start_index, end_index):
        thesis = df.iloc[i].to_dict()
        thesis_id = str(thesis.get('tez_no', 'unknown'))
        
        # Check if this thesis was already processed
        if any(r.get('thesis_id') == thesis_id for r in all_results):
            print(f"Skipping already processed thesis ID: {thesis_id} (index {i})")
            continue
        
        print(f"Processing thesis ID: {thesis_id} (index {i}, {i-start_index+1}/{end_index-start_index})")
        
        try:
            # Process the thesis with the CURRENT entity database
            entity_database, result = process_thesis(thesis, entity_database)
            
            if result:
                # Add or update result in all_results
                result_idx = next((i for i, r in enumerate(all_results) 
                                if r.get('thesis_id') == thesis_id), None)
                if result_idx is not None:
                    all_results[result_idx] = result
                else:
                    all_results.append(result)
                
                # Update progress
                progress["processed_count"] += 1
            else:
                # Add to failed indices
                if i not in failed_indices:
                    failed_indices.append(i)
            
            # Update and save progress
            progress["last_processed_index"] = i
            progress["failed_indices"] = failed_indices
            save_progress(progress)
            save_entity_database(entity_database)
            save_results(all_results)
            
            # Add a small delay to avoid hitting rate limits
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error processing thesis at index {i}: {e}")
            # Add to failed indices
            if i not in failed_indices:
                failed_indices.append(i)
            progress["failed_indices"] = failed_indices
            save_progress(progress)
    
    print("Processing complete!")
    return entity_database, all_results

# # Programı çalıştırma
# def main():
#     # DataFrame'i yükle
#     df = pd.read_csv("thesis_data.csv")  # Dosya yolunu doğru şekilde ayarlayın
    
#     # İşlemeyi başlat
#     process_dataframe(df, start_from_index=0, batch_size=None, retry_failed=True)

# if __name__ == "__main__":
#     main()

In [2]:
import pandas as pd
data = pd.read_csv('/mnt/depo/YüksekLisans/dönem-projesi/cleaned-data/uzaktan-eğitim-cleaned_english.csv')
df = data.copy()
print(df.shape)

(703, 5)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 703 entries, 0 to 702
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tez_no       703 non-null    int64 
 1   year         703 non-null    int64 
 2   university   703 non-null    object
 3   abstract_en  703 non-null    object
 4   en_title     701 non-null    object
dtypes: int64(2), object(3)
memory usage: 27.6+ KB


In [4]:
df.head()

Unnamed: 0,tez_no,year,university,abstract_en,en_title
0,782422,2023,Milli Savunma Üniversitesi,The hybrid system has become a necessity of to...,Classroom model designed with new generation l...
1,821224,2023,Muğla Sıtkı Koçman Üniversitesi,The main purpose of this study is to evaluate ...,Evaluation of the digital competence levels of...
2,739239,2022,Kırşehir Ahi Evran Üniversitesi,This research aims to determine the level of t...,Determination of classroom teacher\'s technolo...
3,724056,2022,Sakarya Üniversitesi,"In this research, it is aimed to specify the p...",The problems and solution strategies encounter...
4,722422,2022,Ondokuz Mayıs Üniversitesi,If you want to take special talented individua...,Evaluation of distance education applications ...


In [9]:
# Programı çalıştırma
def main():
    # DataFrame'i yükle
    # df = pd.read_csv("thesis_data.csv")  # Dosya yolunu doğru şekilde ayarlayın
    
    # İşlemeyi başlat
    process_dataframe(df, start_from_index=0, batch_size=None, retry_failed=True)

if __name__ == "__main__":
    main()

Loaded progress: Processed 700 theses
Error loading entity database: Expecting value: line 1 column 1 (char 0)
Loaded 700 previous results
Processing 3 previously failed theses first
Retrying thesis ID: 117810 (index 695)
Retrying thesis ID: 100177 (index 696)
Retrying thesis ID: 117774 (index 697)
Starting thesis processing from index 703
Processing complete!


In [7]:
df.iloc[695].abstract_en

'The purpose of this study is to compare the academic success of Open High School (OHS) - Professional Open Teaching Program (POTP) students and Girl Vocational High School (GVHS) students. The population of this study, during 1999-2000 education semester second term, consisted of the students of clothing department in Ziibeyde Hanım Girl Vocational School and the students of OHS-POTP. All (25) POTP students, 36 students chosen from GVHS were included randomly in this study. In the study, "The Information Summative Test" which is used for measuring the level of cognitive behaviours of the students, was prepared as to the purposes of GVHS Educational Programs. It was obtained the validity and the reliability of the work by testing. It was asked 55 for the first class, 70 for the second class, 80 questions for the third class. The test was done as a pre-test at the beginning of the term and as a post-test at the end of the term. Counting the pre_post-tests scores of the students, it was 