In [None]:
%load_ext autoreload
%autoreload 2

from modules.auth import *
from modules.assessments_endpoints import *
from modules.frame_transformations import *
from modules.config import base_url_illuminate
import logging
import os
import sys
from pyspark import RDD
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that is handed to the assessment fetch below.
spark = (
    SparkSession.builder
    .appName("API Request Parallelization")
    .getOrCreate()
)

# Have Spark emit its own log output at INFO level.
spark.sparkContext.setLogLevel("INFO")


# Send every log record to stdout so it appears in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    handlers=[stdout_handler],
    level=logging.INFO,  # Raise or lower as needed (e.g., DEBUG, WARNING)
    datefmt="%d-%b-%y %H:%M:%S",  # Timestamp layout
    format="%(asctime)s - %(message)s",  # Record layout
    force=True,  # Replace any handlers already attached to the root logger
)


def get_assessment_results(spark, save_path, view_path, years_data, start_date, end_date_override=None):
    """Fetch Illuminate assessment results and write them to local CSV files.

    Retrieves an access token and the assessments metadata, fetches the
    'Group', 'Standard' and 'No_Standard' score sets through
    ``parallel_get_assessment_scores``, combines the standard / no-standard
    results, builds the view frame, and writes everything out with
    ``send_to_local``.

    Args:
        spark: Spark session forwarded to ``parallel_get_assessment_scores``.
        save_path: Directory receiving the group, combined and metadata CSVs.
        view_path: Directory receiving the view CSV.
        years_data: School year selector; must be ``'23-24'`` or ``'24-25'``.
            It picks the output file names and is forwarded to
            ``create_test_results_view``.
        start_date: Forwarded unchanged to ``parallel_get_assessment_scores``.
        end_date_override: Optional end date forwarded to all three score
            fetches. Handling of ``None`` is decided by the callee
            (presumably today's date -- verify in ``assessments_endpoints``).

    Returns:
        None. Results are written to disk.

    Raises:
        AirflowException: If any step fails, including an unsupported
            ``years_data`` value. The original error is attached as the cause.
    """
    logging.info('\n\n-------------New Illuminate Operations Logging Instance')

    # Output file names per school year: (group, combined, view).
    output_files = {
        '23-24': (
            'assessment_results_group_historical.csv',
            'assessment_results_combined_historical.csv',
            'illuminate_assessment_results_historical.csv',
        ),
        '24-25': (
            'assessment_results_group.csv',
            'assessment_results_combined.csv',
            'illuminate_assessment_results.csv',
        ),
    }

    try:
        # Validate up front so a bad selector fails before any API calls are made.
        if years_data not in output_files:
            raise ValueError(f'Unexpected value for years variable data {years_data}')
        group_file, combined_file, view_file = output_files[years_data]

        access_token, expires_in = get_access_token()

        assessments_df, assessment_id_list = get_all_assessments_metadata(access_token)
        # NOTE(review): testing limiter -- only the first 100 ids are processed.
        # Remove this slice before relying on the output as a full extract.
        assessment_id_list = assessment_id_list[:100]
        # Assessments that are not present in the assessments metadata endpoint.
        missing_ids_from_metadata = ['114845', '141498']
        # De-duplicate while keeping a stable order (dict keys preserve insertion order).
        assessment_id_list = list(dict.fromkeys(assessment_id_list + missing_ids_from_metadata))
        logging.info(f'Here is the length of the assessment_id_list variable {len(assessment_id_list)}')

        # The same date window is applied to all three fetches; the per-fetch
        # log outputs returned alongside the results are not used here.
        test_results_group, _ = parallel_get_assessment_scores(
            spark, access_token, assessment_id_list, 'Group', start_date, end_date_override)
        test_results_standard, _ = parallel_get_assessment_scores(
            spark, access_token, assessment_id_list, 'Standard', start_date, end_date_override)
        test_results_no_standard, _ = parallel_get_assessment_scores(
            spark, access_token, assessment_id_list, 'No_Standard', start_date, end_date_override)

        test_results_combined = bring_together_test_results(test_results_no_standard, test_results_standard)
        # Adds the grade level column via string matching.
        test_results_view = create_test_results_view(test_results_combined, years_data)
        logging.info("Assessment results fetched and processed.")

        # Both destinations must exist before anything is written to them.
        os.makedirs(save_path, exist_ok=True)
        os.makedirs(view_path, exist_ok=True)

        logging.info(f'Sending data for {years_data} school year')
        send_to_local(save_path, test_results_group, group_file)
        send_to_local(save_path, test_results_combined, combined_file)
        send_to_local(view_path, test_results_view, view_file)

        # Always refresh the assessments metadata file to show available assessments.
        send_to_local(save_path, assessments_df, 'assessments_metadata.csv')

    except Exception as e:
        logging.error(f"Error fetching assessment results: {e}")
        # NOTE(review): AirflowException is not imported explicitly in this file;
        # presumably it comes in through one of the star imports -- verify.
        raise AirflowException("Failed to fetch and process assessment results") from e


# Run the extract for the 24-25 school year, starting from 2024-07-01.
run_kwargs = {
    'save_path': '/home/g2015samtaylor/illuminate',
    'view_path': '/home/g2015samtaylor/views',
    'years_data': '24-25',
    'start_date': '2024-07-01',
}
get_assessment_results(spark, **run_kwargs)

# end_date_override='2024-07-01' #should default to todays date



#Create spark session in main script
#Merge branch with main for feature enhancement practice.

#Add to requirements.txt
#Re-initiate docker with a new spark tag, in case we need to roll back
#Update changes in docker file
#Make sure changes flow through to airflow
#Run locally for string matching


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


25/01/12 23:13:04 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/01/12 23:13:04 INFO SharedState: Warehouse path is 'file:/home/g2015samtaylor/airflow/git_directory/pyspark_local/ICEF_Illuminate/spark-warehouse'.
25/01/12 23:13:05 INFO BlockManagerInfo: Removed broadcast_1_piece0 on icef-instance-2.us-west2-a.c.icef-437920.internal:45131 in memory (size: 3.9 KiB, free: 366.3 MiB)
25/01/12 23:13:05 INFO BlockManagerInfo: Removed broadcast_2_piece0 on icef-instance-2.us-west2-a.c.icef-437920.internal:45131 in memory (size: 3.9 KiB, free: 366.3 MiB)


12-Jan-25 23:13:06 - 

-------------New Illuminate Operations Logging Instance
12-Jan-25 23:13:06 - Calling API token endpoint
12-Jan-25 23:13:06 - Succesfully retrieved API token
12-Jan-25 23:13:06 - Fetching data from https://icefps.illuminateed.com/live/rest_server.php/Api/Assessments/?page=1&limit=1000
12-Jan-25 23:13:07 - Here is the total num of pages on this endpoint 2
12-Jan-25 23:13:07 - Fetching data from https://icefps.illuminateed.com/live/rest_server.php/Api/Assessments/?page=2&limit=1000
12-Jan-25 23:13:07 - Here is the total num of pages on this endpoint 2
12-Jan-25 23:13:07 - Looped through 2 pages. Results for func get_all_assessments_metadata output into DataFrame
12-Jan-25 23:13:07 - Here is the length of the assessment_id_list variable 102


25/01/12 23:13:07 INFO SparkContext: Starting job: collect at /home/g2015samtaylor/airflow/git_directory/pyspark_local/ICEF_Illuminate/modules/assessments_endpoints.py:226
25/01/12 23:13:07 INFO DAGScheduler: Got job 3 (collect at /home/g2015samtaylor/airflow/git_directory/pyspark_local/ICEF_Illuminate/modules/assessments_endpoints.py:226) with 4 output partitions
25/01/12 23:13:07 INFO DAGScheduler: Final stage: ResultStage 3 (collect at /home/g2015samtaylor/airflow/git_directory/pyspark_local/ICEF_Illuminate/modules/assessments_endpoints.py:226)
25/01/12 23:13:07 INFO DAGScheduler: Parents of final stage: List()
25/01/12 23:13:07 INFO DAGScheduler: Missing parents: List()
25/01/12 23:13:07 INFO DAGScheduler: Submitting ResultStage 3 (PythonRDD[7] at collect at /home/g2015samtaylor/airflow/git_directory/pyspark_local/ICEF_Illuminate/modules/assessments_endpoints.py:226), which has no missing parents
25/01/12 23:13:07 INFO MemoryStore: Block broadcast_3 stored as values in memory (esti

12-Jan-25 23:13:20 - Assessment results fetched and processed.
12-Jan-25 23:13:20 - Sending data for 24-25 school year
12-Jan-25 23:13:20 - assessment_results_group.csv saved to /home/g2015samtaylor/illuminate
12-Jan-25 23:13:20 - assessment_results_combined.csv saved to /home/g2015samtaylor/illuminate
12-Jan-25 23:13:20 - illuminate_assessment_results.csv saved to /home/g2015samtaylor/views
12-Jan-25 23:13:20 - assessments_metadata,csv saved to /home/g2015samtaylor/illuminate


In [None]:
# Write an empty DataFrame to a test CSV inside the illuminate output folder.
new_file = '/home/g2015samtaylor/illuminate/test_file.csv'
df = pd.DataFrame()
df.to_csv(path_or_buf=new_file)


In [155]:

import pandas as pd
from modules.frame_transformations import *
pd.set_option('display.max_colwidth', None)

# Load the historical column-fixes CSV and the historical results view from disk.
fixes = pd.read_csv(filepath_or_buffer='/home/g2015samtaylor/airflow/git_directory/Illuminate/illuminate_historical_column_fixes_2324.csv')
v = pd.read_csv(filepath_or_buffer='/home/g2015samtaylor/views/illuminate_assessment_results_historical.csv')

# Bring back Jenny's produced excel frame merged with the master assessments frame.
access_token, expires_in = get_access_token()
temp, assessments_df = merge_excel_with_assessments_master_on_title(access_token)

# 'right_only' rows are titles that exist strictly in the produced excel file.
# 'both' rows are titles present in the excel file and the assessments endpoint.
merge_flag = temp['_merge']
missing = temp[merge_flag == 'right_only']
present = temp[merge_flag == 'both']


In [None]:
# Last four matched titles with their current and updated grade / curriculum.
present_cols = ['title', 'current grade', 'updated grade', 'curriculum', 'updated curriculum']
display(present[present_cols].tail(4))  # 130 total titles present

print('Here is the view below with the title present')
# Case-insensitive lookup of one example title in the historical view.
title_mask = v['title'].str.contains('Kinder - IM Unit 5 Checkpoint A', case=False)
v.loc[title_mask, ['title', 'grade', 'curriculum']].drop_duplicates()

In [None]:
# Titles that exist only in the produced excel file.
display(missing[['title']])

print('Teseting if partial string is present')
# Case-insensitive partial match against the titles from the assessments endpoint.
partial_mask = assessments_df['title'].str.contains('1st Grade IM Unit', case=False)
assessments_df.loc[partial_mask, 'title']

#Looks like 132 of the titles are completely missing from Illuminate when I pull from their assessments endpoint.
#These are titles that originate from the results-20250106-085115 excel file.

#Without any matching title, or matching assessment_id I am unsure of what to do with these moving forward. 

# A couple of questions: where did you get these title names from? And can you obtain the assessment ids for them?

#The strings matching or assessment_id matching is possible when the titles are present

In [None]:
# Case-insensitive search of the endpoint titles for 'Kinder - IM Unit 7'.
unit_7_mask = assessments_df['title'].str.contains('Kinder - IM Unit 7', case=False)
assessments_df.loc[unit_7_mask]
#Missing Kinder - IM Unit 6 Checkpoint B

In [None]:
# Case-insensitive search of the endpoint titles for 'Kinder - IM Unit 8'.
unit_8_mask = assessments_df['title'].str.contains('Kinder - IM Unit 8', case=False)
assessments_df.loc[unit_8_mask]
#Missing end of unit assessment

# Re-testing and double checking get_all_assessments_metadata()

In [136]:
# https://demo.illuminateed.com/live/rest_server.php/Api/Assessments/

# Manual request for a single page of the Assessments endpoint, used to inspect
# the raw response (no pagination loop here).

# Set the initial page and an empty DataFrame to store all results
page = 1
all_results = pd.DataFrame()

# Base URL and headers for API requests
url_ext = f'Assessments/?page={page}&limit=1000'
request_url = base_url_illuminate + url_ext
headers = {"Authorization": f"Bearer {access_token}"}

logging.info(f'Fetching data from {request_url}')

# Make the API request with the current page number.
# The f-string above already produced the final URL, so it is used as-is (a second
# str.format pass would choke on any literal brace). The timeout keeps the cell
# from hanging indefinitely if the server never answers.
response = requests.get(request_url, headers=headers, timeout=60)

#From raw request here are the outputs
# 'page': 1,
#  'num_pages': 2,
#  'num_results': 1126,

In [None]:
# Pull a fresh token and the full assessments metadata.
access_token, expires_in = get_access_token()
assessments_df, assessment_id_list = get_all_assessments_metadata(access_token)

# Check that every assessment reported by the endpoint (num_results) was grabbed.
expected_assessment_count = 1126
unique_assessment_count = assessments_df['assessment_id'].nunique()
unique_assessment_count == expected_assessment_count

# Efforts


#Based on complete string matching of the title column between the results-20250106-085115 excel file & all of the assessments found in the views db.

#When merging with illuminate_assessments_results, there are 160 titles that are present in the illuminate_assessments_results table & 132 that are not found.

#Because these are strictly in the produced excel file, string matching can occur on the title.

------------------------------------------
# Example - because titles with IM have multiple grades, I cannot apply a single updated grade to all of them based on IM being in the title
# However it will work for updated curriculum. 
