In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive (Colab helper) so the CSV paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Display settings: show every column and print floats with two decimal places.
pd.set_option('display.max_columns', None)
pd.options.display.precision = 2

# Master school list; only its identifying columns are kept.
header_columns = ['District Code', 'District', 'School Code', 'School', 'Level']
df = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/All Years/df_header.csv')
df_header = df.loc[:, header_columns]

# School codes present in the header; the rename helpers filter on this.
school_code = df_header.loc[:, 'School Code']

In [None]:
# Column-renaming helpers.
# The yearly score files use different source column names, so columns are
# renamed by position. Rows whose school code is not in the header list
# (`school_code`) are dropped.

def _rename_and_filter(df, columns, end_year, valid_codes=None):
  """Return a renamed, year-stamped, filtered copy of ``df``.

  Args:
    df: Raw score frame whose columns are already in the order of ``columns``.
    columns: New column names, applied positionally. Must include
      'End Year' and 'School Code'.
    end_year: Value written to every row of the 'End Year' column.
    valid_codes: Iterable of school codes to keep. Defaults to the
      module-level ``school_code`` Series when omitted.

  Returns:
    A new DataFrame with a fresh 0..n-1 index. The input frame is left
    untouched.

  Raises:
    ValueError: If ``len(columns)`` does not match the number of columns in
      ``df`` (raised by pandas when the column labels are assigned).
  """
  # Resolve the default at call time so the notebook global is only needed
  # when the caller does not supply its own codes.
  codes = school_code if valid_codes is None else valid_codes

  # Work on a copy: assigning columns on `df` directly would silently rename
  # the caller's frame as a side effect.
  renamed = df.copy()
  renamed.columns = columns
  renamed['End Year'] = end_year

  keep = renamed['School Code'].isin(codes)
  return renamed[keep].reset_index(drop=True)

def rename_five_cols(df, end_year, valid_codes=None):
  """Standardize a five-column score frame (with a 'Classification' column).

  Columns are renamed positionally to
  ['End Year', 'School Code', 'Proficiency Score', 'Classification', 'Level'],
  'End Year' is overwritten with ``end_year``, and only rows whose
  'School Code' is in ``valid_codes`` (default: ``school_code``) are returned.
  """
  columns = ['End Year', 'School Code', 'Proficiency Score', 'Classification', 'Level']
  return _rename_and_filter(df, columns, end_year, valid_codes)

def rename_four_cols(df, end_year, valid_codes=None):
  """Standardize a four-column score frame (no 'Classification' column).

  Columns are renamed positionally to
  ['End Year', 'School Code', 'Proficiency Score', 'Level'],
  'End Year' is overwritten with ``end_year``, and only rows whose
  'School Code' is in ``valid_codes`` (default: ``school_code``) are returned.
  """
  columns = ['End Year', 'School Code', 'Proficiency Score', 'Level']
  return _rename_and_filter(df, columns, end_year, valid_codes)

# Import test score data

In [None]:
# The 2012-2016 accountability profiles all expose the same five source columns,
# so they are read and standardized in one loop.
profile_columns = ['SCH_YEAR', 'SCH_CD', 'OVERALL_SCORE', 'CLASSIFICATION', 'CONTENT_LEVEL']
profile_paths = {
  2012: '/content/drive/MyDrive/Colab Data/KY EPSB/2011-2012/ACCOUNTABILITY_PROFILE-2012.csv',  # total points available
  2013: '/content/drive/MyDrive/Colab Data/KY EPSB/2012-2013/ACCOUNTABILITY_PROFILE-2013.csv',  # total points available
  2014: '/content/drive/MyDrive/Colab Data/KY EPSB/2013-2014/ACCOUNTABILITY_PROFILE-2014.csv',
  2015: '/content/drive/MyDrive/Colab Data/KY EPSB/2014-2015/ACCOUNTABILITY_PROFILE-2015.csv',
  2016: '/content/drive/MyDrive/Colab Data/KY EPSB/2015-2016/ACCOUNTABILITY_PROFILE-2016.csv',
}

profile_scores = {}
for profile_year, profile_path in profile_paths.items():
  raw_profile = pd.read_csv(profile_path).loc[:, profile_columns]
  profile_scores[profile_year] = rename_five_cols(raw_profile, profile_year)

# Keep the per-year names that the concat cell expects.
score_2012 = profile_scores[2012]
score_2013 = profile_scores[2013]
score_2014 = profile_scores[2014]
score_2015 = profile_scores[2015]
score_2016 = profile_scores[2016]

In [None]:
# Only four columns are selected from the 2017 and 2018 summary files
# (no classification/rating column), so the four-column helper is used.
cols_2017 = ['SCH_YEAR', 'SCH_CD', 'ACHIEVEMENT_POINTS', 'CONTENT_LEVEL']
cols_2018 = ['SCH_YEAR', 'SCH_CD', 'PROFICIENCY_RATE', 'LEVEL']  # All Null values for Proficiency Rating

summary_2017 = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/2016-2017/ACCOUNTABILITY_SUMMARY-2017.csv').loc[:, cols_2017]
summary_2018 = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/2017-2018/ACCOUNTABILITY_SUMMARY-2018.csv').loc[:, cols_2018]

score_2017 = rename_four_cols(summary_2017, 2017)
score_2018 = rename_four_cols(summary_2018, 2018)


In [None]:
# 2019, 2022 and 2023 profiles: a rate column plus a rating column are selected,
# so the five-column helper is used. Source column names differ per year.
cols_2019 = ['SCH_YEAR', 'SCH_CD', 'PROFICIENCY_RATE', 'PROFICIENCY_RATING', 'LEVEL']  # PROFICIENCY rate available
cols_2022 = ['SCHOOL YEAR','SCHOOL CODE', 'OVERALL INDICATOR RATE', 'OVERALL INDICATOR RATING', 'LEVEL']  # PROFICIENCY scores NOT available
cols_2023 = ['SCHOOL YEAR','SCHOOL CODE', 'OVERALL COMBINED INDICATOR RATE', 'OVERALL INDICATOR RATING', 'LEVEL']

profile_2019 = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/2018-2019/ACCOUNTABILITY_PROFILE-2019.csv').loc[:, cols_2019]
profile_2022 = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/2021-2022/accountability_profile_2022.csv').loc[:, cols_2022]
profile_2023 = pd.read_csv('/content/drive/MyDrive/Colab Data/KY EPSB/2022-2023/accountability_profile_2023.csv').loc[:, cols_2023]

score_2019 = rename_five_cols(profile_2019, 2019)
score_2022 = rename_five_cols(profile_2022, 2022)
score_2023 = rename_five_cols(profile_2023, 2023)

# Concat and standardize values

In [None]:
# Stack the yearly frames row-wise under one fresh 0..n-1 index.
# The four-column years (2017, 2018) get NaN in 'Classification'.
yearly_scores = [
    score_2012, score_2013, score_2014, score_2015, score_2016,
    score_2017, score_2018, score_2019, score_2022, score_2023,
]
all_scores = pd.concat(yearly_scores, axis=0, ignore_index=True)

In [None]:
# Standardize 'Level' and 'Classification' values.
#
# Each result is assigned back to its column. Calling `.replace(..., inplace=True)`
# on `all_scores[col]` is chained assignment: it warns on recent pandas and
# does nothing under Copy-on-Write, leaving the column unchanged.
all_scores['Level'] = all_scores['Level'].replace(
    ['Elementary School', 'Middle School', 'High School'], ['ES', 'MS', 'HS'])

classification = all_scores['Classification']

# Shift numeric 1-5 values to 0-4. This must run before the text ratings are
# mapped to 0-4 below, otherwise those freshly mapped codes would be shifted too.
classification = classification.replace([1,2,3,4,5], [0,1,2,3,4])

# Drop the '/Progressing' suffix so each classification has a single label.
classification = classification.replace(
    ['Needs Improvement/Progressing', 'Proficient/Progressing', 'Distinguished/Progressing'],
    ['Needs Improvement', 'Proficient', 'Distinguished'])

# Map text ratings to the same 0-4 codes.
classification = classification.replace(
    ['Very Low', 'Low', 'Medium', 'High', 'Very High'], [0,1,2,3,4])

all_scores['Classification'] = classification

In [None]:
# Split the mixed 'Classification' column of all_scores in two:
#   'Rating Code'    keeps the numeric codes (0-4); the text classifications become NaN.
#   'Classification' keeps the text classifications; the numeric codes become NaN.
#
# Results are assigned back instead of using `inplace=True` on a column selection
# (chained assignment, a no-op under pandas Copy-on-Write), and `np.nan` replaces
# the `np.NaN` alias that NumPy 2.0 removed.
all_scores['Rating Code'] = all_scores['Classification'].replace(
    ['Needs Improvement', 'Proficient', 'Distinguished'], np.nan)
all_scores['Classification'] = all_scores['Classification'].replace([0,1,2,3,4], np.nan)


In [None]:
# Derive the companion columns: a numeric code for each text classification
# and a text label for each numeric rating code.
classification_labels = ['Needs Improvement', 'Proficient', 'Distinguished']
rating_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

classification_codes = all_scores['Classification'].replace(classification_labels, [0, 1, 2])
rating_names = all_scores['Rating Code'].replace([0,1,2,3,4], rating_labels)

all_scores['Classification Code'] = classification_codes
all_scores['Rating'] = rating_names


# Merge with header

In [None]:
# Attach district/school names to every score row, order the rows, and add
# integer codes for year (offset from 2012) and level (ES=0, MS=1, HS=2).
df_scores = (
    df_header
    .merge(all_scores, on=['School Code', 'Level'], how='inner')
    .sort_values(by=['End Year', 'District', 'School'], ascending=True)
    .assign(**{
        'End Year Code': lambda frame: frame['End Year'] - 2012,
        'Level Code': lambda frame: frame['Level'].replace(['ES', 'MS', 'HS'], [0, 1, 2]),
    })
    .reset_index(drop=True)
)

In [None]:
# Final column layout: descriptive columns sit next to their code columns.
reordered_columns = [
    'End Year',
    'End Year Code',
    'District',
    'District Code',
    'School',
    'School Code',
    'Level',
    'Level Code',
    'Proficiency Score',
    'Classification',
    'Classification Code',
    'Rating',
    'Rating Code',
]
df_scores = df_scores.loc[:, reordered_columns]


In [None]:
# Round scores to whole numbers, then store them in pandas' nullable integer dtype.
rounded_scores = df_scores['Proficiency Score'].round(0)
df_scores['Proficiency Score'] = rounded_scores.astype('Int64')

In [None]:
# Sanity check: show every row that shares its (year, school, level) key with another row.
# An empty result means the key is unique.
key_columns = ['End Year', 'School Code', 'Level']
is_duplicate = df_scores.duplicated(subset=key_columns, keep=False)
df_scores[is_duplicate]

Unnamed: 0,End Year,End Year Code,District,District Code,School,School Code,Level,Level Code,Proficiency Score,Classification,Classification Code,Rating,Rating Code


In [None]:
# Write the combined scores to Drive without the index column.
scores_csv_path = '/content/drive/MyDrive/Colab Data/KY EPSB/All Years/Unpublished/df_scores.csv'
df_scores.to_csv(scores_csv_path, index=False)

In [None]:
# Preview the final frame (the rendered output is truncated to the first and last rows).
df_scores

Unnamed: 0,End Year,End Year Code,District,District Code,School,School Code,Level,Level Code,Proficiency Score,Classification,Classification Code,Rating,Rating Code
0,2012,0,Adair County,1,Adair County Elementary School,1016,ES,0,63,Proficient,1.0,,
1,2012,0,Adair County,1,Adair County High School,1010,HS,2,55,Needs Improvement,0.0,,
2,2012,0,Adair County,1,Adair County Middle School,1014,MS,1,48,Needs Improvement,0.0,,
3,2012,0,Adair County,1,John Adair Intermediate School,1013,ES,0,69,Proficient,1.0,,
4,2012,0,Allen County,5,Allen County Intermediate Center,5060,ES,0,56,Needs Improvement,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12718,2023,11,Woodford County,601,Northside Elementary School,601120,ES,0,60,,,Medium,2.0
12719,2023,11,Woodford County,601,Simmons Elementary School,601075,ES,0,60,,,Medium,2.0
12720,2023,11,Woodford County,601,Southside Elementary School,601050,ES,0,70,,,Medium,2.0
12721,2023,11,Woodford County,601,Woodford County High School,601084,HS,2,62,,,Medium,2.0


In [None]:
# Column dtypes and non-null counts of the final frame.
df_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12723 entries, 0 to 12722
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   End Year             12723 non-null  int64  
 1   End Year Code        12723 non-null  int64  
 2   District             12723 non-null  object 
 3   District Code        12723 non-null  int64  
 4   School               12723 non-null  object 
 5   School Code          12723 non-null  int64  
 6   Level                12723 non-null  object 
 7   Level Code           12723 non-null  int64  
 8   Proficiency Score    12723 non-null  Int64  
 9   Classification       6395 non-null   object 
 10  Classification Code  6395 non-null   float64
 11  Rating               3794 non-null   object 
 12  Rating Code          3794 non-null   float64
dtypes: Int64(1), float64(2), int64(5), object(5)
memory usage: 1.3+ MB


In [None]:
# Number of distinct values in each column of the final frame.
df_scores.nunique()

End Year                 10
End Year Code            10
District                174
District Code           174
School                 1223
School Code            1249
Level                     3
Level Code                3
Proficiency Score       101
Classification            3
Classification Code       3
Rating                    5
Rating Code               5
dtype: int64