In [1]:
'''Clear out temporary variables and get a clean session'''
try:
    from IPython import get_ipython
    get_ipython().magic('clear')
    get_ipython().magic('reset -f')
except:
    pass




### Data preparation
The data in this dataset come from a real-world admissions board for a large high school district.  The district runs several specialized programs for which they run admissions.  As such, the data is taken from spreadsheets created over the course of 10 years.  As such, column names, admissions criteria used and data maintained over the years has changed. This initial data preparation involves getting the original data into a consistent format.  There will be no data manipulation or analysis conducted at this stage.

In [2]:
'''Import necessary libraries'''
import pandas as pd
import numpy as np
import os
import glob
import re

import missingno as msno
import pickle
import bz2

import sklearn
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import set_config
#from sklearn import missingpy

%matplotlib inline

In [None]:
'''Import all csv files in active folder and create dictionary of dataframes'''
csv_files = glob.glob('*.csv')

var_dict = {}

for file in csv_files:
    key = 'df_'+str(re.split('.csv', file)[0])
    value = pd.read_csv(file)
    var_dict[key] = value

In [None]:
'''Update global variable table and create individual dataframes'''
'''globals().update(var_dict)

df_list = []
for key, value in var_dict.items():
    print(key,value)'''

In [21]:
'''List of all dataframes'''
var_dict.keys()

dict_keys(['df_2008_2009_ib_magnet', 'df_2009_2010_ib', 'df_2009_2010_magnet', 'df_2010-2011_ib', 'df_2010-2011_magnet', 'df_2011_2012_in_district', 'df_2011_2012_out_district', 'df_2012_2013_in_district', 'df_2012_2013_out_district', 'df_2013_2014_in_district', 'df_2013_2014_out_district', 'df_2014_2015_in_district', 'df_2014_2015_out_district', 'df_2015_2016_in_district', 'df_2015_2016_out_district', 'df_2016_2017_in_district', 'df_2016_2017_out_district', 'df_2017_2018_in_district', 'df_2017_2018_out_district', 'df_2018_2019_in_district', 'df_2018_2019_out_district', 'df_2019_2020_in_district', 'df_2019_2020_out_district', 'df_2020_2021_in_district', 'df_2020_2021_out_district', 'df_2021_2022_in_district', 'df_2021_2022_out_district'])

In [None]:
for key, item in var_dict.items():
    print(key, item.columns)

In [None]:
'''List of columns to drop from original files'''
drop_columns = ['Record Created Date', '#', 'Sibling', 'NYr Grade', 'Also GHS', 'Tier', 'DOB', 'Address', 'City', 'State',\
                'ZIP', 'Home Phone', 'Work/Cell Phone', 'Parent email', 'ID', 'District of Residence', 'Sending District',\
               'Current School', 'Middle School', 'Current School', 'Current Grade', 'Grade Level', 'OHS', 'Academy?',\
               'Academy', 'Math Course', 'Gender', 'G', 'Foreign Language', 'Length', 'IEP?', 'IEP/ISP', 'IEP', '504?',\
               '504', '7th Grade Math', '7th Grade Sci', '7th Grade Eng', '7th Grade SS', '7th Grade WL', '8th Grade Math',\
               '8th Grade Sci', '8th Grade Eng', '8th Grade SS', '8th Grade WL', 'Transcripts', 'Transcript', 'Transcript?',\
               'Attendance?', 'Attendance', 'Discipline?', 'Discipline', 'Tests?', 'Recommendations', 'Recs', 'Rec',\
               'Notes', 'Notes', 'Notes.1', 'Notes.2' 'Math/Sci Rec', 'Math/Sci Adv Course', 'Eng/SS Adv Course', 'Adv Courses',\
               'Waitlist #', 'WL #', 'Withdrew  in first year', 'Withdrew', 'Reason', 'Acc Sp.', 'Acc Sp Gr', 'GPA 1st year',\
               'Prediction', 'Student email', 'Requested HS', 'Sending Supt.', 'Preference', 'Info Release consent',\
               'Tuition Code', 'Reception Letter Sent', 'Reception RSVP', 'Testing Number', 'Ethnic', 'Unnamed: 49', \
               'Unnamed: 53', 'Unnamed: 52', 'Prospect Street', 'Prospect City', 'Prospect State', 'Prospect Zip', \
               'Resident District', 'Filler', 'MCST Academy', 'P.I.Score', 'PI Score', 'PIScore', 'AX Offer?.1', 'Car Pooling info',\
               'Unnamed: 59', 'Unnamed: 60', 'Eng/SS Rec', 'Math/Sci Rec', 'Math/Sci Adv', 'Eng/SS Adv', 'Adv. Courses', \
               'Parent/Guardian Name']

drop_columns = [column.lower() for column in drop_columns]

In [None]:
'''Drop unnecessary columns from original data'''
for key, item in var_dict.items():
    for column in item.columns:
        column_lower = column.strip().lower()
        if column_lower in drop_columns:
            item.drop(column, axis=1, inplace = True)
    print(key, item.columns)

In [None]:
'''Check for duplicate columns'''
columns_found = 0
for key, item in var_dict.items():
    if len(item.columns) != len(set(item.columns)):
        print(key, len(item.columns), len(set(item.columns)))
        columns_found += 1
if columns_found == 0:
    print('No duplicate columns found')

In [None]:
'''Dictionary to standardize column names'''
rename_dict = {'CL': 'program',
               'Program': 'program',
               'L.N': 'last_name', 'Last Name': 'last_name',
               'F.N': 'first_name', 'First Name': 'first_name',
               'ES': 'essay_raw', 'WP': 'essay_raw', 'ACC Score Raw': 'essay_raw',
               'ESC': 'essay_converted', 'WPC': 'essay_converted', 'ACC C': 'essay_converted', 
               'ESW': 'essay_weighted', 'WPW': 'essay_weighted', 'ACC W': 'essay_weighted',
               'ME': 'math_raw', 
               'AR': 'arithmetic_raw',
               'MEC': 'math_converted', 
               'ARC': 'arithmetic_converted', 
               'MEW': 'math_weighted', 
               'ARW': 'arithmetic_weighted', 
               'GP': 'gpa_raw', 'GPA': 'gpa_raw',
               'GPC': 'gpa_converted', 'GPAC': 'gpa_converted',
               'GPW': 'gpa_weighted', 'GPAW': 'gpa_weighted', 
               'STE': 'standard_ela',
               'STEC': 'standard_ela_converted',
               'STEW': 'standard_ela_weighted',
               'STM':  'standard_math',
               'TOTAL': 'standard_total',  
               'STC': 'standard_total_converted', 'STMC': 'standard_total_converted', 
               'STW': 'standard_total_weighted', 'STMW': 'standard_total_weighted', 
               'RD': 'reading_score_raw',
               'RDC': 'reading_score_converted',
               'RDW': 'reading_score_weighted',
               'EA': 'qas', 'QAS': 'qas', 'QAS ': 'qas',
               'EAC': 'qas_converted', 'QASC': 'qas_converted',
               'EAW': 'qas_weighted', 'QASW': 'qas_weighted',         
               'RC': 'recommend', 'Rec': 'recommend', 
               'RCC': 'recommend_converted', 
               'RCW': 'recommend_weighted', 'RecW': 'recommend_weighted', 'RECW': 'recommend_weighted',
               'IN': 'interview', 
               'INC': 'interview_converted', 
               'INW' : 'interview_weighted', 
               'Admit Score': 'admit_score', 'Total': 'admit_score',
               'Decision': 'admit_decision', 'Admit Status': 'admit_decision', 'Accepted Offer?': 'admit_decision', 'Status': 'admit_decision',
               'Accepted Offer?': "admit_offer_accepted", 'AX Offer?': "admit_offer_accepted", 'Attending?': "admit_offer_accepted"
}

In [None]:
'''Standardize Column Names'''
drop_names = ['last_name', 'first_name', 'Name']

for key,item in var_dict.items():
    item.rename(columns = rename_dict, inplace = True)
    for column in item.columns:
        if column in drop_names:
            item.drop(column, axis = 1, inplace = True)
    print(key,":", item.columns)

In [None]:
'''Check for duplicate columns'''
columns_found = 0
for key, item in var_dict.items():
    if len(item.columns) != len(set(item.columns)):
        print(key, len(item.columns), len(set(item.columns)))
        columns_found += 1
if columns_found == 0:
    print('No duplicate columns found')

In [None]:
'''Find unique values in each column'''
unique_check = ['program', 'admit_decision', 'admit_offer_accepted']

for key,item in var_dict.items():
    print("\n",key)
    for column in item:
        if column in unique_check:
            print(column, item[column].unique())

In [None]:
'''Drop empty and non-IB/magnet rows'''
for key,item in var_dict.items():
    if len(item.columns) > 20:
        item.dropna(axis = 0, thresh = 12, inplace = True)
    else: item.dropna(axis = 0, thresh = 5, inplace = True)
    item.dropna(axis = 0, subset = ['program'], inplace=True)
    item = item[item['program'] != 'Regular']
    print(key, item)

In [None]:
'''Correct program in certain dataframes'''
ib_list = ['df_2009_2010_ib', 'df_2010-2011_ib']
magnet_list = ['df_2009_2010_magnet', 'df_2010-2011_magnet']

for key,item in var_dict.items():
    if key in ib_list:
        item['program'] = 'IB'
    if key in magnet_list:
        item['program'] = 'Magnet'

In [None]:
'''Find unique values in each column'''
unique_check = ['program', 'admit_decision', 'admit_offer_accepted']

for key,item in var_dict.items():
    print("\n",key)
    for column in item:
        if column in unique_check:
            print(column, item[column].unique())

In [None]:
'''Standardize data'''
for key,item in var_dict.items():
    item['admit_decision'] = item['admit_decision'].apply(lambda x: 'ax' if 'ax' in str(x).lower() else 'rj'if ('rj' or 'ret' or 'df') in \
                                                          str(x).lower() else 'wd' if ('wd'or'w/d') in str(x).lower() else 'rj')
    item['admit_offer_accepted'] = item['admit_offer_accepted'].apply(lambda x: 'yes' if 'y' in str(x).lower() else 'no'if ('no' or 'n') in \
                                                          str(x).lower() else 'no')

In [None]:
'''Print cleaned and standardized dataframes'''
for key,item in var_dict.items():
    print(key, item.info())

### Data cleaning
At this point, the data is standardized and contains only the observations of interest.  It is now time to clean the data.  This will involve dealing with NaN values and standardizing the data.

In [7]:
'''Create list with only numeric variables'''
def select_numeric (df):
    return df.select_dtypes(include="number").columns.tolist()

In [8]:
'''Create list with only object variables'''
def select_object (df):
    return df.select_dtypes(include="object").columns.tolist()

In [9]:
'''Create list with only categorical variables'''
def select_categorical (df):
    return df.select_dtypes(include="category").columns.tolist()

In [None]:
'''Check for missing data'''
for key,item in var_dict.items():
    num_cols = []
    cat_cols = []
    num_cols = select_numeric(item)
    print(key, 'num_cols:', num_cols)
    item[num_cols] = item[num_cols].astype('float64')
    cat_cols = select_object(item)
    item[cat_cols] = item[cat_cols].astype('category')
    print(key, item.info())
    num_cols = []
    cat_cols = []

In [None]:
'''Visualize missing data'''
for key,item in var_dict.items():
    msno.matrix(item.sort_values('admit_decision'))

Given the low rate and nature of missingness, we are going to utilize MissForest imputation to fill missing values.  However, in order to prevent data leakage, this will be conducted within a pipeline when we are ready to build our model.

### In the following cells, the missing data in each dataframe will be imputed and standardized separately. This will be done since different dependent measures were utilized across years.  Each year will be imputed and standarized separately and then the dataframes will be concatanated.

In [None]:
'''Pickle datafile'''
with open('mib_cleaned_data.pkl', 'wb') as f:
    pickle.dump(var_dict, f)

In [3]:
'''Open pickled file'''
with open('mib_cleaned_data.pkl', 'rb') as f:
    var_dict = pickle.load(f)

In [None]:
''''''
column_list_complete = []
for key, item in var_dict.items():
    for column in item.columns.to_list():
        if column not in column_list_complete:
            column_list_complete.append(column)

for key, item in var_dict.items():
    difference_columns = [column for column in column_list_complete if column not in item.columns]
    for column in difference_columns:
        item[column] = np.nan
    print(key, item)
            

In [4]:
'''Transformer for numerical features'''
num_pipe = Pipeline(
    [
        ('imputer_num', KNNImputer()),
        ('scaler', StandardScaler())
    ]
)

In [5]:
'''Transformer for categorical features'''
cat_pipe = Pipeline(
    [
        ('imputer_cat', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False)) 
    ]
)

In [15]:
num_cols = []
print(num_cols)
cat_cols = []
#cat_cols.remove('program')
print (cat_cols)        

'''Combine transformers into ColumnTransformer'''
preprocessor = ColumnTransformer(
    [
        ('categorical', cat_pipe, cat_cols),
        ('numerical', num_pipe, num_cols)
    ],
    remainder = 'passthrough',
)

[]
[]


In [17]:
for key,item in var_dict.items():
    df = (var_dict[key])
    print(key)
    num_cols = select_numeric(df)
    #print('num_cols:', num_cols, '\n')
    cat_cols = select_categorical(df)
    if 'program' not in cat_cols:
        X=df
        print('program_no', X.columns, len(X.columns))
    if 'program' in cat_cols:
        cat_cols.remove('program')
        print('cat_cols:', cat_cols, '\n')
        X = df.drop('program', axis = 1)
        print('program_yes', X.columns, len(X.columns))

    num_cols = select_numeric(item)
    print(num_cols)
    cat_cols = select_categorical(item)
    cat_cols.remove('program')
    print (cat_cols)        
        
    '''Combine transformers into ColumnTransformer'''
    preprocessor = ColumnTransformer(
        [
            ('categorical', cat_pipe, cat_cols),
            ('numerical', num_pipe, num_cols)
        ],
        remainder = 'passthrough',
    )

    
    '''Fit Model'''
    print('X-columns:', X.columns, len(X.columns), '\n')
    preprocessor.fit(X)

    '''Prepare column names'''
    cat_columns = preprocessor.named_transformers_['categorical']['one_hot'].get_feature_names_out(cat_cols)
    print('cat_columns:', cat_columns, len(cat_columns), '\n')
    print('num_columns:', num_cols, len(num_cols), '\n')
    columns = np.append(cat_columns, num_cols)
    print('columns:', columns, len(columns), '\n')

    
    preprocess_trans = preprocessor.transform(X)
    preprocess_trans_cols = preprocessor.named_transformers_['categorical']['one_hot'].get_feature_names_out(cat_cols)
    print(preprocess_trans_cols, len(preprocess_trans_cols))
    print(preprocess_trans[0], len(preprocess_trans[0]))
    df_trans = pd.DataFrame(preprocessor.transform(X), columns = columns)
    var_dict[key] = df_trans
    cat_cols = []
    num_cols = []
    columns = []

df_2008_2009_ib_magnet
cat_cols: ['admit_decision', 'admit_offer_accepted'] 

program_yes Index(['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw',
       'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted',
       'gpa_weighted', 'standard_ela', 'standard_math', 'standard_total',
       'standard_total_converted', 'standard_total_weighted', 'recommend',
       'recommend_converted', 'recommend_weighted', 'interview',
       'interview_converted', 'interview_weighted', 'admit_score',
       'admit_decision', 'admit_offer_accepted'],
      dtype='object') 23
['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw', 'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted', 'gpa_weighted', 'standard_ela', 'standard_math', 'standard_total', 'standard_total_converted', 'standard_total_weighted', 'recommend', 'recommend_converted', 'recommend_weighted', 'interview', 'interview_converted', 'interview_weighted', 'admit_score']
['admit_decision', 'admit_offer_acce

['admit_score', 'admit_decision', 'admit_offer_accepted']
X-columns: Index(['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw',
       'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted',
       'gpa_weighted', 'standard_ela', 'standard_math', 'standard_total',
       'standard_total_converted', 'standard_total_weighted', 'recommend',
       'recommend_converted', 'recommend_weighted', 'interview',
       'interview_converted', 'interview_weighted', 'admit_score',
       'admit_decision', 'admit_offer_accepted'],
      dtype='object') 23 

cat_columns: ['admit_score_39.0' 'admit_score_40.4' 'admit_score_40.7'
 'admit_score_47.6' 'admit_score_51.1' 'admit_score_51.5'
 'admit_score_52.0' 'admit_score_53.7' 'admit_score_54.2'
 'admit_score_55.3' 'admit_score_56.1' 'admit_score_56.3'
 'admit_score_56.5' 'admit_score_57.3' 'admit_score_57.7'
 'admit_score_57.9' 'admit_score_58.4' 'admit_score_58.6'
 'admit_score_58.9' 'admit_score_59.2' 'admit_score_59.8'
 'admit_score_6

df_2013_2014_out_district
cat_cols: ['admit_decision', 'admit_offer_accepted'] 

program_yes Index(['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw',
       'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted',
       'gpa_weighted', 'standard_ela', 'standard_ela_converted',
       'standard_ela_weighted', 'standard_math', 'standard_total_converted',
       'standard_total_weighted', 'recommend', 'recommend_converted',
       'recommend_weighted', 'interview', 'interview_converted',
       'interview_weighted', 'admit_score', 'admit_decision',
       'admit_offer_accepted'],
      dtype='object') 24
['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw', 'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted', 'gpa_weighted', 'standard_ela', 'standard_ela_converted', 'standard_ela_weighted', 'standard_math', 'standard_total_converted', 'standard_total_weighted', 'recommend', 'recommend_converted', 'recommend_weighted', 'interview', 'interview_converted

cat_columns: ['admit_decision_ax' 'admit_decision_rj' 'admit_offer_accepted_no'
 'admit_offer_accepted_yes'] 4 

num_columns: ['essay_raw', 'essay_converted', 'essay_weighted', 'math_raw', 'math_converted', 'math_weighted', 'gpa_raw', 'gpa_converted', 'gpa_weighted', 'reading_score_raw', 'reading_score_converted', 'reading_score_weighted', 'arithmetic_raw', 'arithmetic_converted', 'arithmetic_weighted', 'recommend', 'recommend_converted', 'recommend_weighted', 'interview', 'interview_converted', 'interview_weighted', 'admit_score'] 22 

columns: ['admit_decision_ax' 'admit_decision_rj' 'admit_offer_accepted_no'
 'admit_offer_accepted_yes' 'essay_raw' 'essay_converted' 'essay_weighted'
 'math_raw' 'math_converted' 'math_weighted' 'gpa_raw' 'gpa_converted'
 'gpa_weighted' 'reading_score_raw' 'reading_score_converted'
 'reading_score_weighted' 'arithmetic_raw' 'arithmetic_converted'
 'arithmetic_weighted' 'recommend' 'recommend_converted'
 'recommend_weighted' 'interview' 'interview_conve

cat_columns: ['admit_decision_ax' 'admit_decision_rj' 'admit_offer_accepted_no'
 'admit_offer_accepted_yes'] 4 

num_columns: ['gpa_raw', 'gpa_converted', 'gpa_weighted', 'qas', 'qas_converted', 'qas_weighted', 'essay_raw', 'essay_converted', 'essay_weighted', 'arithmetic_raw', 'arithmetic_converted', 'arithmetic_weighted', 'reading_score_raw', 'reading_score_converted', 'reading_score_weighted', 'recommend_weighted', 'admit_score'] 17 

columns: ['admit_decision_ax' 'admit_decision_rj' 'admit_offer_accepted_no'
 'admit_offer_accepted_yes' 'gpa_raw' 'gpa_converted' 'gpa_weighted' 'qas'
 'qas_converted' 'qas_weighted' 'essay_raw' 'essay_converted'
 'essay_weighted' 'arithmetic_raw' 'arithmetic_converted'
 'arithmetic_weighted' 'reading_score_raw' 'reading_score_converted'
 'reading_score_weighted' 'recommend_weighted' 'admit_score'] 21 

['admit_decision_ax' 'admit_decision_rj' 'admit_offer_accepted_no'
 'admit_offer_accepted_yes'] 4
[ 1.          0.          0.          1.          0.3

In [18]:
for key, item in var_dict.items():
    print(key, item)

df_2008_2009_ib_magnet     admit_decision_ax  admit_decision_rj  admit_decision_wd  \
0                 0.0                1.0                0.0   
1                 0.0                1.0                0.0   
2                 0.0                0.0                1.0   
3                 1.0                0.0                0.0   
4                 1.0                0.0                0.0   
..                ...                ...                ...   
83                1.0                0.0                0.0   
84                1.0                0.0                0.0   
85                1.0                0.0                0.0   
86                1.0                0.0                0.0   
87                1.0                0.0                0.0   

    admit_offer_accepted_no  admit_offer_accepted_yes  essay_raw  \
0                       1.0                       0.0  -0.099089   
1                       1.0                       0.0   0.708301   
2               

In [19]:
'''Pickle datafile'''
with open('mib_clean_agg_data.pkl', 'wb') as f:
    pickle.dump(var_dict, f)

In [20]:
'''Open pickled file'''
with open('mib_clean_agg_data.pkl', 'rb') as f:
    var_dict = pickle.load(f)

### Combining all dataframes into a single dataframe

In [None]:
column_list = var_dict['df_2008_2009_ib_magnet'].columns.tolist()
for key, item in var_dict.items():
    for column in item.columns:
        if column not in column_list:
            column_list.append(column)
print(column_list)

for key, item in var_dict.items():
    for column in item.columns:
        if item in intersection(item.columns, column_list):
            item[column] = np.nan
    

In [None]:
for key, item in var_dict.items():
    print(len(item.columns))

In [22]:
df1 = var_dict['df_2008_2009_ib_magnet']
df2 = var_dict['df_2009_2010_ib']

df1.head()

Unnamed: 0,admit_decision_ax,admit_decision_rj,admit_decision_wd,admit_offer_accepted_no,admit_offer_accepted_yes,essay_raw,essay_converted,essay_weighted,math_raw,math_converted,...,standard_total,standard_total_converted,standard_total_weighted,recommend,recommend_converted,recommend_weighted,interview,interview_converted,interview_weighted,admit_score
0,0.0,1.0,0.0,1.0,0.0,-0.099089,0.044269,-1.176804,-0.780948,-1.247101,...,-0.223847,0.227305,0.227305,-0.509563,-0.509563,-0.982368,-0.749739,-2.084892,-1.51381,-1.486848
1,0.0,1.0,0.0,1.0,0.0,0.708301,0.39842,-1.007128,-0.385228,-1.247101,...,1.039632,0.753694,0.753694,-0.35848,-0.35848,-0.887239,-0.566105,-2.084892,-1.51381,-1.570076
2,0.0,0.0,1.0,1.0,0.0,0.708301,0.39842,-1.007128,1.901155,1.403742,...,0.465323,0.227305,0.227305,0.92573,0.92573,-0.078643,-1.711631,-2.084892,-1.51381,0.952382
3,1.0,0.0,0.0,0.0,1.0,0.708301,0.39842,-1.007128,1.769249,1.403742,...,1.200438,0.753694,0.753694,0.850188,0.850188,-0.126208,-2.183833,-0.595683,-1.306277,1.195665
4,1.0,0.0,0.0,0.0,1.0,-0.906478,-0.309882,-1.346479,1.065746,0.873573,...,0.396406,0.227305,0.227305,0.472479,0.472479,-0.36403,-0.609828,0.297842,-1.181758,0.529838


In [55]:
keys_list = list(var_dict)

for key, item in var_dict.items():
        df1 = item
        next_index = (keys_list.index(key)+1)
        if next_index < len(keys_list):
            next_index_item = keys_list[next_index]
            df2 = var_dict[next_index_item]
            if keys_list.index(key) == 0:
                df_concat = pd.concat([df1,df2], axis=0, ignore_index=True)
            else: df_concat = pd.concat([df_concat, df2], axis=0, ignore_index=True)
df_concat

Unnamed: 0,admit_decision_ax,admit_decision_rj,admit_decision_wd,admit_offer_accepted_no,admit_offer_accepted_yes,essay_raw,essay_converted,essay_weighted,math_raw,math_converted,...,standard_ela_weighted,reading_score_raw,reading_score_converted,reading_score_weighted,arithmetic_raw,arithmetic_converted,arithmetic_weighted,qas,qas_converted,qas_weighted
0,0.0,1.0,0.0,1.0,0.0,-0.099089,0.044269,-1.176804,-0.780948,-1.247101,...,,,,,,,,,,
1,0.0,1.0,0.0,1.0,0.0,0.708301,0.398420,-1.007128,-0.385228,-1.247101,...,,,,,,,,,,
2,0.0,0.0,1.0,1.0,0.0,0.708301,0.398420,-1.007128,1.901155,1.403742,...,,,,,,,,,,
3,1.0,0.0,0.0,0.0,1.0,0.708301,0.398420,-1.007128,1.769249,1.403742,...,,,,,,,,,,
4,1.0,0.0,0.0,0.0,1.0,-0.906478,-0.309882,-1.346479,1.065746,0.873573,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,0.0,1.0,,1.0,0.0,0.430868,0.532001,0.532001,,,...,,,,,,,,,,
2626,0.0,1.0,,1.0,0.0,0.241290,0.532001,0.532001,,,...,,,,,,,,,,
2627,0.0,1.0,,1.0,0.0,0.170199,-0.343755,-0.343755,,,...,,,,,,,,,,
2628,0.0,1.0,,1.0,0.0,0.051713,-2.971022,-2.971022,,,...,,,,,,,,,,


['df_2008_2009_ib_magnet', 'df_2009_2010_ib', 'df_2009_2010_magnet', 'df_2010-2011_ib', 'df_2010-2011_magnet', 'df_2011_2012_in_district', 'df_2011_2012_out_district', 'df_2012_2013_in_district', 'df_2012_2013_out_district', 'df_2013_2014_in_district', 'df_2013_2014_out_district', 'df_2014_2015_in_district', 'df_2014_2015_out_district', 'df_2015_2016_in_district', 'df_2015_2016_out_district', 'df_2016_2017_in_district', 'df_2016_2017_out_district', 'df_2017_2018_in_district', 'df_2017_2018_out_district', 'df_2018_2019_in_district', 'df_2018_2019_out_district', 'df_2019_2020_in_district', 'df_2019_2020_out_district', 'df_2020_2021_in_district', 'df_2020_2021_out_district', 'df_2021_2022_in_district', 'df_2021_2022_out_district']
