In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from env import user, password, host
from scipy.stats import levene, ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
import math
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

In [2]:
df22 = pd.read_csv('DISTRICT_summary_22.csv')
df21 = pd.read_csv('DISTRICT_summary_21.csv')
df20 = pd.read_csv('DISTRICT_summary_20.csv')
df19 = pd.read_csv('DISTRICT_summary_19.csv')

In [3]:
df22

Unnamed: 0,AGGREGATION LEVEL,REGION,DISTNAME,DISTRICT,CHARTER_STATUS,SECTION,HEADING,HEADING NAME,count
0,DISTRICT SUMMARY,10,A W BROWN LEADERSHIP ACADEMY,57816,OPEN ENROLLMENT CHARTER,A-PARTICIPATION,A01,DISTRICT CUMULATIVE YEAR END ENROLLMENT,1177
1,DISTRICT SUMMARY,10,A W BROWN LEADERSHIP ACADEMY,57816,OPEN ENROLLMENT CHARTER,A-PARTICIPATION,A02,DISTRICT DISCIPLINE POPULATION,156
2,DISTRICT SUMMARY,10,A W BROWN LEADERSHIP ACADEMY,57816,OPEN ENROLLMENT CHARTER,A-PARTICIPATION,A03,DISTRICT DISCIPLINE RECORD COUNT,248
3,DISTRICT SUMMARY,10,A W BROWN LEADERSHIP ACADEMY,57816,OPEN ENROLLMENT CHARTER,B-DISCIPLINE DATA TRENDS,B10,COUNT OF STUDENTS SUSPENDED IN SCHOOL,45
4,DISTRICT SUMMARY,10,A W BROWN LEADERSHIP ACADEMY,57816,OPEN ENROLLMENT CHARTER,B-DISCIPLINE DATA TRENDS,B13,STUDENTS SUSPENDED OUT OF SCHOOL,132
...,...,...,...,...,...,...,...,...,...
66484,DISTRICT SUMMARY,15,ZEPHYR ISD,25906,TRADITIONAL ISD/CSD,V-AT RISK IN SCHOOL SUS.,F19,NON AT RISK IN SCHOOL SUSPENSIONS,-999
66485,DISTRICT SUMMARY,15,ZEPHYR ISD,25906,TRADITIONAL ISD/CSD,W-REASON INCIDENT COUNTS,G21,21-VIOLATED LOCAL CODE OF CONDUCT,-999
66486,DISTRICT SUMMARY,15,ZEPHYR ISD,25906,TRADITIONAL ISD/CSD,W-REASON INCIDENT COUNTS,G61,61-BULLYING  TEC 37.0052(B),-999
66487,DISTRICT SUMMARY,15,ZEPHYR ISD,25906,TRADITIONAL ISD/CSD,X-DISCIPLINE ACTION COUNTS,H05,05-OUT-OF-SCHOOL SUSPENSION,-999


In [4]:
def prep22(df):
    global df22
    df22=df22.rename(columns={'AGGREGATION LEVEL':'agg_level', 'REGION':'region', 'DISTNAME':'dist_name', 
                              'DISTRICT':'district_num', 'CHARTER_STATUS':'charter_status', 'SECTION':'section', 
                              'HEADING':'heading', 'HEADING NAME':'heading_name'})
    df22['charter_encoded'] = df22.charter_status.map({'OPEN ENROLLMENT CHARTER': 1, 'TRADITIONAL ISD/CSD':0})
    df22=df22[(df22.heading == 'A01') | (df22.heading ==  'A03')]
    df22 = df22[df22['count'] != -999]
    df22.dropna()
    df22=df22.drop_duplicates()
    df22pivot=df22.pivot(index='district_num', columns='heading_name', values= 'count').dropna()
    df22=df22.merge(df22pivot,how= 'right', on= 'district_num')
    df22=df22.drop(columns=['heading', 'heading_name','count', 'agg_level', 'region', 'district_num', 'section',
                            'charter_status'])
    df22=df22.rename(columns={'DISTRICT CUMULATIVE YEAR END ENROLLMENT': 'enrollment', 
                              'DISTRICT DISCIPLINE RECORD COUNT':'disciplined'})
    df22=df22.drop_duplicates()
    df22.dropna()
    df22=df22.reset_index(drop=True)
    df22['discipline_percent']= ((df22['disciplined']/df22['enrollment'])*100)
    df22=df22.round({'discipline_percent': 0})
    

In [5]:
prep22(df22)

In [6]:
df22

Unnamed: 0,dist_name,charter_encoded,enrollment,disciplined,discipline_percent
0,CAYUGA ISD,0,609.0,62.0,10.0
1,ELKHART ISD,0,1301.0,246.0,19.0
2,FRANKSTON ISD,0,868.0,204.0,24.0
3,NECHES ISD,0,367.0,69.0,19.0
4,PALESTINE ISD,0,3648.0,1031.0,28.0
...,...,...,...,...,...
1027,NEWCASTLE ISD,0,233.0,27.0,12.0
1028,OLNEY ISD,0,776.0,140.0,18.0
1029,ZAPATA COUNTY ISD,0,3520.0,550.0,16.0
1030,CRYSTAL CITY ISD,0,1882.0,486.0,26.0


In [7]:
def prep21(df):
    global df21
    df21=df21.rename(columns={'AGGREGATION LEVEL':'agg_level', 'REGION':'region', 'DISTNAME':'dist_name', 
                              'DISTRICT':'district_num', 'CHARTER_STATUS':'charter_status', 'SECTION':'section', 
                              'HEADING':'heading', 'HEADING NAME':'heading_name'})
    df21['charter_encoded'] = df21.charter_status.map({'OPEN ENROLLMENT CHARTER': 1, 'TRADITIONAL ISD/CSD':0})
    df21=df21[(df21.heading == 'A01') | (df21.heading ==  'A03')]
    df21 = df21[df21['count'] != -999]
    df21.dropna()
    df21=df21.drop_duplicates()
    df21pivot=df21.pivot(index='district_num', columns='heading_name', values= 'count').dropna()
    df21=df21.merge(df21pivot,how= 'right', on= 'district_num')
    df21=df21.drop(columns=['heading', 'heading_name','count', 'agg_level', 'region', 'district_num', 
                            'section','charter_status'])
    df21=df21.rename(columns={'DISTRICT CUMULATIVE YEAR END ENROLLMENT': 'enrollment', 
                              'DISTRICT DISCIPLINE RECORD COUNT':'disciplined'})
    df21=df21.drop_duplicates()
    df21.dropna()
    df21=df21.reset_index(drop=True)
    df21['discipline_percent']= ((df21['disciplined']/df21['enrollment'])*100)
    df21=df21.round({'discipline_percent': 0})

In [8]:
prep21(df21)

In [9]:
df21

Unnamed: 0,dist_name,charter_encoded,enrollment,disciplined,discipline_percent
0,CAYUGA ISD,0,586.0,60.0,10.0
1,ELKHART ISD,0,1321.0,210.0,16.0
2,FRANKSTON ISD,0,847.0,76.0,9.0
3,NECHES ISD,0,362.0,35.0,10.0
4,PALESTINE ISD,0,3707.0,709.0,19.0
...,...,...,...,...,...
953,NEWCASTLE ISD,0,224.0,11.0,5.0
954,OLNEY ISD,0,774.0,106.0,14.0
955,ZAPATA COUNTY ISD,0,3592.0,11.0,0.0
956,CRYSTAL CITY ISD,0,1923.0,53.0,3.0


In [10]:
def prep20(df):
    global df20
    df20=df20.rename(columns={'AGGREGATION LEVEL':'agg_level', 'REGION':'region', 'DISTNAME':'dist_name', 
                              'DISTRICT':'district_num', 'CHARTER_STATUS':'charter_status', 'SECTION':'section', 
                              'HEADING':'heading', 'HEADING NAME':'heading_name'})
    df20['charter_encoded'] = df20.charter_status.map({'OPEN ENROLLMENT CHARTER': 1, 'TRADITIONAL ISD/CSD':0})
    df20=df20[(df20.heading == 'A01') | (df20.heading ==  'A03')]
    df20 = df20[df20['count'] != -999]
    df20.dropna()
    df20=df20.drop_duplicates()
    df20pivot=df20.pivot(index='district_num', columns='heading_name', values= 'count').dropna()
    df20=df20.merge(df20pivot,how= 'right', on= 'district_num')
    df20=df20.drop(columns=['heading', 'heading_name','count', 'agg_level', 'region', 'district_num', 
                            'section','charter_status'])
    df20=df20.rename(columns={'DISTRICT CUMULATIVE YEAR END ENROLLMENT': 'enrollment', 
                              'DISTRICT DISCIPLINE RECORD COUNT':'disciplined'})
    df20=df20.drop_duplicates()
    df20.dropna()
    df20=df20.reset_index(drop=True)
    df20['discipline_percent']= ((df20['disciplined']/df20['enrollment'])*100)
    df20=df20.round({'discipline_percent': 0})

In [11]:
prep20(df20)

In [12]:
df20

Unnamed: 0,dist_name,charter_encoded,enrollment,disciplined,discipline_percent
0,CAYUGA ISD,0,599.0,74.0,12.0
1,ELKHART ISD,0,1359.0,273.0,20.0
2,FRANKSTON ISD,0,862.0,41.0,5.0
3,NECHES ISD,0,389.0,51.0,13.0
4,PALESTINE ISD,0,3713.0,798.0,21.0
...,...,...,...,...,...
1016,NEWCASTLE ISD,0,215.0,17.0,8.0
1017,OLNEY ISD,0,714.0,149.0,21.0
1018,ZAPATA COUNTY ISD,0,3639.0,438.0,12.0
1019,CRYSTAL CITY ISD,0,1986.0,434.0,22.0


In [13]:
def prep19(df):
    global df19
    df19=df19.rename(columns={'AGGREGATION LEVEL':'agg_level', 'REGION':'region', 'DISTNAME':'dist_name', 
                              'DISTRICT':'district_num', 'CHARTER_STATUS':'charter_status', 'SECTION':'section', 
                              'HEADING':'heading', 'HEADING NAME':'heading_name'})
    df19['charter_encoded'] = df19.charter_status.map({'OPEN ENROLLMENT CHARTER': 1, 'TRADITIONAL ISD/CSD':0})
    df19=df19[(df19.heading == 'A01') | (df19.heading ==  'A03')]
    df19 = df19[df19['count'] != -999]
    df19.dropna()
    df19=df19.drop_duplicates()
    df19pivot=df19.pivot(index='district_num', columns='heading_name', values= 'count').dropna()
    df19=df19.merge(df19pivot,how= 'right', on= 'district_num')
    df19=df19.drop(columns=['heading', 'heading_name','count', 'agg_level', 'region', 'district_num', 
                            'section','charter_status'])
    df19=df19.rename(columns={'DISTRICT CUMULATIVE YEAR END ENROLLMENT': 'enrollment',
                              'DISTRICT DISCIPLINE RECORD COUNT':'disciplined'})
    df19=df19.drop_duplicates()
    df19.dropna()
    df19=df19.reset_index(drop=True)
    df19['discipline_percent']= ((df19['disciplined']/df19['enrollment'])*100)
    df19=df19.round({'discipline_percent': 0})

In [14]:
prep19(df19)

In [15]:
df19

Unnamed: 0,dist_name,charter_encoded,enrollment,disciplined,discipline_percent
0,CAYUGA ISD,0,603.0,82.0,14.0
1,ELKHART ISD,0,1372.0,538.0,39.0
2,FRANKSTON ISD,0,888.0,72.0,8.0
3,NECHES ISD,0,392.0,43.0,11.0
4,PALESTINE ISD,0,3640.0,1190.0,33.0
...,...,...,...,...,...
1039,NEWCASTLE ISD,0,226.0,28.0,12.0
1040,OLNEY ISD,0,745.0,270.0,36.0
1041,ZAPATA COUNTY ISD,0,3612.0,471.0,13.0
1042,CRYSTAL CITY ISD,0,2034.0,685.0,34.0


In [16]:
def df_combine(a,b,c,d):
    df=pd.concat([df19,df20,df21,df22], ignore_index=True)
    return(df)

In [17]:
df_combine(df19,df20,df21,df22)

Unnamed: 0,dist_name,charter_encoded,enrollment,disciplined,discipline_percent
0,CAYUGA ISD,0,603.0,82.0,14.0
1,ELKHART ISD,0,1372.0,538.0,39.0
2,FRANKSTON ISD,0,888.0,72.0,8.0
3,NECHES ISD,0,392.0,43.0,11.0
4,PALESTINE ISD,0,3640.0,1190.0,33.0
...,...,...,...,...,...
4050,NEWCASTLE ISD,0,233.0,27.0,12.0
4051,OLNEY ISD,0,776.0,140.0,18.0
4052,ZAPATA COUNTY ISD,0,3520.0,550.0,16.0
4053,CRYSTAL CITY ISD,0,1882.0,486.0,26.0


In [23]:
df

NameError: name 'df' is not defined

In [22]:
df.dist_name.unique()

NameError: name 'df' is not defined

In [18]:
def split_tea_data(df):
    '''
    This function performs split on tea data, stratify charter_encoded.
    Returns train, validate, and test dfs.
    '''
    train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df.charter_encoded)
    train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate.charter_encoded)
    return train, validate, test

In [20]:
train, validate, test= split_tea_data() 

TypeError: split_tea_data() missing 1 required positional argument: 'df'

In [None]:
train.head()