In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from env import user, password, host
from scipy.stats import levene, ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
import math
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
import wrangle_campus as wc
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the five CAMPUS_summary_NN.csv extracts (suffixes 22 down to 18) into
# one raw DataFrame each; every target name mirrors its file's suffix.
cs22, cs21, cs20, cs19, cs18 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'CAMPUS_summary_22.csv',
        'CAMPUS_summary_21.csv',
        'CAMPUS_summary_20.csv',
        'CAMPUS_summary_19.csv',
        'CAMPUS_summary_18.csv',
    )
)

In [3]:
# Run each yearly frame through the project's wc.campus_prep helper and rebind
# the same names to the returned frames.
# NOTE(review): the raw frames are overwritten here, so re-running this cell on
# its own feeds already-prepared data back into campus_prep — confirm that the
# helper tolerates that, or re-run the load cell first.
cs22, cs21, cs20, cs19, cs18 = map(
    wc.campus_prep,
    (cs22, cs21, cs20, cs19, cs18),
)

In [4]:
# Hand the five prepared yearly frames (suffix 18 first, 22 last) to the
# project's wc.df_combine helper; the result is the working frame `df`.
df = wc.df_combine(
    cs18,
    cs19,
    cs20,
    cs21,
    cs22,
)

In [10]:
# Preview up to the first 50 rows of the combined frame as a sanity check.
df.head(50)

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
0,ELKHART H S 001903001,0,411.0,205.0,173.0,50.0,42.0
1,ELKHART MIDDLE 001903041,0,298.0,151.0,138.0,51.0,46.0
2,ELKHART EL 001903101,0,379.0,11.0,11.0,3.0,3.0
3,ELKHART INT 001903102,0,276.0,151.0,146.0,55.0,53.0
4,PALESTINE H S 001907001,0,1044.0,582.0,463.0,56.0,44.0
5,STORY INT 001907110,0,815.0,248.0,217.0,30.0,27.0
6,WESTWOOD H S 001908002,0,462.0,275.0,182.0,60.0,39.0
7,SLOCUM H S 001909001,0,123.0,35.0,24.0,28.0,20.0
8,SLOCUM PK - 8 001909101,0,331.0,19.0,15.0,6.0,5.0
9,ANDREWS H S 002901001,0,1097.0,233.0,159.0,21.0,14.0


In [11]:
# Split the combined frame into train / validate / test subsets using the
# project's wc.split_campus_data helper (returns the three frames in that order).
(train, validate, test) = wc.split_campus_data(df)

In [12]:
# Peek at the first five rows of the training split.
train.head()

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
4629,HILL MIDDLE 015910059,0,1165.0,207.0,161.0,18.0,14.0
5028,EAST CHAMBERS H S 036903001,0,451.0,145.0,96.0,32.0,21.0
19247,KIPP VOYAGE ACADEMY FOR GIRLS 227820055,1,365.0,48.0,34.0,13.0,9.0
1767,BIRCH EL 092904104,0,744.0,170.0,91.0,23.0,12.0
10382,LA MESA EL 095905108,0,525.0,39.0,36.0,7.0,7.0


In [13]:
# Summarize the training split: row count, column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10978 entries, 4629 to 587
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   campus_name_num     10978 non-null  object 
 1   charter_encoded     10978 non-null  int64  
 2   student_enrollment  10978 non-null  float64
 3   discipline_count    10978 non-null  float64
 4   iss                 10978 non-null  float64
 5   discipline_percent  10978 non-null  float64
 6   iss_percent         10978 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 686.1+ KB


In [14]:
# Summarize the validation split: row count, column dtypes and non-null counts.
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4706 entries, 18501 to 6024
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   campus_name_num     4706 non-null   object 
 1   charter_encoded     4706 non-null   int64  
 2   student_enrollment  4706 non-null   float64
 3   discipline_count    4706 non-null   float64
 4   iss                 4706 non-null   float64
 5   discipline_percent  4706 non-null   float64
 6   iss_percent         4706 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 294.1+ KB


In [15]:
# Summarize the test split: row count, column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3921 entries, 15018 to 18672
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   campus_name_num     3921 non-null   object 
 1   charter_encoded     3921 non-null   int64  
 2   student_enrollment  3921 non-null   float64
 3   discipline_count    3921 non-null   float64
 4   iss                 3921 non-null   float64
 5   discipline_percent  3921 non-null   float64
 6   iss_percent         3921 non-null   float64
dtypes: float64(5), int64(1), object(1)
memory usage: 245.1+ KB


In [16]:
# Show the first five training rows again (same call as the earlier preview
# cell); kept as a visual reference for the column names used below.
train.head()

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
4629,HILL MIDDLE 015910059,0,1165.0,207.0,161.0,18.0,14.0
5028,EAST CHAMBERS H S 036903001,0,451.0,145.0,96.0,32.0,21.0
19247,KIPP VOYAGE ACADEMY FOR GIRLS 227820055,1,365.0,48.0,34.0,13.0,9.0
1767,BIRCH EL 092904104,0,744.0,170.0,91.0,23.0,12.0
10382,LA MESA EL 095905108,0,525.0,39.0,36.0,7.0,7.0


In [18]:
# Boolean mask aligned to the training split's index: True where a row's
# discipline_percent is greater than 100.
dfdp = train['discipline_percent'].gt(100)

In [19]:
# Display the boolean mask (True marks rows with discipline_percent > 100).
dfdp

4629     False
5028     False
19247    False
1767     False
10382    False
         ...  
8400      True
3891     False
1947     False
18708    False
587      False
Name: discipline_percent, Length: 10978, dtype: bool

In [46]:
# Keep only the rows of the combined frame whose discipline_percent is
# strictly greater than 300.
df_filtered = df[df['discipline_percent'].gt(300)]


In [47]:
# Display the rows that passed the discipline_percent filter.
df_filtered

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
2691,OZEN H S 123910004,0,1220.0,3790.0,1869.0,311.0,153.0
2694,SMITH MIDDLE 123910042,0,582.0,1913.0,971.0,329.0,167.0
4088,LIBERTY ACADEMY 235902006,0,253.0,780.0,568.0,308.0,225.0
4553,E T WRENN MIDDLE 015905044,0,685.0,2204.0,1287.0,322.0,188.0
5509,MESQUITE ACADEMY 057914656,0,63.0,349.0,267.0,554.0,424.0
7013,BEAUMONT UNITED H S 123910014,0,2400.0,11381.0,5656.0,474.0,236.0
7014,SMITH MIDDLE 123910042,0,625.0,2618.0,1458.0,419.0,233.0
7019,VINCENT MIDDLE 123910048,0,596.0,2373.0,1496.0,398.0,251.0
11166,SOUTH PARK MIDDLE 123910045,0,13.0,92.0,18.0,708.0,138.0
12489,LIBERTY CREDIT RECOVERY 235902006,0,326.0,1063.0,592.0,326.0,182.0


In [28]:
# Largest student_enrollment value among the filtered rows.
df_filtered['student_enrollment'].max()

3475.0

In [29]:
# Select the row(s) of df_filtered that have the largest student_enrollment.
# The maximum is derived from the current frame instead of the previously
# hard-coded 3475: that literal came from an earlier kernel state (its recorded
# row has discipline_percent 120, which the `> 300` filter excludes), so on a
# fresh Restart & Run All the literal comparison would match nothing.
# Ties are all kept; an empty df_filtered simply yields an empty result.
max_sch = df_filtered.loc[
    df_filtered['student_enrollment'] == df_filtered['student_enrollment'].max()
]

In [30]:
# Display the filtered row(s) selected by the enrollment comparison above.
max_sch

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
17305,DAVIS H S ALDINE 101902012,0,3475.0,4159.0,1508.0,120.0,43.0


In [39]:
# Subset of df_filtered flagged as charter campuses.
# The flag is compared against 1 (previously 0): the variable is named
# `charters` and the output recorded for it lists only charter_encoded == 1
# rows, so filtering on 0 selected the opposite group.
# NOTE(review): assumes charter_encoded == 1 means "is a charter" — confirm
# against the encoding done in wrangle_campus.
charters = df_filtered[df_filtered['charter_encoded'] == 1]

In [38]:
# Display the subset stored in `charters`.
charters

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
921,UPLIFT WILLIAMS PREPARATORY PRI 057803112,1,588.0,25.0,11.0,4.0,2.0
1843,KIPP ZENITH ACADEMY 101813114,1,866.0,27.0,17.0,3.0,2.0
2444,IDEA COLLEGE PREPARATORY MISSION 108807004,1,819.0,35.0,19.0,4.0,2.0
2989,HARMONY SCIENCE ACADEMY- FORT WORT 161807007,1,679.0,29.0,10.0,4.0,1.0
5268,UPLIFT SUMMIT INTERNATIONAL PRI 057803109,1,622.0,27.0,13.0,4.0,2.0
5277,TRINITY BASIN PREPARATORY 057813104,1,1545.0,66.0,26.0,4.0,2.0
5278,GOLDEN RULE CHARTER SCHOOL 057835001,1,867.0,32.0,16.0,4.0,2.0
5279,LA ACADEMIA DE ESTRELLAS 057839101,1,1127.0,46.0,28.0,4.0,2.0
5683,COMPASS ACADEMY CHARTER SCHOOL 068802001,1,1129.0,32.0,32.0,3.0,3.0
6765,IDEA COLLEGE PREP 108807001,1,773.0,27.0,16.0,3.0,2.0


In [48]:
# Pull every row of the combined frame whose campus_name_num is exactly
# 'CAMELOT EL 015910118'.
camelot = df.loc[df['campus_name_num'].eq('CAMELOT EL 015910118')]

In [49]:
# Display all rows found for the CAMELOT EL campus.
camelot

Unnamed: 0,campus_name_num,charter_encoded,student_enrollment,discipline_count,iss,discipline_percent,iss_percent
307,CAMELOT EL 015910118,0,702.0,234.0,46.0,33.0,7.0
4643,CAMELOT EL 015910118,0,675.0,270.0,58.0,40.0,9.0
9019,CAMELOT EL 015910118,0,638.0,218.0,28.0,34.0,4.0
12912,CAMELOT EL 015910118,0,521.0,91.0,11.0,17.0,2.0
15753,CAMELOT EL 015910118,0,599.0,242.0,56.0,40.0,9.0
