# Importing necessary libraries

In [38]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb
import sklearn
from sklearn.cluster import DBSCAN
from collections import Counter
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
sb.set_style('whitegrid')

# Reading dataset as a dataframe

We also conduct some simple data cleaning here, dropping any rows containing null values and dropping the PRIMARY_KEY and STATE columns.

In [8]:
# Load the raw table, then build the cleaned frame in a single chain:
# keep only rows without nulls and drop the PRIMARY_KEY and STATE columns.
df = pd.read_csv('states_all.csv')
df_cleaned = df.dropna().drop(columns=['PRIMARY_KEY', 'STATE'])

# Classification

## Decision Tree

Here we classify the grade 8 average reading test score, using all the remaining columns in our cleaned dataset as features.

In [41]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


# Features are every cleaned column except the target; the target is the
# grade-8 average reading score.
# NOTE(review): the target is a numeric score, so the classifier treats each
# distinct value as its own class - confirm classification (rather than
# regression) is what is intended here.
x = df_cleaned.drop(columns='AVG_READING_8_SCORE')
y = df_cleaned.loc[:, 'AVG_READING_8_SCORE']

# Seeded decision tree, scored with 10-fold cross-validation; the array of
# per-fold scores is the cell's displayed output.
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(clf, x, y, cv=10)



array([0.10909091, 0.12765957, 0.14285714, 0.15789474, 0.17142857,
       0.09090909, 0.2       , 0.10714286, 0.04      , 0.04545455])

In [45]:
# Show the names of the feature columns that are passed to the classifier.
x.columns

Index(['YEAR', 'ENROLL', 'TOTAL_REVENUE', 'FEDERAL_REVENUE', 'STATE_REVENUE',
       'LOCAL_REVENUE', 'TOTAL_EXPENDITURE', 'INSTRUCTION_EXPENDITURE',
       'SUPPORT_SERVICES_EXPENDITURE', 'OTHER_EXPENDITURE',
       'CAPITAL_OUTLAY_EXPENDITURE', 'GRADES_PK_G', 'GRADES_KG_G',
       'GRADES_4_G', 'GRADES_8_G', 'GRADES_12_G', 'GRADES_1_8_G',
       'GRADES_9_12_G', 'GRADES_ALL_G', 'AVG_MATH_4_SCORE', 'AVG_MATH_8_SCORE',
       'AVG_READING_4_SCORE'],
      dtype='object')

Here we split our data into training and test sets, holding out 20% of the data as the test set.

In [43]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and therefore the reported score, is reproducible across kernel
# restarts (same seed value as the classifier).
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=0
)
# fit() returns the estimator itself, so clf stays bound to the same
# (now fitted) object.
clf = clf.fit(X_train, y_train)

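After training, we test the classifier against our test set, receiving an accuracy score of only about 10% (see the output below). This low score is consistent with the cross-validation results above: the target is a numeric test score, so the classifier treats every distinct score value as a separate class.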

In [44]:
# Predicted labels for the held-out rows, kept in a variable for later use.
predictions = clf.predict(X_test)
# Mean accuracy of the classifier on the test set (the cell's displayed output).
clf.score(X_test, y_test)

0.09859154929577464

We can visualize the nodes within the tree here

In [37]:
# sklearn.externals.six has been removed from scikit-learn; on Python 3 its
# StringIO was simply io.StringIO, so the standard-library class is a drop-in
# replacement for buffering the DOT source.
from io import StringIO
from sklearn import tree
import pydot
import os

# Make the Graphviz binaries discoverable (Windows install location).
# Guarded so that re-running this cell does not append the same directory to
# PATH over and over.
GRAPHVIZ_BIN = 'C:/Program Files (x86)/Graphviz2.38/bin/'
if GRAPHVIZ_BIN not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN

def generateTree(clf, out_name, feature_names=None):
    """Export a fitted decision tree to a PDF file via Graphviz.

    Parameters
    ----------
    clf : fitted decision tree estimator
        The tree handed to ``tree.export_graphviz``.
    out_name : str
        Path of the PDF file to write.
    feature_names : list of str, optional
        Names used to label the features in the split nodes. When left as
        ``None`` (the default) the export falls back to generic,
        index-based labels, which is the previous behaviour.
    """
    # Write the DOT description into an in-memory text buffer.
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data, feature_names=feature_names)
    # The parsed result is indexed like a list; the first graph is rendered.
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph[0].write_pdf(out_name)

generateTree(clf, "decision_tree.pdf")