In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.policies import DCAwareRoundRobinPolicy, TokenAwarePolicy
from cassandra.auth import PlainTextAuthProvider
import config

def pandas_factory(colnames, rows):
    return pd.DataFrame(rows, columns=colnames)

def getCluster():
    profile = ExecutionProfile(load_balancing_policy=TokenAwarePolicy(DCAwareRoundRobinPolicy(local_dc=config.DB_DATACENTER)),
                               row_factory=pandas_factory)
    return Cluster(
        execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        contact_points=config.DB_HOST,
        port=config.DB_PORT,
        auth_provider = PlainTextAuthProvider(username=config.DB_USER, password=config.DB_PASS))

cluster = getCluster()
session = cluster.connect()

In [2]:
query = "SELECT * FROM demo.flight_features WHERE cancelled = 0 and diverted = 0 LIMIT 50000 ALLOW FILTERING;"
rows = session.execute(query)
df = rows._current_rows

In [3]:
#split dataset in features and target variable
feature_cols = ["actual_elapsed_time", "air_time", 
                "arr_time", "crs_arr_time", "crs_dep_time", 
                "crs_elapsed_time", "dep_time", "distance", "taxi_in", "taxi_out", 
                "wheels_off", "wheels_on", "arr_delay"]

X = df[feature_cols] # Features
y = df.dep_delay # Target variable

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test


In [4]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.0521978021978022


In [7]:
from sklearn.tree import export_text

tree_rules = export_text(clf, feature_names=feature_cols)
print(tree_rules)

|--- arr_delay <= 6.50
|   |--- arr_delay <= -13.50
|   |   |--- taxi_out <= 18.50
|   |   |   |--- distance <= 864.00
|   |   |   |   |--- crs_elapsed_time <= 164.50
|   |   |   |   |   |--- taxi_in <= 22.00
|   |   |   |   |   |   |--- taxi_in <= 16.50
|   |   |   |   |   |   |   |--- arr_delay <= -21.50
|   |   |   |   |   |   |   |   |--- taxi_in <= 14.50
|   |   |   |   |   |   |   |   |   |--- air_time <= 125.00
|   |   |   |   |   |   |   |   |   |   |--- taxi_in <= 12.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 13
|   |   |   |   |   |   |   |   |   |   |--- taxi_in >  12.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: -5.0
|   |   |   |   |   |   |   |   |   |--- air_time >  125.00
|   |   |   |   |   |   |   |   |   |   |--- class: -9.0
|   |   |   |   |   |   |   |   |--- taxi_in >  14.50
|   |   |   |   |   |   |   |   |   |--- class: -6.0
|   |   |   |   |   |   |   |--- arr_delay >  -21.50
|   |   |   |   |   |   |   |   |--- c