In [91]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sqlalchemy import create_engine
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra.policies import DCAwareRoundRobinPolicy, TokenAwarePolicy
from cassandra.auth import PlainTextAuthProvider
def getCluster():
    profile = ExecutionProfile(load_balancing_policy=TokenAwarePolicy(DCAwareRoundRobinPolicy(local_dc='AWS_US_EAST_1')),
                               row_factory=pandas_factory)
    return Cluster(
        execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        contact_points=[
           "node-0.aws_us_east_1.**********.clusters.scylla.cloud",
           "node-1.aws_us_east_1.**********.clusters.scylla.cloud",
           "node-2.aws_us_east_1.**********.clusters.scylla.cloud"
        ],
        port=9042,
        auth_provider = PlainTextAuthProvider(username="scylla", password="*******"))


def pandas_factory(colnames, rows):
    return pd.DataFrame(rows, columns=colnames)

cluster = getCluster()
session = cluster.connect()

In [99]:
query = "SELECT * FROM demo.flight_features WHERE cancelled = 0 and diverted = 0 LIMIT 50000;"
rows = session.execute(query)
df = rows._current_rows

In [100]:
#split dataset in features and target variable
feature_cols = ["actual_elapsed_time", "air_time", 
                "arr_time", "crs_arr_time", "crs_dep_time", 
                "crs_elapsed_time", "dep_time", "distance", "taxi_in", "taxi_out", 
                "wheels_off", "wheels_on", "arr_delay"]

X = df[feature_cols] # Features
y = df.dep_delay # Target variable

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test


In [103]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.052884615384615384


In [None]:
from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('flight_delayed.png')
Image(graph.create_png())