In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv('clustered_data.csv')

In [None]:
# Features: columns from position 4 up to (but not including) the 13th-from-last column.
# Target: the 13th-from-last column.
# NOTE(review): these positional offsets assume a fixed CSV column layout — confirm,
# and consider selecting the columns by name instead.
X = df.iloc[:, 4:-13]
Y = df.iloc[:, -13]

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Base k-nearest-neighbours estimator (library defaults); tuned by the grid search below.
knn = KNeighborsClassifier()
# Disabled 10-fold baseline, kept for reference. cross_val_score is not imported in this
# notebook, so these lines would need that import before being re-enabled.
#knn_cv_scores = cross_val_score(knn, X_train, Y_train, cv = 10)
#knn_cv_scores.mean()

In [None]:
# Search space for KNN: every k from 2 to 40 inclusive, under both weighting schemes.
knn_params = dict(
    n_neighbors = [k for k in range(2, 41)],
    weights = ['uniform', 'distance'],
)

In [None]:
# Exhaustive 5-fold search over knn_params, ranked by accuracy.
knn_cv = GridSearchCV(
    estimator = knn,
    param_grid = knn_params,
    scoring = 'accuracy',
    cv = 5,
)
knn_cv.fit(X_train, Y_train)

In [None]:
# Best mean cross-validated accuracy, expressed as an error rate.
1-knn_cv.best_score_

In [None]:
# Parameter combination that achieved the best mean CV accuracy.
knn_cv.best_params_

In [None]:
# Mean CV error (1 - accuracy) for every grid point that used 'uniform' weights.
# Rows are selected by the recorded parameter value rather than by a hard-coded
# index stride, so this stays correct if the grid size or ordering changes.
knn_uniform_cv_error = [
    1 - mean_score
    for grid_point, mean_score in zip(knn_cv.cv_results_['params'],
                                      knn_cv.cv_results_['mean_test_score'])
    if grid_point['weights'] == 'uniform'
]

In [None]:
# Mean CV error (1 - accuracy) for every grid point that used 'distance' weights.
# Rows are selected by the recorded parameter value rather than by a hard-coded
# index stride, so this stays correct if the grid size or ordering changes.
knn_distance_cv_error = [
    1 - mean_score
    for grid_point, mean_score in zip(knn_cv.cv_results_['params'],
                                      knn_cv.cv_results_['mean_test_score'])
    if grid_point['weights'] == 'distance'
]

In [None]:
# CV error against k for both weighting schemes.
# The x-axis comes from the searched grid so it always matches the error lists.
k_values = knn_params['n_neighbors']
plt.plot(k_values, knn_uniform_cv_error, label = '"Uniform" Weights')
plt.plot(k_values, knn_distance_cv_error, label = '"Distance" Weights')

plt.xlabel('K')
plt.ylabel('Mean CV Error')
plt.title('KNN CV Error')
# A single legend built from the line labels. The earlier plt.legend('Distance') call
# was dropped: a bare string is split into one label per character.
plt.legend()

plt.show()

In [None]:
# Final KNN model, fitted on the full training split and used to predict the test split.
# NOTE(review): k = 22 and 'distance' weighting are typed in by hand — presumably copied
# from knn_cv.best_params_; verify they still match after re-running the search.
knn = KNeighborsClassifier(weights = 'distance', n_neighbors = 22).fit(X_train, Y_train)
knn_pred = knn.predict(X_test)

In [None]:
# Mean accuracy of the final KNN model on the held-out test split.
knn_acc = knn.score(X_test, Y_test)
knn_acc

In [None]:
# Confusion matrix and per-class report for the KNN test predictions.
knn_confusion = confusion_matrix(Y_test, knn_pred)
knn_report = classification_report(Y_test, knn_pred)
print(knn_confusion)
print(knn_report)

In [None]:
# Base decision tree for the grid search. Seeded (same seed as the train/test split)
# so that repeated runs of the search give the same scores.
clf_tree = tree.DecisionTreeClassifier(random_state = 42)
# Search space for the decision tree.
# min_samples_split starts at 2: an integer value of 1 is rejected by
# DecisionTreeClassifier, so every grid point using it would fail to fit.
tree_params = {'max_depth': list(range(1, 31, 2)),
               'min_samples_split': list(range(2, 21, 3)),
               'min_samples_leaf': list(range(1, 11, 3)),
               'max_features': [None, 'sqrt', 'log2'],
               'criterion': ['gini', 'entropy'],
               }

In [None]:
# Exhaustive 5-fold search over tree_params, ranked by accuracy.
tree_cv = GridSearchCV(
    estimator = clf_tree,
    param_grid = tree_params,
    scoring = 'accuracy',
    cv = 5,
)
tree_cv.fit(X_train, Y_train)

In [None]:
# Parameter combination that achieved the best mean CV accuracy.
tree_cv.best_params_

In [None]:
# Best mean cross-validated accuracy, expressed as an error rate.
1-tree_cv.best_score_

In [None]:
# Final decision tree, fitted on the full training split.
# Seeded so the fitted tree (and its test accuracy / feature importances) is repeatable.
# NOTE(review): these settings are typed in by hand, and min_samples_split = 15 is not
# one of the values in the tree_params grid, so they were not produced by the search
# as currently written — verify against tree_cv.best_params_.
clf_tree = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 5, max_features = None,
                                       min_samples_leaf = 1, min_samples_split = 15,
                                       random_state = 42)
clf_tree.fit(X_train, Y_train)


In [None]:
# Mean accuracy of the final decision tree on the held-out test split.
tree_acc = clf_tree.score(X_test, Y_test)

In [None]:
# Display the decision tree's test accuracy.
tree_acc

In [None]:
# Test-split predictions from the decision tree, then its confusion matrix and per-class report.
tree_pred = clf_tree.predict(X_test)
tree_confusion = confusion_matrix(Y_test, tree_pred)
tree_report = classification_report(Y_test, tree_pred)
print(tree_confusion)
print(tree_report)

In [None]:
# Draw the fitted decision tree.
tree.plot_tree(clf_tree, filled = True, fontsize=8, rounded = True)
# plt.show must be called; the bare attribute only echoed the function object.
plt.show()

In [None]:
# Feature names and their importances from the fitted decision tree, in column order.
# Taken straight from the training frame and the model, so no feature count is hard-coded.
tree_features = list(X_train.columns)
tree_importance = list(clf_tree.feature_importances_)

In [None]:
# Bar chart of decision-tree feature importances, one bar per feature.
tree_bar_positions = range(len(tree_features))
f1 = plt.bar(tree_bar_positions, tree_importance)

plt.title('Decision Tree Feature Importance')
plt.xlabel('feature')
plt.ylabel('feature importance')
# Vertical tick labels so the feature names do not overlap.
plt.xticks(tree_bar_positions, tree_features, rotation = 90)

plt.show()

In [None]:
# Base random forest for the grid search. Seeded (same seed as the train/test split)
# so that repeated runs of the search give the same scores.
clf_rforest = RandomForestClassifier(random_state = 42)
# Search space for the random forest (6 * 3 * 3 * 3 * 2 * 2 = 648 combinations).
rf_params = dict(
    n_estimators = list(range(50, 301, 50)),
    max_depth = list(range(1, 31, 10)),
    min_samples_split = [2, 5, 10],
    min_samples_leaf = [1, 2, 4],
    max_features = ['sqrt', 'log2'],
    criterion = ['gini', 'entropy'],
)

In [None]:
# Exhaustive 5-fold search over rf_params, ranked by accuracy.
rf_cv = GridSearchCV(
    estimator = clf_rforest,
    param_grid = rf_params,
    scoring = 'accuracy',
    cv = 5,
)
rf_cv.fit(X_train, Y_train)

In [None]:
# Parameter combination that achieved the best mean CV accuracy.
rf_cv.best_params_

In [None]:
# Best mean cross-validated accuracy, expressed as an error rate.
1-rf_cv.best_score_

In [None]:
# Final random forest, fitted on the full training split and scored on the test split.
# Seeded so the accuracy and feature importances are repeatable.
# NOTE(review): the settings are typed in by hand — presumably copied from
# rf_cv.best_params_; verify they still match after re-running the search.
rf = RandomForestClassifier(criterion = 'entropy', max_depth = 11, max_features = 'log2',
                            min_samples_leaf = 1, min_samples_split = 2, n_estimators = 250,
                            random_state = 42)
rf.fit(X_train, Y_train)
rf_acc = rf.score(X_test, Y_test)

In [None]:
# Display the random forest's test accuracy.
rf_acc

In [None]:
# Confusion matrix and per-class report for the random forest.
# The forest fitted in the earlier cell is reused instead of being rebuilt: the previous
# `RandomForestClassifier(*input parameters*)` placeholder was not valid Python, and
# reusing `rf` keeps this report consistent with rf_acc.
rf_pred = rf.predict(X_test)
print(confusion_matrix(Y_test, rf_pred))
print(classification_report(Y_test, rf_pred))

In [None]:
# Feature names and their importances from the fitted random forest, in column order.
# Taken straight from the training frame and the model, so no feature count is hard-coded.
rf_features = list(X_train.columns)
rf_importance = list(rf.feature_importances_)

In [None]:
# Raw importance array from the fitted forest (same order as X_train.columns).
rf.feature_importances_

In [None]:
# Bar chart of random-forest feature importances, one bar per feature.
rf_bar_positions = range(len(rf_features))
f1 = plt.bar(rf_bar_positions, rf_importance)

plt.title('Random Forest Feature Importance')
plt.xlabel('feature')
plt.ylabel('feature importance')
# Vertical tick labels so the feature names do not overlap.
plt.xticks(rf_bar_positions, rf_features, rotation = 90)

plt.show()

In [None]:
# (Removed) This cell re-declared clf_rforest and rf_params exactly as the random-forest
# grid-search setup earlier in the notebook; those definitions are still in scope.

In [None]:
# (Removed) This cell repeated the full random-forest grid search (648 combinations x 5
# folds) that already ran earlier in the notebook. The fitted rf_cv from that cell is
# still in scope, and nothing after this point reads a new result.

In [None]:
# Base multi-layer perceptron for the grid search. Seeded (same seed as the train/test
# split) so weight initialisation, and therefore the search scores, are repeatable.
nn = MLPClassifier(random_state = 42)
# Search space for the MLP: one wide hidden layer vs. two narrower ones,
# each under three alpha values.
nn_params = dict(
    hidden_layer_sizes = [(300,), (50, 50)],
    alpha = [.0001, .05, .1],
)


In [None]:
# Exhaustive 5-fold search over nn_params, ranked by accuracy.
nn_cv = GridSearchCV(
    estimator = nn,
    param_grid = nn_params,
    scoring = 'accuracy',
    cv = 5,
)
nn_cv.fit(X_train, Y_train)

In [None]:
# Best mean cross-validated accuracy. Note this is accuracy, not the error rate
# (1 - score) shown for the other models.
nn_cv.best_score_

In [None]:
# Parameter combination that achieved the best mean CV accuracy.
nn_cv.best_params_

In [None]:
# Final MLP, fitted on the full training split.
# Seeded so the fitted weights (and the test accuracy) are repeatable.
# NOTE(review): the settings are typed in by hand — presumably copied from
# nn_cv.best_params_; verify they still match after re-running the search.
nn = MLPClassifier(hidden_layer_sizes = (300,), alpha = 0.1, random_state = 42)
nn.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the final MLP on the held-out test split.
nn_acc = nn.score(X_test, Y_test)

In [None]:
# Display the MLP's test accuracy.
nn_acc

In [None]:
# Test-split predictions from the MLP, then its confusion matrix and per-class report.
nn_pred = nn.predict(X_test)
nn_confusion = confusion_matrix(Y_test, nn_pred)
nn_report = classification_report(Y_test, nn_pred)
print(nn_confusion)
print(nn_report)