In [11]:
# all the standard libraries imports
import pandas as pd
import numpy as np 
import sklearn as sk
import string

# metrics reporting imports
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# models imports
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.multiclass import OneVsRestClassifier

# others
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [12]:
# load the datasets
advertising = pd.read_csv('./data/Advertising.csv', index_col = 0) # no need for OHE
college = pd.read_csv('./data/College.csv', index_col = 0)
auto = pd.read_csv('./data/Auto.csv') # ordinal data in "year"
ch10ex11 = pd.read_csv('./data/Ch10Ex11.csv') # no need for OHE 
credit = pd.read_csv('./data/Credit.csv', index_col = 0) # OHE "ethnicity", "gender", "student" and "married".
fortune500 = pd.read_csv('./data/fortune500.csv', index_col='Rank') # ordinal data in "year", categorical in "company". 
heart = pd.read_csv('./data/Heart.csv', index_col = 0) # OHE chestpain, classify for ADH 
income1 = pd.read_csv('./data/income1.csv', index_col = 0) # no need for OHE
income2 = pd.read_csv('./data/income2.csv', index_col = 0) # no need for OHE 
iris = pd.read_csv('./data/iris.csv', index_col = 'Id') # no need for OHE, classify for species - ready
iris5 = pd.read_csv('./data/iris5.csv') # no need for OHE, classify for species - ready
SAheart1 = pd.read_csv('./data/SAheart.csv') # OHE famhist, classify for chd
SAheart2 = pd.read_csv('./data/SAheart2.csv', index_col = 'row.names') # OHE famhist, classify for chd
# all ids are kept in index columns

In [15]:
arr1 = list(string.ascii_lowercase)
arr2 = [(letter + letter) for letter in arr1[:14]]
ch10ex11.columns = arr1 + arr2

fortune500.rename(columns={'Profit (in millions)':'Profit'}, inplace=True)
college.rename(columns={'Grad.Rate':'Gradrate'}, inplace=True)
fortune500 = fortune500.dropna()
heart = heart.dropna()

# replace categorical and ordinal features with one-hot encoding where necessary
SAheart1 = pd.get_dummies(SAheart1, columns = ['famhist'], drop_first = True)
SAheart2 = pd.get_dummies(SAheart2, columns = ['famhist'], drop_first = True)
auto = pd.get_dummies(auto, columns = ['year'], drop_first = True) # year_70 is a baseline, classify for 'name'
credit = pd.get_dummies(credit, columns = ["Ethnicity", "Gender", "Student", "Married"], drop_first = True)
fortune500 = pd.get_dummies(fortune500, columns = ["Year", "Company"], drop_first = True)
heart = pd.get_dummies(heart, columns = ["ChestPain", "Thal"], drop_first = True)
college = pd.get_dummies(college, columns = ["Private"], drop_first = True)

In [16]:
def split_data(dataframe, target_column):
    # returns X_train, X_test, y_train, y_test
    return train_test_split(dataframe.drop(target_column, axis = 1), dataframe[[target_column]], test_size = 0.3, random_state = 100)

In [17]:
# function for printing a report on certain model's accuracy on a given dataset
def evaluate_classifier(model_name, dataset_name, model, dataset, target_column):
    X_train, X_test, y_train, y_test = split_data(dataset, target_column)
    model.fit(X_train, y_train.values.ravel())    # build the model on training data
    y_pred = model.predict(X_test)    # make predictions for test data
    accuracy = accuracy_score(y_test, y_pred) * 100    # calculate the accuracy score
    print(f'{model_name} accuracy on {dataset_name} is: {accuracy}')
    
def evaluate_regressor(model_name, dataset_name, model, dataset, target_column):
    X_train, X_test, y_train, y_test = split_data(dataset, target_column)
    model.fit(X_train, y_train.values.ravel()) #  build the model on training data
    y_pred = model.predict(X_test) # make predictions for test data
    score = model.score(dataset.drop(target_column, axis = 1), dataset[[target_column]])    # calculate the RSS
    print(f'{model_name} score on {dataset_name} is: {score}')

In [18]:
# to report the performance of all considered classifiers for a given dataset
def report_all_classifiers(dataset_name, dataset, target_column):
    evaluate_classifier("Decision tree classifier", dataset_name, tree.DecisionTreeClassifier(criterion='entropy', max_depth=5), dataset, target_column)
    evaluate_classifier("Random forest", dataset_name, RandomForestClassifier(n_estimators=10), dataset, target_column)
    evaluate_classifier("SVC", dataset_name, SVC(gamma='auto'), dataset, target_column)
    evaluate_classifier("K Nearest Neighbors", dataset_name, neighbors.KNeighborsClassifier(), dataset, target_column)
    evaluate_classifier("Gaussian Naive Bayes", dataset_name, GaussianNB(), dataset, target_column)
    evaluate_classifier("Multilayer perceptron(neural network)", dataset_name, MLPClassifier(), dataset, target_column)
    evaluate_classifier("Gaussian process classifier", dataset_name, GaussianProcessClassifier(), dataset, target_column)
    evaluate_classifier("SGD", dataset_name, OneVsRestClassifier(SGDClassifier()), dataset, target_column)
    
def report_all_regressors(dataset_name, dataset, target_column):  
    evaluate_regressor("Decision tree regressor", dataset_name, tree.DecisionTreeRegressor(criterion='mse', max_depth=5), dataset, target_column)
    evaluate_regressor("Random forest regressor", dataset_name, RandomForestRegressor(n_estimators=10), dataset, target_column)
    evaluate_regressor("SVR", dataset_name, SVR(gamma='auto'), dataset, target_column)
    evaluate_regressor("K Nearest Neighbors", dataset_name, neighbors.KNeighborsRegressor(), dataset, target_column)
    evaluate_regressor("Multilayer perceptron(neural network)", dataset_name, MLPRegressor(), dataset, target_column)
    evaluate_regressor("Gaussian process regressor", dataset_name, GaussianProcessRegressor(), dataset, target_column)
    evaluate_regressor("SGD", dataset_name, SGDRegressor(), dataset, target_column)

In [19]:
report_all_classifiers("Iris", iris, "Species") # works well

Decision tree classifier accuracy on Iris is: 95.55555555555556
Random forest accuracy on Iris is: 95.55555555555556
SVC accuracy on Iris is: 97.77777777777777
K Nearest Neighbors accuracy on Iris is: 97.77777777777777
Gaussian Naive Bayes accuracy on Iris is: 95.55555555555556
Multilayer perceptron(neural network) accuracy on Iris is: 100.0
Gaussian process classifier accuracy on Iris is: 95.55555555555556
SGD accuracy on Iris is: 95.55555555555556




In [20]:
report_all_classifiers("Auto", auto, "name") # results very low :(

Decision tree classifier accuracy on Auto is: 3.389830508474576
Random forest accuracy on Auto is: 2.5423728813559325
SVC accuracy on Auto is: 0.847457627118644
K Nearest Neighbors accuracy on Auto is: 1.694915254237288
Gaussian Naive Bayes accuracy on Auto is: 2.5423728813559325
Multilayer perceptron(neural network) accuracy on Auto is: 0.847457627118644
Gaussian process classifier accuracy on Auto is: 0.847457627118644
SGD accuracy on Auto is: 0.0


In [21]:
report_all_regressors("ch10ex11", ch10ex11, "nn")

Decision tree regressor score on ch10ex11 is: 0.3302193428481114
Random forest regressor score on ch10ex11 is: 0.6230212709639362
SVR score on ch10ex11 is: 0.5563103320710193
K Nearest Neighbors score on ch10ex11 is: 0.3662756639269928
Multilayer perceptron(neural network) score on ch10ex11 is: 0.5844228405036171
Gaussian process regressor score on ch10ex11 is: 0.682391045767796
SGD score on ch10ex11 is: 0.25144023328188414




In [22]:
report_all_regressors("Advertising", advertising, "sales")

Decision tree regressor score on Advertising is: 0.9756368411531154
Random forest regressor score on Advertising is: 0.9889506633909582
SVR score on Advertising is: 0.14297617464062307
K Nearest Neighbors score on Advertising is: 0.9351074123633766
Multilayer perceptron(neural network) score on Advertising is: -0.3684617632447509
Gaussian process regressor score on Advertising is: -1.516157136088004
SGD score on Advertising is: -7.734379305722402e+25


In [23]:
report_all_regressors("Income", income1, "Income")

Decision tree regressor score on Income is: 0.9749849000300786
Random forest regressor score on Income is: 0.9700660667795222
SVR score on Income is: 0.20389819583580548
K Nearest Neighbors score on Income is: 0.9658378246095239
Multilayer perceptron(neural network) score on Income is: 0.5127373035378616
Gaussian process regressor score on Income is: -1176.0849285773018
SGD score on Income is: 0.7768090873943498




In [24]:
report_all_regressors("Income2", income2, "Income")

Decision tree regressor score on Income2 is: 0.9412411103436541
Random forest regressor score on Income2 is: 0.9623491520872988
SVR score on Income2 is: -0.14259757674899665
K Nearest Neighbors score on Income2 is: 0.3938713741179512
Multilayer perceptron(neural network) score on Income2 is: 0.6894943574500172
Gaussian process regressor score on Income2 is: -0.1953245299854478
SGD score on Income2 is: -3.4736623482284055e+23




In [25]:
report_all_regressors("Credit", credit, "Balance")

Decision tree regressor score on Credit is: 0.9316660341781899
Random forest regressor score on Credit is: 0.9609537412901953
SVR score on Credit is: -0.004675177596823676
K Nearest Neighbors score on Credit is: 0.8329360066793079
Multilayer perceptron(neural network) score on Credit is: 0.8203394381233609
Gaussian process regressor score on Credit is: 0.307147675677481
SGD score on Credit is: -4.594176735614397e+25




In [None]:
report_all_regressors("fortune500", fortune500, "Profit") # too many parameters - hangs

Decision tree regressor score on fortune500 is: 0.6436235709301354
Random forest regressor score on fortune500 is: 0.8296673384841631


In [26]:
report_all_classifiers("Heart", heart, "AHD") 

Decision tree classifier accuracy on Heart is: 71.11111111111111
Random forest accuracy on Heart is: 75.55555555555556
SVC accuracy on Heart is: 57.77777777777777
K Nearest Neighbors accuracy on Heart is: 64.44444444444444
Gaussian Naive Bayes accuracy on Heart is: 81.11111111111111
Multilayer perceptron(neural network) accuracy on Heart is: 80.0
Gaussian process classifier accuracy on Heart is: 52.22222222222223
SGD accuracy on Heart is: 67.77777777777779


In [27]:
report_all_regressors("College", college, "Gradrate")

Decision tree regressor score on College is: 0.49112017371566874
Random forest regressor score on College is: 0.7607726057807915
SVR score on College is: 0.06451694002993469
K Nearest Neighbors score on College is: 0.4521040633675055
Multilayer perceptron(neural network) score on College is: -5.0112176093562155
Gaussian process regressor score on College is: -3.5661313827061343
SGD score on College is: -1.799410748079277e+32




In [28]:
report_all_classifiers("SAheart1", SAheart1, "chd")

Decision tree classifier accuracy on SAheart1 is: 65.46762589928058
Random forest accuracy on SAheart1 is: 68.34532374100719
SVC accuracy on SAheart1 is: 62.589928057553955
K Nearest Neighbors accuracy on SAheart1 is: 61.15107913669065
Gaussian Naive Bayes accuracy on SAheart1 is: 74.10071942446042
Multilayer perceptron(neural network) accuracy on SAheart1 is: 72.66187050359713
Gaussian process classifier accuracy on SAheart1 is: 57.55395683453237
SGD accuracy on SAheart1 is: 71.94244604316546




In [29]:
report_all_classifiers("SAheart2", SAheart2, "chd")

Decision tree classifier accuracy on SAheart2 is: 65.46762589928058
Random forest accuracy on SAheart2 is: 66.18705035971223
SVC accuracy on SAheart2 is: 62.589928057553955
K Nearest Neighbors accuracy on SAheart2 is: 61.15107913669065
Gaussian Naive Bayes accuracy on SAheart2 is: 74.10071942446042
Multilayer perceptron(neural network) accuracy on SAheart2 is: 71.94244604316546
Gaussian process classifier accuracy on SAheart2 is: 57.55395683453237
SGD accuracy on SAheart2 is: 62.589928057553955
