In [None]:
# Import the organization modules
import pandas as pd
import numpy as np
# Import module to ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Import the plot modules
import matplotlib.pyplot as plt
import seaborn as sns
# Import own scripts
from scripts.data_cleaning import (
    read_all_csvs, 
    clean_data,
    create_csv
    )

In [None]:
# Read every single CSV file into one data frame and clean it in a single step
df = clean_data(read_all_csvs())

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, f1_score, fbeta_score


# Fixed random seed; passed as random_state to train_test_split so the split is reproducible
RSEED = 42

In [None]:
# Every column except the target column 'state' is used as a model feature
features = list(df.columns)
features.remove('state')
# Feature matrix and target vector
X, y = df[features], df['state']

In [None]:
# Hold out 30% of the rows as test set; shuffle and stratify on the target,
# seeded with RSEED
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RSEED,
    shuffle=True,
    stratify=y,
)

In [None]:
# Use DummyClassifier as a baseline model. Per the scikit-learn docs, with
# strategy="prior" predict() always returns the most frequent class of the
# training labels, so it gives a floor any real model has to beat.
dummy_clf = DummyClassifier(strategy="prior")
dummy_clf.fit(X_train, y_train)
y_predict_train = dummy_clf.predict(X_train)
y_predict = dummy_clf.predict(X_test)


# Print the macro-averaged F-beta score (beta = 2) of our model.
# The train predictions have to be scored against y_train: comparing them with
# y_test mixes two sets of different length and makes fbeta_score raise.
# The built-in round() is used (as for accuracy below) because it works for
# both NumPy and plain Python floats.
print("f_beta_score on train set: ", round(fbeta_score(y_train, y_predict_train, beta = 2, average = 'macro'), 2))
print("f_beta_score on test set: ", round(fbeta_score(y_test, y_predict, beta = 2, average = 'macro'), 2))
print("--------"*10)

# Print accuracy of our model
print("Accuracy on train set:", round(accuracy_score(y_train, y_predict_train), 2))
print("Accuracy on test set:", round(accuracy_score(y_test, y_predict), 2))
print("--------"*10)

# Print classification report of our model (test set only)
print(classification_report(y_test, y_predict))
print("--------"*10)

# Evaluate the model with a confusion matrix (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_predict)

In [None]:
## Create the Confusion Matrix Display Object(cmd_obj). Note the
## alphabetical sorting order of the labels: the hard-coded names must be
## listed in the same order as the rows/columns of cm.
## NOTE(review): assumes the target has exactly these two classes -- confirm
## against the output of clean_data.
cmd_obj = ConfusionMatrixDisplay(cm, display_labels=['failed', 'successful'])
## The plot() function has to be called for the sklearn visualization
## code to do its work and the Axes object to be created.
fig, ax = plt.subplots(figsize=(10,10))

## Draw the matrix onto our own Axes and set the color map
cmd_obj.plot(ax=ax, cmap = 'Blues')
## Use the Axes attribute 'ax_' to get to the underlying Axes object.
## The Axes object controls the labels for the X and the Y axes. It
## also controls the title.
cmd_obj.ax_.set(
                title='Baseline model prediction of kickstarter projects', 
                xlabel='Predicted State', 
                ylabel='Actual States',
                )
## Finally, call the matplotlib show() function to display the visualization
## of the Confusion Matrix.
plt.show()

In [None]:
# models = {
#     'KNN': KNeighborsClassifier(), 
#     'DecisionTree': DecisionTreeClassifier(random_state=RSEED),
#     'Logistic': LogisticRegression(random_state=RSEED),
#     'SVM': SVC(random_state=RSEED),
#     'Bayes': GaussianNB()
#     }

# predictions_train = []
# fbetas = []

# for key in models.keys():
#     model_name = key
#     clf = models[key].fit(X_train, y_train)
#     y_pred_train = clf.predict(X_train)
#     predictions_train.append(y_pred_train)
#     fbeta = fbeta_score(y_train, y_pred_train, beta = 2, average = 'macro').round(2)
#     fbetas.append(fbeta)