# Modelling Notebook

### Import the needed modules

In [18]:
# Data handling: pandas for tabular data, numpy for numeric arrays
import pandas as pd
import numpy as np
# Silence every warning for the whole kernel session.
# NOTE(review): this blanket filter also hides deprecation / metric warnings
# emitted by libraries in later cells — consider narrowing it to a category.
import warnings
warnings.filterwarnings('ignore')
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Project-local helpers from the scripts.data_cleaning module
from scripts.data_cleaning import (
    read_all_csvs, 
    clean_data,
    create_csv
    )

### Load and clean the data set

In [19]:
# Build the combined frame with the project's CSV reader and hand it
# straight to the cleaning step; `df` ends up holding the cleaned result.
df = clean_data(read_all_csvs())

---
## Preprocessing

### Import the needed modules

In [43]:

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    fbeta_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Seed handed to the stochastic steps below (e.g. the train/test split)
RSEED = 42

In [44]:
# Peek at the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,goal,state,days_launched_till_changed,days_prelaunch,days_total,project_name_len,creator_name_len,blurb_len,country_AT,country_AU,...,category_sub_wearables,category_sub_weaving,category_sub_web,category_sub_webcomics,category_sub_webseries,category_sub_woodworking,category_sub_workshops,category_sub_world music,category_sub_young adult,category_sub_zines
0,200.0,successful,45,4,49,4,6,26,0,0,...,0,0,0,0,0,0,0,0,0,0
1,400.0,successful,20,5,25,5,9,9,0,0,...,0,0,0,0,0,0,0,0,0,0
2,27224.0,successful,30,9,39,9,13,25,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40000.0,successful,42,3,45,5,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000.0,failed,30,2,32,4,11,22,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# Every column except the label 'state' is used as a model input
features = list(df.columns)
features.remove('state')
# Split the frame into the feature matrix X and the label vector y
X, y = df[features], df['state']

In [67]:
# Stratified 70/30 hold-out split; RSEED makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=RSEED,
)

In [68]:
# Learn the per-feature min/max from the training rows only, then apply that
# same scaling to both splits so the test set does not influence the scaler.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [72]:
# Re-wrap the scaled arrays as DataFrames so the feature names are kept.
# NOTE(review): no index is passed, so these frames get a fresh default index
# and are aligned with y_train / y_test by position only — confirm that no
# later cell joins them on index labels.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaled, columns=features)
    for scaled in (X_train_scaled, X_test_scaled)
)

---
## DummyClassifier

In [74]:
from sklearn.dummy import DummyClassifier

# Baseline model: DummyClassifier with the "prior" strategy, fitted on the
# scaled training data and used to predict both splits.
dummy_clf = DummyClassifier(strategy="prior").fit(X_train_scaled, y_train)
y_predict_train = dummy_clf.predict(X_train_scaled)
y_predict = dummy_clf.predict(X_test_scaled)

# Macro-averaged F-beta with beta=2 on each split
fbeta_train = fbeta_score(y_train, y_predict_train, beta=2, average='macro')
fbeta_test = fbeta_score(y_test, y_predict, beta=2, average='macro')
print("f_beta_score on train set: ", round(fbeta_train, 2))
print("f_beta_score on test set: ", round(fbeta_test, 2))
print("--------" * 10)

# Plain accuracy on each split
accuracy_train = accuracy_score(y_train, y_predict_train)
accuracy_test = accuracy_score(y_test, y_predict)
print("Accuracy on train set:", round(accuracy_train, 2))
print("Accuracy on test set:", round(accuracy_test, 2))
print("--------" * 10)

# Per-class precision / recall / F1 on the held-out data
print(classification_report(y_test, y_predict))
print("--------" * 10)

# Confusion matrix of the test predictions; plotted in the next cell
cm = confusion_matrix(y_test, y_predict)

f_beta_score on train set:  0.43
f_beta_score on test set:  0.43
--------------------------------------------------------------------------------
Accuracy on train set: 0.53
Accuracy on test set: 0.53
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

      failed       0.00      0.00      0.00     25014
  successful       0.53      1.00      0.69     28378

    accuracy                           0.53     53392
   macro avg       0.27      0.50      0.35     53392
weighted avg       0.28      0.53      0.37     53392

--------------------------------------------------------------------------------


In [None]:
# Plot the baseline confusion matrix computed in the previous cell (`cm`).
# The display labels must follow the alphabetical order in which
# confusion_matrix sorts the classes: 'failed' first, then 'successful'.
cmd_obj = ConfusionMatrixDisplay(cm, display_labels=['failed', 'successful'])

# Create the figure explicitly so its size can be controlled; the display
# object draws onto the Axes passed to plot().
fig, ax = plt.subplots(figsize=(10, 10))
cmd_obj.plot(ax=ax, cmap='Blues')

# After plot(), `ax_` exposes the underlying Axes, which controls the title
# and the x/y axis labels.
cmd_obj.ax_.set(
    title='Baseline model prediction of kickstarter projects',
    xlabel='Predicted State',
    ylabel='Actual States',
)

# Render the figure
plt.show()

---
## AdaBoost Model


In [None]:
# AdaBoost with decision-tree weak learners, tuned by grid search and compared
# against the same model with default hyper-parameters.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# One metric definition for tuning AND reporting, identical to the baseline
# cell (macro-averaged F-beta, beta=2). The labels are the strings
# 'failed'/'successful', so the default average='binary' with pos_label=1
# cannot be used: scikit-learn rejects it because 1 is not one of the labels.
fbeta_kwargs = {'beta': 2, 'average': 'macro'}

# Initialize the classifier (seeded with the notebook-wide RSEED).
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases; adjust here and in the grid keys when upgrading.
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), random_state=RSEED)

# Hyper-parameter grid; the `base_estimator__` prefix routes a parameter to
# the wrapped decision tree.
parameters = {
    'n_estimators': [50, 120],
    'learning_rate': [0.1, 0.5, 1.],
    'base_estimator__min_samples_split': np.arange(2, 8, 2),
    'base_estimator__max_depth': np.arange(1, 4, 1),
}

# Scoring object used by the grid search
scorer = make_scorer(fbeta_score, **fbeta_kwargs)

# `scoring` is passed by keyword: it is keyword-only in current scikit-learn.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, verbose=1, n_jobs=-1)

# Fit the grid search on the training data. The unscaled features are used
# here, exactly as in the original cell.
grid_fit = grid_obj.fit(X_train, y_train)

# Best model found by the search (refitted on the full training set)
best_clf = grid_fit.best_estimator_

# Predictions of the untuned and the tuned model on the held-out data.
# GridSearchCV works on clones, so `clf` itself is still unfitted here.
simple_predictions = clf.fit(X_train, y_train).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, simple_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, simple_predictions, **fbeta_kwargs)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, **fbeta_kwargs)))
print(best_clf)