In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import statsmodels.api as sm
from statsmodels.api import OLS

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

from sklearn.model_selection import RandomizedSearchCV as RSCV

In [2]:
ksDf = pd.read_csv('cleaned_kickstarter_dataset.csv')
ksDf.head()

Unnamed: 0.1,Unnamed: 0,state,backers,usd_pledged_real,usd_goal_real,duration,Art,Comics,Crafts,Dance,...,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater,encoded_state
0,0,failed,0.0,0.0,1533.95,59,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,1,failed,15.0,2421.0,30000.0,60,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,2,failed,3.0,220.0,45000.0,45,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,3,failed,1.0,1.0,5000.0,30,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,4,canceled,14.0,1283.0,19500.0,56,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [3]:
def findClassificationScore(model, paramXTrain, paramYTrain, paramXTest, paramYTest):
    model.fit(paramXTrain, paramYTrain)
    yPred = model.predict(paramXTest)
    print("Model: ", model)
    print("Best Parameters: ", model.best_params_)
    print("Balanced Accuracy Score: ", balanced_accuracy_score(paramYTest, yPred))
    print("Precision Score: ", precision_score(paramYTest, yPred, average='macro'))
    print("Recall: ", recall_score(paramYTest, yPred, average = 'macro'))
    print("F1 Score: ", f1_score(paramYTest, yPred, average = 'macro'))
#     print("ROC AUC Score: ", roc_auc_score(paramYTest, yPred))
    print("First 10 Prediction: ", yPred[:10])
    print("First 10 Actual: ", paramYTest[:10])
    print("")

In [4]:
def findRegressorScore(model, paramXTrain, paramYTrain, paramXTest, paramYTest):
    model.fit(paramXTrain, paramYTrain)
    yPred = model.predict(paramXTest)
    print("Model: ", model)
    print("Best Parameters: ", model.best_params_)
    print("Model Score: ", model.score(paramXTest, paramYTest))
    print("Mean Absolute Error: ", mean_absolute_error(paramYTest, yPred))
    print("Mean Squared Log Error: ", mean_squared_log_error(paramYTest, yPred))
    print("R2 Score: ", r2_score(paramYTest, yPred, multiclass = "ovr"))
    print("First 10 Prediction: ", yPred[:10])
    print("First 10 Actual: ", paramYTest[:10])
    print("")

In [5]:
cpDf = ksDf.copy()
x = np.array(cpDf.drop(['state', 'encoded_state'], axis = 1))
y = np.array(ksDf['encoded_state'])

In [6]:
xtr, xts, ytr, yts = train_test_split(x, y, test_size = .1)

### Params for Linear Regression

In [7]:
copy_X = ["True", "False"]
fit_intercept = ["True", "False"]
n_jobs = ["int", "None"]
normalize = ["True", "False"]

paramLire = {
    "copy_X": copy_X,
    "fit_intercept": fit_intercept,
    "n_jobs" : n_jobs,
    "normalize" : normalize
}

lire = LinearRegression()
modelLire = RSCV(estimator=lire, param_distributions=paramLire, n_iter=100,
                               cv=5, verbose=2, random_state=41, n_jobs=-1)

### Params for Gradient Boosting Regressor

In [8]:
alpha = np.linspace(0.1,1,10)
learning_rate = np.arange(0,0.5,0.1)
loss = ['ls', 'lad', 'huber', 'quantile']
max_depth = [3, 4, 5, 6, 8, 10, 12, 15]
max_features = ['auto', 'sqrt', 'log2']
max_leaf_nodes = [10,20,30,40,50,60,70,None]
min_samples_leaf = [1,2,3,4,5,6,7,8]
min_samples_split = [2,4,6,8,10,12]
n_estimators = np.arange(100,201,10)
subsample = np.linspace(0.5,1,6)
validation_fraction = np.linspace(0,1,10)
warm_start = [True, False]


paramGBR = {
    "alpha": alpha,
    "learning_rate": learning_rate,
    "loss" : loss,
    "max_depth" : max_depth,
    "max_features" : max_features,
    "max_leaf_nodes" : max_leaf_nodes,
    "min_samples_leaf" : min_samples_leaf,
    "min_samples_split" : min_samples_split,
    "n_estimators" : n_estimators,
    "subsample" : subsample,
    "validation_fraction" : validation_fraction,
    "warm_start" : warm_start
}

GBR = GradientBoostingRegressor()
modelGBR = RSCV(estimator=GBR, param_distributions=paramGBR, n_iter=100,
                               cv=5, verbose=2, random_state=41, n_jobs=-1)

In [9]:
n_estimators = list(range(10, 100, 5))
max_features = ['auto', 'sqrt']
max_depth = list(range(10, 200, 10))
max_depth.append(None)
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [1, 2, 4, 6, 8, 10]
bootstrap = [True, False]

paramRFR = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'bootstrap': bootstrap
}

RFR = RandomForestRegressor()
modelRFR = RSCV(estimator=RFR, param_distributions=paramRFR, n_iter=100,
                               cv=5, verbose=2, random_state=41, n_jobs=-1)

In [10]:
penalty = ["l1", "l2", "elasticnet", "none"]
solver = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
max_iter = [1, 10, 100, 1000, 10000]

paramLore = {
    "penalty": penalty,
    "solver": solver,
    "max_iter" : max_iter
}

lore = LogisticRegression(multi_class = 'ovr')
modelLore = RSCV(estimator= lore, 
                             param_distributions= paramLore,
                             cv= 5)

In [11]:
k = round(xtr.shape[0] ** .5)
if k % 2 == 0:
    k+=1
    
print(k)
    
n_neighbors = list(range(k-5, k+5))
algo = ["ball_tree", "kd_tree", "brute", "auto"]

paramKNN = {
    "n_neighbors": n_neighbors,
    "algorithm": algo
}

knn = KNN()
modelKNN = RSCV(estimator= knn, 
                             param_distributions= paramKNN, 
                             cv= 5)

577


In [12]:
n_estimators = list(range(10, 100, 5))
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

paramRFC = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

RFC = RandomForestClassifier()
modelRFC = RSCV(estimator= RFC, 
                             param_distributions= paramRFC, 
                             cv= 5)

In [13]:
classificationModel = [modelLore, modelKNN, modelRFC]
regressorModel = [modelRFR, modelGBR, modelLire]

In [14]:
# # for i in classificationModel:
#     findClassificationScore(i, xtr, ytr, xts, yts)

In [15]:
modelLore.get_params

<bound method BaseEstimator.get_params of RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='ovr', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'max_iter': [1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'none'],
                      

In [None]:
findClassificationScore(modelRFC, xtr, ytr, xts, yts)

In [None]:
modelKNN.get_params().keys()