## Import Libraries

In [4]:
import datetime
import re
import string
from operator import itemgetter

import numpy as np
import pandas as pd
import scipy.stats as spstats
from nltk import word_tokenize
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import KBinsDiscretizer
from xgboost import XGBClassifier
# NOTE(review): create_log_transform() and create_boxcox() reference `plt`, which no cell
# in this notebook imports -- presumably matplotlib.pyplot; confirm and import it here.

## Convert to date function

In [2]:
def convert_to_date(series):
    """
    Parse 'YYYY-MM-DD HH:MM:SS' values into pandas datetimes.

    Entries that cannot be parsed with that format are coerced to NaT
    instead of raising.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = pd.to_datetime(series, errors='coerce', format=timestamp_format)
    return parsed

## Preprocess Text

In [3]:
def preprocess_text(dataset, column):
    """
    Add a cleaned, lowercased copy of a text column to `dataset`.

    The cleaned text is written to a new column named ``column + '_CL'``:
    every value is converted to str and lowercased, each punctuation
    character listed in the pattern below (plus newline and carriage return)
    is replaced by a single space, and missing values (NaN/None) become ' '.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    column : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        The first 10 rows of the new column (a preview only).
    """
    new_col = column + '_CL'
    source = dataset[column]
    # Record missing entries up front: str() turns NaN/None into the literal
    # text 'nan'/'none', after which they can no longer be detected as missing.
    missing = source.isna()
    cleaned = source.map(lambda value: str(value).lower())
    # Single pass: punctuation, '\n' and '\r' are each replaced by one space.
    cleaned = cleaned.str.replace(r"[-()\"#/@;:<>{}`+=~|.!?,_\n\r]", " ", regex=True)
    dataset[new_col] = cleaned.mask(missing, ' ')
    return dataset[new_col].head(10)

## Tokenize Text (enhanced version of nltk's `word_tokenize`; removes punctuation and digits)

In [7]:
# Tokenizer that lowercases the text and blanks out punctuation and digits before splitting.
def tokenizer_better(text):
    """
    Lowercase `text`, replace every punctuation character and digit with a
    space, then split the result into tokens with nltk.word_tokenize.
    """
    unwanted = string.punctuation + '0123456789'
    # Two-argument maketrans: each unwanted character maps to a space.
    blank_out = str.maketrans(unwanted, ' ' * len(unwanted))
    return word_tokenize(text.lower().translate(blank_out))

## Stop words

In [22]:
# Custom stop-word list; entries are whitespace-separated and kept in their
# original order (including the repeated 'w' and the upper-case 'X' / 'I').
my_stop_words = (
    "the and to of was with a on in for name "
    "is patient s he at as or one she his her am "
    "were you pt pm by be had your this date please days day "
    "from there an that p are have has h but o nameis "
    "namepattern which every also should if it been who during ml mg "
    "c q d X i I b dr ct x w l j e r t w 1 n m y"
).split()
# Persist the list with IPython's %store magic (retrievable from another kernel via `%store -r my_stop_words`).
%store my_stop_words

Stored 'my_stop_words' (list)


In [6]:
def create_quantile_binning(dataset, column, bins):
    """
    Add a quantile-binned version of a column to `dataset`.

    The column is discretized with sklearn's KBinsDiscretizer using the
    'quantile' strategy, and the bin index of every row is written to a new
    column named ``column + '_binned'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to discretize.
    bins : int
        Number of bins, passed to KBinsDiscretizer as ``n_bins``.

    Returns
    -------
    None
    """
    col = dataset[column].values.reshape(-1, 1)  # discretizer is fed a 2-D (n_rows, 1) array

    # 'ordinal' yields one bin index per row. The previous 'onehot' encoding
    # produced an (n_rows, n_bins) sparse matrix, which does not fit into a
    # single DataFrame column.
    enc = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='quantile')
    new_col = column + '_binned'
    # fit_transform returns shape (n_rows, 1); flatten it and store integer bin ids.
    dataset[new_col] = enc.fit_transform(col).ravel().astype(int)

In [5]:
def create_log_transform(dataset, column, title):
    """
    Add log(1 + x) of `column` to `dataset` and plot its histogram.

    The transformed values are stored in a new column named
    ``column + '_LOG'`` (the frame is modified in place). A 30-bin histogram
    of that column is drawn on a new figure, with a red vertical line at the
    mean of the transformed values rounded to 2 decimals.

    Relies on `plt` being available in the notebook namespace
    (presumably matplotlib.pyplot -- it is not imported in this notebook).
    """
    logname = column + "_LOG"
    shifted = 1 + dataset[column]
    dataset[logname] = np.log(shifted)  # log-transformed copy of the column

    fig, ax = plt.subplots()
    # histogram of the log-transformed distribution
    dataset[logname].hist(bins=30, grid=False,
                          color='#A9C5D3', edgecolor='black')
    mean_marker = np.round(np.mean(dataset[logname]), 2)
    plt.axvline(mean_marker, color='r')
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_xlabel(column + ' (log scale)', fontsize=12)
    ax.set_title(title, fontsize=12)

In [4]:
def create_boxcox(dataset, column, title):
    """
    Add a Box-Cox transformed version of `column` to `dataset` and plot it.

    The lambda is estimated by scipy.stats.boxcox on the non-NaN values of
    the column and then applied to the full column; the result is stored in
    ``column + '_boxcox_lambda_opt'`` (the frame is modified in place).
    A 30-bin histogram of the new column is drawn on a new figure, with a red
    vertical line at its mean rounded to 2 decimals.

    Relies on `plt` being available in the notebook namespace
    (presumably matplotlib.pyplot -- it is not imported in this notebook).
    """
    raw_values = np.array(dataset[column])
    non_nan_values = raw_values[~np.isnan(raw_values)]
    # Only the fitted lambda is needed; the transformed array from this call is discarded.
    _, opt_lambda = spstats.boxcox(non_nan_values)

    var_boxcox_name = column + '_boxcox_lambda_opt'
    dataset[var_boxcox_name] = spstats.boxcox(dataset[column], lmbda=opt_lambda)

    fig, ax = plt.subplots()
    dataset[var_boxcox_name].hist(bins=30, grid=False,
                                  color='#A9C5D3', edgecolor='black')
    mean_marker = np.round(np.mean(dataset[var_boxcox_name]), 2)
    plt.axvline(mean_marker, color='r')
    ax.set_ylabel('Frequency', fontsize=12)
    ax.set_xlabel(column + ' (Box–Cox transformed)', fontsize=12)
    ax.set_title(title, fontsize=12)

In [1]:
def create_feature_subset(c, X_train, X_valid, y_train):
    """
    Select features with an L1-penalised logistic regression.

    A LogisticRegression (penalty='l1', solver='liblinear') is fitted on the
    training data; features whose coefficient is exactly zero are dropped.

    Parameters
    ----------
    c : float
        Value passed to LogisticRegression as ``C``.
    X_train : pandas.DataFrame
        Training features; its column labels identify the features.
    X_valid : pandas.DataFrame
        Validation features; only columns that were selected on the training
        data are kept.
    y_train : array-like
        Training target passed to ``fit``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_subset, valid_subset)`` restricted to the selected columns,
        in the column order of the respective input frame.
    """
    clf = LogisticRegression(penalty='l1', C=c, solver='liblinear').fit(X_train, y_train)
    # One coefficient per feature, labelled with the training column names.
    coef = pd.Series(np.squeeze(clf.coef_), index=X_train.columns)
    kept_mask = coef != 0
    print("LogisticRegression-L1 picked " + str(int(kept_mask.sum())) + " variables and eliminated the other "
          + str(int((coef == 0).sum())) + " variables")
    print(coef.sort_values(ascending=False).head(5))
    # Labels of the features with a non-zero coefficient. A set gives O(1)
    # membership tests and works for any hashable label, not only strings.
    selected = set(coef[kept_mask].index)
    train_subset = X_train[[name for name in X_train.columns if name in selected]]
    valid_subset = X_valid[[name for name in X_valid.columns if name in selected]]
    return train_subset, valid_subset

## Create training and test sets

In [14]:
def create_splits(features, target, test_size, train_size):
    """
    Split features and target with sklearn's train_test_split.

    `test_size` and `train_size` are forwarded unchanged and the random
    state is fixed to 0. Returns the tuple
    ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = dict(test_size=test_size, train_size=train_size, random_state=0)
    parts = train_test_split(features, target, **split_options)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

## Estimator Selection Helper

In [13]:
class EstimatorSelectionHelper:
    """
    Run a GridSearchCV for several estimators and summarise their CV scores.

    Attributes
    ----------
    models : dict
        Maps a name to the estimator to tune.
    params : dict
        Maps the same names to the parameter grid handed to GridSearchCV.
    keys : dict keys view
        The names in `models`.
    grid_searches : dict
        Maps each name to its fitted GridSearchCV; filled by `fit`.
    """

    def __init__(self, models, params):
        # Every key of `models` needs an entry in `params`; fit() verifies this.
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring='roc_auc', refit=False):
        """
        Fit one GridSearchCV per model and store it in `self.grid_searches`.

        `cv`, `n_jobs`, `verbose`, `scoring` and `refit` are forwarded to
        GridSearchCV (train scores are always recorded).

        Raises
        ------
        KeyError
            If a model has no entry in `self.params`. This is checked before
            any search starts, so no search time is spent before the failure.
        """
        missing_params = [key for key in self.keys if key not in self.params]
        if missing_params:
            raise KeyError("Some estimators are missing parameters: %s" % missing_params)

        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """
        Build a table of per-split test-score statistics.

        Returns a DataFrame with one row per (estimator, parameter
        combination), sorted in descending order of `sort_by`. The columns
        are 'estimator', 'min_score', 'mean_score', 'max_score', 'std_score',
        followed by one column per parameter name.

        Raises
        ------
        ValueError
            If no grid search has been stored yet (call `fit` first).
        KeyError
            If a stored search has no 'split0_test_score' entry in
            `cv_results_`.
        """
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            print(k)
            results = self.grid_searches[k].cv_results_
            params = results['params']
            # Count the splits from the recorded result keys instead of
            # range(gs.cv): `cv` is not necessarily an int (it may be None
            # or a splitter object), in which case range() would fail.
            scores = []
            split = 0
            while "split{}_test_score".format(split) in results:
                r = results["split{}_test_score".format(split)]
                scores.append(r.reshape(len(params), 1))
                split += 1
            if not scores:
                raise KeyError("split0_test_score")

            # Shape (n_candidates, n_splits): one row of split scores per parameter set.
            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        if not rows:
            raise ValueError("No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

## F-beta Score & threshold tuples

In [17]:
def create_fbeta_score(y_true, y_valid_preds, thresholds, beta):
    """
    Score every threshold with F-beta and pick the best one.

    For each threshold the predictions are binarised with
    ``y_valid_preds >= threshold`` and scored against `y_true` using
    fbeta_score (pos_label=1). Returns the tuple
    ``(beta, best_threshold, best_fscore, fscores_thres, adj_y_valid_class, fscores)``
    where `fscores_thres` is the list of (threshold, fscore) pairs, `fscores`
    the list of scores alone, and `adj_y_valid_class` the predictions
    binarised at the best threshold.
    """
    fscores_thres = [
        (cutoff,
         fbeta_score(y_true, (y_valid_preds >= cutoff).astype(int), pos_label=1, beta=beta))
        for cutoff in thresholds
    ]
    fscores = [score for _, score in fscores_thres]

    # max() keeps the first pair among equal scores.
    best_threshold, best_fscore = max(fscores_thres, key=itemgetter(1))
    adj_y_valid_class = (y_valid_preds >= best_threshold).astype(int)
    return beta, best_threshold, best_fscore, fscores_thres, adj_y_valid_class, fscores

In [2]:
# Completion marker printed when this cell runs -- presumably a signal for a notebook that
# executes this one (e.g. via %run); TODO confirm. Message text is kept verbatim.
print('sucessfully ran function nobtebook!')

sucessfully ran function nobtebook!
