# Strip PDFs and export text to csv file
- I wrote this function to automate the process of copying and pasting pdf files for a business

In [1]:
working_directory = ('whatever your working directory needs to be ')
def pdf_text_to_csv(rootdir):
    '''Extract the first page's text from every PDF under ``rootdir`` and export it to CSV.

    The process first changes into the module-level ``working_directory``, so a
    relative ``rootdir`` is resolved against it and 'text_df.csv' is written there.
    Every directory below ``rootdir`` is walked; files whose name does not end in
    '.pdf' (case-insensitive) are skipped.

    arguments: rootdir = filepath to the folder you want the function to iterate through
    returns: DataFrame with a single 'text' column, one row per PDF in walk order
             (empty when no PDF is found)
    '''
    #TODO: update with method for extracting specific page numbers
    import os

    import fitz
    import pandas as pd

    os.chdir(working_directory)

    text_list = []

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            # os.walk yields bare file names; join with the directory being walked
            doc = fitz.open(os.path.join(subdir, file))
            try:
                if len(doc) == 0:
                    text = ''
                else:
                    page = doc[0]
                    # PyMuPDF renamed getText -> get_text; support both spellings
                    extract = getattr(page, 'get_text', None) or page.getText
                    text = extract("text")
            finally:
                # always release the document, even if extraction fails
                doc.close()
            text_list.append(text)

    # Build and write the frame once, after all files were read
    df = pd.DataFrame(text_list, columns = ['text'])
    df.to_csv('text_df.csv')                         #change name of csv file if you wish
    return df

# EDA/modeling Functions

In [None]:
#example usage: plotting categorical vs numeric data using multiple boxplots

# Import pandas and pyplot (plt.show() below needs pyplot in scope)
import pandas as pd
import matplotlib.pyplot as plt

# Read 'gapminder.csv' into a DataFrame: df
df = pd.read_csv('gapminder.csv')

# Create a boxplot of life expectancy per region
df.boxplot('life', 'Region', rot=60)

# Show the plot
plt.show()

In [None]:
#https://towardsdatascience.com/verifying-the-assumptions-of-linear-regression-in-python-and-r-f4cd2907d4c0
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import seaborn as sns 
import matplotlib.pyplot as plt
import statsmodels.stats.api as sms
sns.set_style('darkgrid')
sns.mpl.rcParams['figure.figsize'] = (15.0, 9.0)

def homoscedasticity_test(model):
    '''
    Check a fitted linear regression for constant residual variance.

    Draws two lowess-smoothed scatter plots side by side (residuals vs. fitted
    values, and sqrt(|standardized residuals|) vs. fitted values), then prints
    the Breusch-Pagan and Goldfeld-Quandt test results as small tables.

    Args:
    * model - fitted OLS model from statsmodels

    Returns nothing; output is the figure and the printed tables.
    '''
    predictions = model.predict()
    residuals = model.resid
    studentized = model.get_influence().resid_studentized_internal

    figure, axes = plt.subplots(1,2)

    # (axis, y values, panel title, y label) for each of the two panels
    panels = (
        (axes[0], residuals, 'Residuals vs Fitted', 'Residuals'),
        (axes[1], np.sqrt(np.abs(studentized)), 'Scale-Location', 'sqrt(abs(Residuals))'),
    )
    for axis, y_values, panel_title, y_label in panels:
        sns.regplot(x=predictions, y=y_values, lowess=True, ax=axis, line_kws={'color': 'red'})
        axis.set_title(panel_title, fontsize=16)
        axis.set(xlabel='Fitted Values', ylabel=y_label)

    design_matrix = model.model.exog

    bp_test = pd.DataFrame(sms.het_breuschpagan(residuals, design_matrix),
                           columns=['value'],
                           index=['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'])

    # the last element of the Goldfeld-Quandt result is dropped; only F and p are tabulated
    gq_test = pd.DataFrame(sms.het_goldfeldquandt(residuals, design_matrix)[:-1],
                           columns=['value'],
                           index=['F statistic', 'p-value'])

    for heading, table in (('\n Breusch-Pagan test ----', bp_test),
                           ('\n Goldfeld-Quandt test ----', gq_test)):
        print(heading)
        print(table)
    print('\n Residuals plots ----')


In [None]:
# pickle a model
import pickle

# save the model to disk (the with-block guarantees the file is flushed and closed)
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(m, model_file)

#load model back in workspace
# NOTE: unpickling can execute arbitrary code -- only load files you created or trust
with open('model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
#getting replicates from multiple permutations
def draw_perm_reps(data_1, data_2, func, size=1):
    """Compute `size` permutation replicates of a two-sample statistic.

    Each replicate reshuffles the pooled data with `permutation_sample` and
    evaluates `func(sample_1, sample_2)` on the result.

    Returns a float array of length `size`.
    """
    replicates = np.empty(size)

    for rep_idx in range(size):
        # shuffle the pooled data, then evaluate the statistic on the two halves
        replicates[rep_idx] = func(*permutation_sample(data_1, data_2))

    return replicates

In [None]:
#permutation scrambling sampling, scramble two arrays to resample for hypothesis testing
def permutation_sample(data1, data2):
    """Shuffle two data sets together and split them back to their original sizes.

    Returns a pair of arrays: the first has len(data1) elements, the second
    holds the remainder of the shuffled pool.
    """
    # Pool both samples and scramble the pooled values
    pooled = np.random.permutation(np.concatenate((data1, data2)))

    # Cut the shuffled pool at the size of the first sample
    split_at = len(data1)
    return pooled[:split_at], pooled[split_at:]

In [None]:
#draws many samples of the slope and intercept from a linear regression with np.polyfit so that we can plot all OLS lines on a graph
def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression.

    For each replicate, (x, y) pairs are resampled with replacement and a
    straight line is fitted with np.polyfit.

    x, y : 1-D sequences, arrays or pandas Series of paired observations
    size : number of bootstrap replicates

    Returns (bs_slope_reps, bs_intercept_reps), two float arrays of length `size`.
    """
    # Work on plain arrays so resampling is positional; indexing a pandas
    # Series with an integer array is label-based and fails (or mis-pairs)
    # when the index is not 0..n-1.
    x_values = np.asarray(x)
    y_values = np.asarray(y)

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x_values))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x_values[bs_inds], y_values[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps

#---------------example usage
# Generate replicates of slope and intercept using pairs bootstrap
bs_slope_reps, bs_intercept_reps = draw_bs_pairs_linreg(illiteracy, fertility, size=1000)

# Compute and print 95% CI for slope
print(np.percentile(bs_slope_reps, [2.5, 97.5]))

# Plot the histogram
# (`normed` was removed from matplotlib's hist; `density=True` is the replacement)
_ = plt.hist(bs_slope_reps, bins=50, density=True)
_ = plt.xlabel('slope')
_ = plt.ylabel('PDF')
plt.show()

#----------------------plotting bootstrapped regression lines
# Generate array of x-values for bootstrap lines: x
x = np.array([0,100])

# Plot the bootstrap lines (first 100 replicates only)
for i in range(100):
    _ = plt.plot(x, bs_slope_reps[i]*x + bs_intercept_reps[i],
                 linewidth=0.5, alpha=0.2, color='red')

# Plot the data
_ = plt.plot(illiteracy, fertility, marker='.', linestyle='none')

# Label axes, set the margins, and show the plot
_ = plt.xlabel('illiteracy')
_ = plt.ylabel('fertility')
plt.margins(0.02)
plt.show()

In [5]:
# Python function to print permutations of a given list 
def permutation(lst):
    """Return every ordering of the list `lst` as a list of lists.

    An empty input yields an empty result; a single-element input yields
    `[lst]`. Elements are treated by position, so repeated values produce
    repeated orderings.
    """
    # No elements: no permutations
    if len(lst) == 0:
        return []

    # One element: the list itself is the only permutation
    if len(lst) == 1:
        return [lst]

    orderings = []
    for position, head in enumerate(lst):
        # Everything except the element chosen as the first item
        rest = lst[:position] + lst[position + 1:]

        # Prepend the chosen element to each ordering of the rest
        orderings.extend([head] + tail for tail in permutation(rest))
    return orderings

# Driver program to test above function
data = list('123')
for p in permutation(data):
    print(p)

['1', '2', '3']
['1', '3', '2']
['2', '1', '3']
['2', '3', '1']
['3', '1', '2']
['3', '2', '1']


In [None]:
def ecdf(data):
    """Return the (x, y) points of the empirical CDF of 1-D `data`.

    x is the sorted data; y runs from 1/n to 1 in steps of 1/n.
    """
    count = len(data)

    # Sorted observations form the horizontal axis
    sorted_values = np.sort(data)

    # Fraction of observations at or below each sorted value
    cumulative = np.arange(1, count + 1) / count

    return sorted_values, cumulative

In [None]:
#draw bootstrap replicates from a sample and determine bootstrap confidence intervals

def bootstrap_replicate_1d(data, func):
    """Resample `data` with replacement (same length) and return `func` of the resample."""
    resampled = np.random.choice(data, len(data))
    return func(resampled)

def draw_bs_reps(data, func, size=1):
    """Return an array of `size` bootstrap replicates of `func` applied to `data`."""
    replicates = np.empty(size)

    # One independent resample per slot
    for slot in range(size):
        replicates[slot] = bootstrap_replicate_1d(data, func)

    return replicates

#example usage: bootstrap the mean of the 'charges' column with 10000 replicates
# NOTE(review): `med_charges` is not defined in this cell; presumably a DataFrame loaded elsewhere -- confirm
bootstrapped_means = draw_bs_reps(med_charges['charges'], np.mean, size = 10000)

# 2.5th and 97.5th percentiles of the replicates, i.e. a 95% percentile interval
np.percentile(bootstrapped_means, [2.5, 97.5])

In [None]:
#calculate 95% confidence interval for the mean using t-distribution
import statsmodels.stats.api as sms

# NOTE(review): `data` must be a numeric sample defined elsewhere -- confirm before running
sms.DescrStatsW(data).tconfint_mean()

In [141]:
#this function moves a column to whatever position you want,
# can be useful https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
def change_column_order(df, col_name, index):
    """Return `df` with column `col_name` moved to position `index`.

    `index` follows list.insert semantics and is applied after the column has
    been removed from its current position. Raises ValueError when `col_name`
    is not a column of `df`.
    """
    ordered = df.columns.tolist()
    # pop the column out of its current slot, then re-insert it at the target
    ordered.insert(index, ordered.pop(ordered.index(col_name)))
    return df[ordered]

In [None]:
#displays tables side-by-side https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
from IPython.display import display_html
def display_side_by_side(*args):
    """Render any number of DataFrames next to each other in a notebook.

    Each argument is converted with `to_html()`; every opening <table> tag is
    given an inline display style so the tables flow horizontally.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only opening tags: replacing the bare word 'table' would also
    # rewrite '</table>' and any cell text that contains the word.
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

In [None]:
# Printing the percentage of missing values per column
def percent_missing(dataframe):
    '''
    Print, for every column that has missing values, the share of rows that
    are missing (two decimals). Prints 'No missing values' when nothing is
    missing. Returns nothing.
    '''
    # Per-column count of nulls, and the same as a fraction of the row count
    missing_counts = dataframe.isnull().values.sum(axis=0)
    missing_fraction = missing_counts / dataframe.shape[0]

    if missing_counts.sum() == 0:
        print('No missing values')
        return

    print('Percent Missing Values:', '\n')
    for column, count, fraction in zip(dataframe.columns, missing_counts, missing_fraction):
        # columns without gaps are not listed
        if count > 0:
            print('{0}: {1:.2f}%'.format(column, fraction * 100))

In [None]:
#https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
def filter_df_corr(inp_data, corr_val):
    '''
    Drop columns that are strongly correlated with a later column.

    For every column pair (i, k) with i before k, if |corr| >= corr_val the
    pair is printed and the earlier column i is marked for removal.

    ----------
    inp_data : np.array, pd.DataFrame
        Values to consider
    corr_val : float
        Value [0, 1] on which to base the correlation cutoff

    Returns the reduced data in the same container type as the input
    (ndarray in -> ndarray out, DataFrame in -> DataFrame out).
    '''
    # Arrays are wrapped in a DataFrame so .corr() is available
    array_flag = isinstance(inp_data, np.ndarray)
    if array_flag:
        inp_data = pd.DataFrame(data=inp_data)
    corr_matrix = inp_data.corr()

    flagged = set()
    labels = corr_matrix.columns
    total = len(labels)

    # Visit the strictly-lower triangle column by column
    for earlier, col in enumerate(labels):
        for later in range(earlier + 1, total):
            val = corr_matrix.iloc[later, earlier]
            if abs(val) >= corr_val:
                # Report the correlated pair and its coefficient
                print(col, "|", corr_matrix.index[later], "|", round(val, 2))
                flagged.add(col)

    #print(f'Highly Correlated Columns: {drop_cols}')
    inp_data = inp_data.drop(columns=flagged)

    # Hand back the same type that came in
    return inp_data.values if array_flag else inp_data

In [6]:
from scipy import stats

def drop_numerical_outliers(df, z_thresh=3):
    """Drop, in place, every row that has an outlier in any numeric column.

    A value is an outlier when its absolute z-score within its column is not
    below `z_thresh`. Non-numeric columns are ignored. Returns None; `df` is
    modified.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        # nothing to score, hence nothing to drop
        return

    # True for rows whose every numeric value is inside the threshold.
    # (The former `reduce=False` argument no longer exists in DataFrame.apply.)
    constrains = numeric \
        .apply(lambda x: np.abs(stats.zscore(x)) < z_thresh) \
        .all(axis=1)
    # Drop (inplace) values set to be rejected
    df.drop(df.index[~constrains], inplace=True)

In [1]:
#mine
def iqr_outlier_removal(df, column):

    '''
    Uses the interquartile range to remove outliers in a specific column
    df = df that you are using
    column = 'column name' that you want outliers removed from

    Rows whose value lies below q1 - 1.5*IQR or above q3 + 1.5*IQR are removed;
    rows with a missing value in `column` are kept. The two fences are printed.
    Returns a filtered DataFrame; the input is not modified.
    '''

    values = df[column]

    #using the lower and upper quantiles to find outliers
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1 #Interquartile range

    fence_low = q1 - (1.5*iqr)
    fence_high = q3 + (1.5*iqr)

    print(f'Lower Quantile Outliers are Below: {fence_low} Injections Per Day')
    print(f'Upper Quantile Outliers are Above: {fence_high} Injections Per Day')

    # Compare the column's values (not the column name) against the fences and
    # filter in one vectorised step.
    is_outlier = (values > fence_high) | (values < fence_low)
    return df[~is_outlier]

In [11]:
#mine
def z_score_outliers(x, threshold):

    '''returns a list of (index label, value) pairs for the outliers in x
    arguments: column of dataframe as x, threshold of standard deviations (typically 3) as z-score threshold
    '''
    import numpy as np
    from scipy import stats

    # flag entries whose absolute z-score exceeds the threshold
    flagged = x[np.abs(stats.zscore(x)) > threshold]
    return list(zip(flagged.index, flagged))

In [None]:
#Another function I found, but I think I like mine more (the last one)
def z_score_indices_of_outliers(X, threshold=3):
    '''
    Detects outliers using Z-score standardization

    Input: - X: A feature in your dataset
           - threshold: The number of standard deviations from the mean
                        to be considered an outlier

    Output: A data frame (single column 0 holding the z-scores) with every
            observation whose |z-score| exceeds `threshold`, on either side of
            the mean. The frame's index is the position of the value in X.
    '''
    X_mean = np.mean(X)
    X_stdev = np.std(X)
    z_scores = [(y - X_mean) / X_stdev for y in X]
    z_df = pd.DataFrame(z_scores)
    # Honour `threshold` and keep both tails: the previous
    # `return pos_outliers; neg_outliers` returned only the positive side.
    return z_df[z_df[0].abs() > threshold]

In [None]:
#flatten nested jsons
# https://towardsdatascience.com/how-to-flatten-deeply-nested-json-objects-in-non-recursive-elegant-python-55f96533103d
def flatten_json(nested_json):
    """
        Collapse a nested structure of dicts and lists into one flat dict.

        Keys of the result are the path to each leaf joined with '_'; list
        positions contribute their integer index to the path.
        Args:
            nested_json: A nested json object.
        Returns:
            A new dict mapping each joined path to its leaf value.
    """
    flat = {}

    def walk(node, prefix=''):
        kind = type(node)
        if kind is dict:
            for key, child in node.items():
                walk(child, prefix + key + '_')
        elif kind is list:
            for position, child in enumerate(node):
                walk(child, prefix + str(position) + '_')
        else:
            # leaf: strip the trailing '_' left by the last path segment
            flat[prefix[:-1]] = node

    walk(nested_json)
    return flat

In [None]:
#flatten DEEPLY nested JSON, same source as above
from itertools import chain, starmap

def flatten_json_iterative_solution(dictionary):
    """Flatten a nested dict/list structure level by level, without recursion.

    Nested keys are joined with '_'; list items use their position as the key
    segment. Returns a new flat dict.
    """

    def unpack(parent_key, parent_value):
        """Yield the (key, value) pairs obtained by opening exactly one nesting level."""
        if isinstance(parent_value, dict):
            for child_key, child_value in parent_value.items():
                yield parent_key + '_' + child_key, child_value
        elif isinstance(parent_value, list):
            for position, child_value in enumerate(parent_value):
                yield parent_key + '_' + str(position), child_value
        else:
            # already atomic: pass through unchanged
            yield parent_key, parent_value

    # Peel one level per pass until no value is a dict or a list
    while True:
        unpacked = {}
        for key, value in dictionary.items():
            for new_key, new_value in unpack(key, value):
                unpacked[new_key] = new_value
        dictionary = unpacked

        if not any(isinstance(value, (dict, list)) for value in dictionary.values()):
            break

    return dictionary

In [None]:
def ecdf(data):
    """Empirical CDF of a one-dimensional sample.

    Returns (x, y): the sorted sample and, for each sorted value, its rank
    (starting at 1) divided by the sample size.
    """
    size = len(data)
    ordered = np.sort(data)
    ranks = np.arange(1, size + 1)
    return ordered, ranks / size

In [None]:
#locate tuple inside of  column: https://stackoverflow.com/questions/29463068/return-rows-in-pandas-dataframe-where-tuple-in-column-contains-a-certain-value
# `"cat" in x` is already a bool, so it is used directly as the row mask
df[df["Col1"].apply(lambda x: "cat" in x)]

# The lambda returns True when "cat" is in the cell. 
# That works for both strings ("cat" in "cat" is True) and tuples ("cat" in ("cat", "dog") is True).
# By subsetting the df, you get all rows where the lambda is True.

## Time Series Functions

In [None]:
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Run an Augmented Dickey-Fuller test on `series` and print a report:
    test statistic, p-value, lags used, observation count, the critical
    values, and a stationarity verdict at the 0.05 level.
    `title` is only used in the printed header. Returns nothing.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    # dropna() copes with the leading NaNs that differencing leaves behind
    adf_result = adfuller(series.dropna(),autolag='AIC')

    report = pd.Series(adf_result[0:4],
                       index=['ADF test statistic','p-value','# lags used','# observations'])
    for level, critical_value in adf_result[4].items():
        report[f'critical value ({level})']=critical_value

    # to_string() drops the trailing "dtype: float64" line
    print(report.to_string())

    if adf_result[1] <= 0.05:
        verdict = ("Strong evidence against the null hypothesis",
                   "Reject the null hypothesis",
                   "Data has no unit root and is stationary")
    else:
        verdict = ("Weak evidence against the null hypothesis",
                   "Fail to reject the null hypothesis",
                   "Data has a unit root and is non-stationary")
    for line in verdict:
        print(line)

# Modeling - Evaluation

In [None]:
#example usage pipeline for regression

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Setup the pipeline steps: steps
# (`Imputer` was removed from scikit-learn; SimpleImputer imputes column-wise,
#  which is what the old axis=0 setting did)
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='mean')),
         ('scaler', StandardScaler()),
         ('elasticnet', ElasticNet())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'elasticnet__l1_ratio':np.linspace(0,1,30)}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
gm_cv.fit(X_train, y_train)

# Compute and print the metrics
r2 = gm_cv.score(X_test, y_test)
# the tuned hyperparameter is l1_ratio, not alpha
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))

In [None]:
#use pipeline and gridsearch to predict

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]

pipeline = Pipeline(steps)

# Specify the hyperparameter space
# (keys are '<step name>__<parameter>' so the grid reaches into the pipeline step)
parameters = {'SVM__C':[1, 10, 100],
              'SVM__gamma':[0.1, 0.01]}

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

# Instantiate the GridSearchCV object: cv
cv = GridSearchCV(pipeline, parameters)

# Fit to the training set
cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = cv.predict(X_test)

# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))

In [None]:
#example usage using a pipeline on data to make predictions


# Import necessary modules
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Setup the pipeline steps: steps
# (`sklearn.preprocessing.Imputer` was removed from scikit-learn; SimpleImputer
#  imputes column-wise, matching the old axis=0 setting)
steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('SVM', SVC())]

# Create the pipeline: pipeline
pipeline = Pipeline(steps)

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 42)

# Fit the pipeline to the train set
pipeline.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = pipeline.predict(X_test)

# Compute metrics
print(classification_report(y_test, y_pred))

In [None]:
#example usage: dummy variable (remember to drop the first column since it is unnecessary -- every time)

# Create dummy variables: df_region
# (first pass keeps every indicator column so the full set can be inspected)
df_region = pd.get_dummies(df)

# Print the columns of df_region
print(df_region.columns)

# Create dummy variables with drop_first=True: df_region
# (this overwrites df_region from the first pass)
df_region = pd.get_dummies(df, drop_first = True)

# Print the new columns of df_region
print(df_region.columns)

In [None]:
# Takes a classifier `clf`, a grid of hyperparameters (such as a complexity parameter or regularization parameter) implemented as a dictionary `parameters`,
# a training set (as a samples x features array) `Xtrain`, and a set of labels `ytrain`. The code takes the training set, splits it into `n_folds` parts, sets
# up `n_folds` folds, and carries out a cross-validation by splitting the training set into a training and validation section for each fold. It prints the
# best value of the parameters, and returns the best classifier to us.

def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    """Grid-search `parameters` for `clf` with `n_folds`-fold cross-validation.

    Prints the best parameter combination and returns the best estimator
    found by the search.
    """
    search = sklearn.model_selection.GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    search.fit(Xtrain, ytrain)
    print("BEST PARAMS", search.best_params_)
    return search.best_estimator_

In [None]:
# takes a dataframe `indf` as input. It takes the columns in the list `featurenames` as the features used to train the classifier. 
# The column `targetname` sets the target. The classification is done by setting those samples for which `targetname` has value `target1val` to the
# value 1, and all others to 0. We split the dataframe into 80% training and 20% testing by default, standardizing the dataset if desired. 
# (Standardizing a data set involves scaling the data so that it has 0 mean and is described in units of its standard deviation. We then train 
#  the model on the training set using cross-validation. Having obtained the best classifier using `cv_optimize`, we retrain on the entire 
#  training set and calculate the training and testing accuracy, which we print. We return the split data and the trained classifier.


from sklearn.model_selection import train_test_split

def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8):
    """Train and evaluate a binary classifier on columns of `indf`.

    Features are the `featurenames` columns (z-scored with the full frame's
    mean and std when `standardize` is true). The target is 1 where
    `indf[targetname] == target1val` and 0 elsewhere. The data is split with
    `train_size`, hyperparameters are tuned with `cv_optimize`, the winning
    estimator is fitted again on the training split, and both accuracies are
    printed.

    Returns (clf, Xtrain, ytrain, Xtest, ytest).
    """
    features = indf[featurenames]
    if standardize:
        features = (features - features.mean())/features.std()
    X = features.values
    # boolean match -> 0/1 integer labels
    y = (indf[targetname].values==target1val)*1

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size)

    clf = cv_optimize(clf, parameters, Xtrain, ytrain).fit(Xtrain, ytrain)

    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    for template, accuracy in (("Accuracy on training data: {:0.2f}", training_accuracy),
                               ("Accuracy on test data:     {:0.2f}", test_accuracy)):
        print(template.format(accuracy))
    return clf, Xtrain, ytrain, Xtest, ytest

In [None]:
#find best accuracy for n_neighbors for KNN

#MUST PERFORM TRAIN TEST SPLIT FIRST
# (X_train, X_test, y_train, y_test are read from the notebook's global scope)

# Setup arrays to store train and test accuracies
# k runs over 1..8; one accuracy slot per k
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(n_neighbors = k)

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    #Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)

    #Compute accuracy on the testing set
    test_accuracy[i] = knn.score(X_test, y_test)

# Generate plot: both accuracy curves against k
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()


In [None]:
#example usage of confusion matrix and classification report
# Import necessary modules
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Create training and test set
# (40% of the rows are held out; X and y come from the notebook's global scope)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .4, random_state = 42)

# Instantiate a k-NN classifier: knn
knn = KNeighborsClassifier(n_neighbors=6)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict the labels of the test data: y_pred
y_pred = knn.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
#example usage gridsearchcv for ElasticNet regression (a linear combination between the l1 and l2 norms)

# Import necessary modules
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

# Create the hyperparameter grid
# (30 evenly spaced l1_ratio values from 0 to 1 inclusive)
l1_space = np.linspace(0, 1, 30)
param_grid = {'l1_ratio': l1_space}

# Instantiate the ElasticNet regressor: elastic_net
elastic_net = ElasticNet()

# Setup the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)

# Fit it to the training data
gm_cv.fit(X_train, y_train)

# Predict on the test set and compute metrics
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))

In [None]:
#use randomized search cv to find hyperparameters (this can often perform tuning in less time and with similar results to grid search cv)


#example usage
# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
# (lists are sampled uniformly; randint(1, 9) draws integers from 1 to 8)
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_distributions=param_dist, cv=5)

# Fit it to the data
# (the whole of X, y is used here; no hold-out split is made in this cell)
tree_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

In [None]:
#logistic regression also has a regularization parameter: C. C controls the inverse of the regularization strength, and this is what is tuned here
#A large C can lead to an overfit model, while a small C can lead to an underfit model.

# Import necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Setup the hyperparameter grid
# (15 log-spaced values of C between 1e-5 and 1e8)
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
logreg = LogisticRegression()

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid = param_grid, cv=5)

# Fit it to the data
# (the search object is fitted here, not `logreg` itself)
logreg_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))

In [None]:
#use cross validation to show auc scores 

# Import necessary modules
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Compute predicted probabilities: y_pred_prob
# (column 1 of predict_proba is kept)
# NOTE(review): `logreg` must already be fitted; the visible cells fit only `logreg_cv` -- confirm
y_pred_prob = logreg.predict_proba(X_test)[:,1]

# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv = 5, scoring = 'roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))

In [None]:
#example usage of plotting ROC curve (note that sklearn now has this functionality built in)

# Import necessary modules
from sklearn.metrics import roc_curve

# Compute predicted probabilities: y_pred_prob
# (column 1 of predict_proba is kept)
y_pred_prob = logreg.predict_proba(X_test)[:,1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
# (the dashed black diagonal from (0, 0) to (1, 1) is drawn first as a reference line)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

In [None]:
# plots the R2 score as well as standard error for each alpha

def display_plot(cv_scores, cv_scores_std, alphas=None, n_folds=10):
    """Plot mean CV score against alpha with a +/- one standard error band.

    cv_scores     : mean cross-validation score per alpha
    cv_scores_std : standard deviation of the fold scores per alpha
    alphas        : alpha values for the x axis; defaults to the global
                    `alpha_space`, which was previously the only option
    n_folds       : number of CV folds behind each score, used to turn the
                    standard deviation into a standard error (default 10)

    The x axis is logarithmic and a dashed line marks the best mean score.
    """
    if alphas is None:
        alphas = alpha_space

    # plain lists are accepted; convert once so the arithmetic below is element-wise
    cv_scores = np.asarray(cv_scores)
    std_error = np.asarray(cv_scores_std) / np.sqrt(n_folds)

    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(alphas, cv_scores)

    ax.fill_between(alphas, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alphas[0], alphas[-1]])
    ax.set_xscale('log')
    plt.show()
    

    
#--------------------------Example usage
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
# (50 log-spaced alphas between 1e-4 and 1)
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 -- confirm the installed version accepts it
ridge = Ridge(normalize = True)

# Compute scores over range of alphas
for alpha in alpha_space:

    # Specify the alpha value to use: ridge.alpha
    # (the same estimator object is reused; only its alpha changes per pass)
    ridge.alpha = alpha

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv = 10)

    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))

    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)

In [None]:
#use Lasso regression for feature importances

#MUST SPECIFY X, y FIRST 

#alpha is a variable value as well, it needs to be chosen

# Import Lasso
from sklearn.linear_model import Lasso

# Instantiate a lasso regressor: lasso
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2 -- confirm the installed version accepts it
lasso = Lasso(alpha = .4, normalize = True)

# Fit the regressor to the data
lasso.fit(X, y)

# Compute and print the coefficients
lasso_coef = lasso.coef_
print(lasso_coef)

# Plot the coefficients
# NOTE(review): `df_columns` is not defined in this cell; presumably the feature column Index matching X -- confirm
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.margins(0.02)
plt.show()

In [None]:
#shows actual values, overlaid with decision boundary in discriminant classifier



%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.model_selection

# First three colours of the active seaborn palette
c0, c1, c2 = (sns.color_palette()[position] for position in range(3))

# Three-colour maps: pale shades for the background mesh, saturated ones for points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Two-colour map (red / blue)
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
# Rebinds the name cm: from here on it is the RdBu colormap,
# not the matplotlib.cm module imported above
cm = plt.cm.RdBu

def points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=True, colorscale=cmap_light, 
                cdiscrete=cmap_bold, alpha=0.1, psize=10, zfunc=False, predicted=False):
    """
    Scatter training and test points on `ax`, optionally on top of a shaded
    mesh of the classifier's output over the plotted region.

    Arguments:
    ax         = matplotlib Axes that receives every artist
    Xtr, Xte   = 2-D arrays of training / test features; column 0 is drawn on
                 the x axis and column 1 on the y axis
    ytr, yte   = training / test labels; 1 is subtracted from them before they
                 are mapped to colours
    clf        = fitted classifier exposing predict (and predict_proba when
                 zfunc is given)
    mesh       = if True, shade the background with the classifier's output
    colorscale = colormap used for the background mesh
    cdiscrete  = colormap used for the scattered points
    alpha      = opacity of both the mesh and the points
    psize      = marker size of the training points (test points use psize + 10)
    zfunc      = optional callable taking the two predict_proba columns and
                 returning the values to shade; when falsy clf.predict is used
    predicted  = if True, colour the points by clf.predict instead of the
                 supplied labels

    Returns: (ax, xx, yy) where xx and yy are the 100 x 100 mesh grids
    """
    # Plot window: the extent of all points padded by half a unit on each side
    X = np.concatenate((Xtr, Xte))
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))

    if mesh:
        # The grid prediction is only needed for the shaded background
        grid = np.c_[xx.ravel(), yy.ravel()]
        if zfunc:
            proba = clf.predict_proba(grid)  # evaluated once, both columns reused
            Z = zfunc(proba[:, 0], proba[:, 1])
        else:
            Z = clf.predict(grid)
        ZZ = Z.reshape(xx.shape)
        # Draw on the requested axes (not on whatever pyplot considers current)
        # and honour the caller's colormap
        ax.pcolormesh(xx, yy, ZZ, cmap=colorscale, alpha=alpha)

    if predicted:
        showtr = clf.predict(Xtr)
        showte = clf.predict(Xte)
    else:
        showtr = ytr
        showte = yte

    # training points
    ax.scatter(Xtr[:, 0], Xtr[:, 1], c=showtr-1, cmap=cdiscrete,
               s=psize, alpha=alpha, edgecolor="k")
    # and testing points
    ax.scatter(Xte[:, 0], Xte[:, 1], c=showte-1, cmap=cdiscrete,
               alpha=alpha, marker="s", s=psize+10)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    return ax,xx,yy

def points_plot_prob(ax, Xtr, Xte, ytr, yte, clf, colorscale=cmap_light, 
                     cdiscrete=cmap_bold, ccolor=cm, psize=10, alpha=0.1):
    """
    Scatter the points coloured by the classifier's predictions and overlay
    labelled contours of the second predict_proba column.

    Arguments:
    ax         = matplotlib Axes that receives every artist
    Xtr, Xte   = 2-D arrays of training / test features (first two columns are plotted)
    ytr, yte   = training / test labels (forwarded to points_plot)
    clf        = fitted classifier exposing predict and predict_proba
    colorscale = forwarded to points_plot
    cdiscrete  = colormap for the scattered points (forwarded to points_plot)
    ccolor     = colormap for the probability contours
    psize      = marker size forwarded to points_plot
    alpha      = point opacity forwarded to points_plot

    Returns: ax
    """
    # Points only (mesh=False), coloured by the predicted class
    ax,xx,yy = points_plot(ax, Xtr, Xte, ytr, yte, clf, mesh=False,
                           colorscale=colorscale, cdiscrete=cdiscrete,
                           psize=psize, alpha=alpha, predicted=True)

    # Second predict_proba column evaluated on the same grid points_plot used
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    # Draw on the requested axes rather than on pyplot's current axes
    ax.contourf(xx, yy, Z, cmap=ccolor, alpha=.2)
    cs2 = ax.contour(xx, yy, Z, cmap=ccolor, alpha=.6)
    ax.clabel(cs2, fmt = '%2.1f', colors = 'k', fontsize=14)
    return ax

In [7]:
#great function from Jeff Macaluso's blog: https://jeffmacaluso.github.io/post/LinearRegressionAssumptions/
def linear_regression_assumptions(features, label, feature_names=None):
    """
    Fits a linear regression and reports on five of its assumptions:
    linearity, normality of the error terms, multicollinearity,
    autocorrelation and homoscedasticity.

    Everything is reported through printed text and matplotlib/seaborn plots;
    the function returns None.

    Arguments:
    features      = 2-D array-like of predictors (must expose .shape)
    label         = 1-D array-like target
    feature_names = optional list of names, one per column of features;
                    defaults to X1, X2, X3, ...
    """
    from sklearn.linear_model import LinearRegression
    
    # Setting feature names to x1, x2, x3, etc. if they are not defined
    if feature_names is None:
        feature_names = ['X'+str(feature+1) for feature in range(features.shape[1])]

    # Positional view of the predictors. The heatmap and VIF code below address
    # columns by position, so a DataFrame input must not be re-indexed by the
    # (possibly different) feature_names.
    feature_matrix = np.asarray(features)
    
    print('Fitting linear regression')
    # Multi-threading if the dataset is a size where doing so is beneficial
    # NOTE(review): this enables n_jobs for the *smaller* datasets, which looks
    # inverted relative to the comment above - confirm the intent.
    if features.shape[0] < 100000:
        model = LinearRegression(n_jobs=-1)
    else:
        model = LinearRegression()
        
    model.fit(features, label)
    
    # Returning linear regression R^2 and coefficients before performing diagnostics
    r2 = model.score(features, label)
    print()
    print('R^2:', r2, '\n')
    print('Coefficients')
    print('-------------------------------------')
    print('Intercept:', model.intercept_)
    
    for feature in range(len(model.coef_)):
        print('{0}: {1}'.format(feature_names[feature], model.coef_[feature]))

    print('\nPerforming linear regression assumption testing')
    
    # Creating predictions and calculating residuals for assumption tests
    predictions = model.predict(features)
    df_results = pd.DataFrame({'Actual': label, 'Predicted': predictions})
    # Signed residual (actual - predicted); taking absolute values first would
    # flip the sign of the residual whenever the values are negative
    df_results['Residuals'] = df_results['Actual'] - df_results['Predicted']

    
    def linear_assumption():
        """
        Linearity: Assumes there is a linear relationship between the predictors and
                   the response variable. If not, either a polynomial term or another
                   algorithm should be used.
        """
        print('\n=======================================================================================')
        print('Assumption 1: Linear Relationship between the Target and the Features')
        
        print('Checking with a scatter plot of actual vs. predicted. Predictions should follow the diagonal line.')
        
        # Plotting the actual vs predicted values
        # (`height` is the current name of lmplot's former `size` argument)
        sns.lmplot(x='Actual', y='Predicted', data=df_results, fit_reg=False, height=7)
        
        # Plotting the diagonal line between the extremes of the plotted values.
        # Two end points are enough and, unlike a unit-step range, never
        # produce an empty line when the values span less than 1.
        plotted = df_results[['Actual', 'Predicted']]
        low, high = plotted.min().min(), plotted.max().max()
        plt.plot([low, high], [low, high],  # X and y points
                 color='darkorange', linestyle='--')
        plt.title('Actual vs. Predicted')
        plt.show()
        print('If non-linearity is apparent, consider adding a polynomial term')
        
        
    def normal_errors_assumption(p_value_thresh=0.05):
        """
        Normality: Assumes that the error terms are normally distributed. If they are not,
        nonlinear transformations of variables may solve this.
               
        This assumption being violated primarily causes issues with the confidence intervals
        """
        from statsmodels.stats.diagnostic import normal_ad
        print('\n=======================================================================================')
        print('Assumption 2: The error terms are normally distributed')
        print()
    
        print('Using the Anderson-Darling test for normal distribution')

        # Performing the test on the residuals
        p_value = normal_ad(df_results['Residuals'])[1]
        print('p-value from the test - below 0.05 generally means non-normal:', p_value)

        # A single verdict drives both messages so they cannot disagree
        # when the p-value equals the threshold
        residuals_look_normal = not (p_value < p_value_thresh)
    
        # Reporting the normality of the residuals
        if residuals_look_normal:
            print('Residuals are normally distributed')
        else:
            print('Residuals are not normally distributed')
    
        # Plotting the residuals distribution
        plt.subplots(figsize=(12, 6))
        plt.title('Distribution of Residuals')
        # NOTE(review): distplot is deprecated in recent seaborn releases -
        # consider histplot(..., kde=True) once the minimum version allows it
        sns.distplot(df_results['Residuals'])
        plt.show()
    
        print()
        if residuals_look_normal:
            print('Assumption satisfied')
        else:
            print('Assumption not satisfied')
            print()
            print('Confidence intervals will likely be affected')
            print('Try performing nonlinear transformations on variables')
        
        
    def multicollinearity_assumption():
        """
        Multicollinearity: Assumes that predictors are not correlated with each other. If there is
                           correlation among the predictors, then either remove predictors with high
                           Variance Inflation Factor (VIF) values or perform dimensionality reduction
                           
                           This assumption being violated causes issues with interpretability of the 
                           coefficients and the standard errors of the coefficients.
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor
        print('\n=======================================================================================')
        print('Assumption 3: Little to no multicollinearity among predictors')
        
        # Plotting the heatmap
        plt.figure(figsize = (10,8))
        sns.heatmap(pd.DataFrame(feature_matrix, columns=feature_names).corr(), annot=True)
        plt.title('Correlation of Variables')
        plt.show()
        
        print('Variance Inflation Factors (VIF)')
        print('> 10: An indication that multicollinearity may be present')
        print('> 100: Certain multicollinearity among the variables')
        print('-------------------------------------')
       
        # Gathering the VIF for each variable
        VIF = [variance_inflation_factor(feature_matrix, i) for i in range(feature_matrix.shape[1])]
        for idx, vif in enumerate(VIF):
            print('{0}: {1}'.format(feature_names[idx], vif))
        
        # Gathering and printing total cases of possible or definite multicollinearity
        possible_multicollinearity = sum(1 for vif in VIF if vif > 10)
        definite_multicollinearity = sum(1 for vif in VIF if vif > 100)
        print()
        print('{0} cases of possible multicollinearity'.format(possible_multicollinearity))
        print('{0} cases of definite multicollinearity'.format(definite_multicollinearity))
        print()

        if definite_multicollinearity == 0:
            if possible_multicollinearity == 0:
                print('Assumption satisfied')
            else:
                print('Assumption possibly satisfied')
                print()
                print('Coefficient interpretability may be problematic')
                print('Consider removing variables with a high Variance Inflation Factor (VIF)')
        else:
            print('Assumption not satisfied')
            print()
            print('Coefficient interpretability will be problematic')
            print('Consider removing variables with a high Variance Inflation Factor (VIF)')
        
        
    def autocorrelation_assumption():
        """
        Autocorrelation: Assumes that there is no autocorrelation in the residuals. If there is
                         autocorrelation, then there is a pattern that is not explained due to
                         the current value being dependent on the previous value.
                         This may be resolved by adding a lag variable of either the dependent
                         variable or some of the predictors.
        """
        from statsmodels.stats.stattools import durbin_watson
        print('\n=======================================================================================')
        print('Assumption 4: No Autocorrelation')
        print('\nPerforming Durbin-Watson Test')
        print('Values of 1.5 < d < 2.5 generally show that there is no autocorrelation in the data')
        print('0 to 2< is positive autocorrelation')
        print('>2 to 4 is negative autocorrelation')
        print('-------------------------------------')
        durbinWatson = durbin_watson(df_results['Residuals'])
        print('Durbin-Watson:', durbinWatson)
        if durbinWatson < 1.5:
            print('Signs of positive autocorrelation', '\n')
            print('Assumption not satisfied', '\n')
            print('Consider adding lag variables')
        elif durbinWatson > 2.5:
            print('Signs of negative autocorrelation', '\n')
            print('Assumption not satisfied', '\n')
            print('Consider adding lag variables')
        else:
            print('Little to no autocorrelation', '\n')
            print('Assumption satisfied')

            
    def homoscedasticity_assumption():
        """
        Homoscedasticity: Assumes that the errors exhibit constant variance
        """
        print('\n=======================================================================================')
        print('Assumption 5: Homoscedasticity of Error Terms')
        print('Residuals should have relative constant variance')
        
        # Plotting the residuals
        _, ax = plt.subplots(figsize=(12, 6))
        ax.scatter(x=df_results.index, y=df_results.Residuals, alpha=0.5)
        # Zero reference line across the whole axes; does not depend on the
        # index being a run of integers starting at 0
        ax.axhline(0, color='darkorange', linestyle='--')
        ax.spines['right'].set_visible(False)  # Removing the right spine
        ax.spines['top'].set_visible(False)  # Removing the top spine
        ax.set_title('Residuals')
        plt.show() 
        print('If heteroscedasticity is apparent, confidence intervals and predictions will be affected')
        
        
    linear_assumption()
    normal_errors_assumption()
    multicollinearity_assumption()
    autocorrelation_assumption()
    homoscedasticity_assumption()

# Survey Monkey Specific API Functions

In [None]:
#I created this to grab the headings from JSON survey data pulled from the SurveyMonkey API. Email me if you ever do this; it's a journey!

# Fetch the survey details to pass to get_headings below; replace the
# placeholder string with a real survey id.
# NOTE(review): `client` is not created in this cell - presumably a SurveyMonkey
# client built earlier in the notebook; confirm before running.
json = client.get_survey_details('insert survey id')

def get_headings(json):
    
    '''
    
    This function takes a SurveyMonkey json object from an API call and creates a dataframe with the headers and the corresponding question_ids
    
    Arguments: a single json object created from a .get() call on the SurveyMonkey API; it must provide
               json['pages'][i]['questions'][j] entries carrying an 'id' and a list of 'headings'
    
    Returns: a dataframe with one row per heading - the heading fields followed by a 'question_id' column
             holding the id of the question the heading belongs to. Questions without headings produce no row.
    
    requirements: SurveyMonkey client - https://github.com/GearPlug/surveymonkey-python
    '''
    
    headings_list = []
    ids_list = []
    for pages in json['pages']:
        for question in pages['questions']:
            for heading in question['headings']:
                headings_list.append(heading)
                # Keep the id whole (no assumption about its length) and repeat it
                # for every heading of the question so the two lists stay aligned
                ids_list.append(question['id'])

    # Both frames are built once, after the traversal
    headings = pd.DataFrame(headings_list)
    heading_ids = pd.DataFrame(ids_list)

    #concatenate the heading ids with the headings
    questions_and_ids = pd.concat([headings, heading_ids], axis = 1).rename(columns = {0:'question_id'})
    
    return questions_and_ids

In [None]:
#I created this to extract ALL answers from ALL survey responses from SurveyMonkey API, this has been replaced by another function in this list, get_details_
#choiceids_etc.
def get_bulk_answers(json):
    
    ''' 
    Collects every answer found in a SurveyMonkey bulk-responses JSON object.

    For each answer the 'choice_id' is used when present, otherwise the 'text';
    the answer's 'row_id' is placed beside it.
    
    Argument: JSON API called object (walked as data -> pages -> questions -> answers)

    Returns: a two column dataframe, answers first and row ids second
    '''
    
    # One pass over the nested structure; each field is read from this flat list
    every_answer = [
        answer
        for response in json['data']
        for page in response['pages']
        for question in page['questions']
        for answer in question['answers']
    ]

    choice_frame = pd.DataFrame([answer.get('choice_id') for answer in every_answer])
    text_frame = pd.DataFrame([answer.get('text') for answer in every_answer])

    # choice ids and free text side by side
    combined = pd.concat([choice_frame, text_frame], axis = 1)
    # NOTE(review): the nested list is kept as in the original; it appears to
    # make pandas build multi-level column labels - confirm that is intended
    combined.columns = [['response_id', 'response_text']]

    # where there is no choice id, pull the text across from the adjacent
    # column and keep only that first column
    filled_answers = combined.bfill(axis=1).iloc[:, 0]
    answers_frame = pd.DataFrame(filled_answers)

    row_frame = pd.DataFrame([answer.get('row_id') for answer in every_answer])
    row_frame.columns = [['row_id']]

    # answers with their row ids
    return pd.concat([answers_frame, row_frame], axis = 1)

In [1095]:
def get_answers_and_ids(json):
    
    '''
    Flattens a SurveyMonkey json object and pulls out the text of every answer
    choice (including "other" options) next to the id of that choice.
    
    Arguments: json object from SurveyMonkey API call

    Returns: dataframe with the columns 'possible_choices' and 'response_id';
             texts and ids are paired by their order of appearance
    '''

    def flatten_json_iterative_solution(dictionary):
        """Flatten a nested json file into a single-level dict with '_'-joined keys"""

        def expand_one_level(parent_key, parent_value):
            """Return the (key, value) pairs obtained by unpacking ONE level of nesting"""
            if isinstance(parent_value, dict):
                return [(parent_key + '_' + key, value)
                        for key, value in parent_value.items()]
            if isinstance(parent_value, list):
                return [(parent_key + '_' + str(position), value)
                        for position, value in enumerate(parent_value)]
            # already atomic: carried over unchanged
            return [(parent_key, parent_value)]

        # Unpack level by level until no value is a dict or a list any more
        while True:
            unpacked = {}
            for key, value in dictionary.items():
                unpacked.update(expand_one_level(key, value))
            dictionary = unpacked
            if not any(isinstance(value, (dict, list)) for value in dictionary.values()):
                break

        return dictionary

    # flattened key / value pairs as a two column frame
    flattened_details = pd.Series(flatten_json_iterative_solution(json)).to_frame().reset_index()
    flattened_details = flattened_details.rename(columns = {'index':'detail_buckets', 0:'details'})

    # keys holding the text of a choice or of an "other" option
    choice_text_pattern = r'questions_[0-9]{1,2}_answers_choices_[0-9]{1,2}_text|questions_[0-9]{1,2}_answers_other_text'
    # keys holding the id of a choice or of an "other" option
    choice_id_pattern = r'pages_[0-9]{1,2}_questions_[0-9]{1,2}_answers_choices_[0-9]{1,2}_id|pages_[0-9]{1,2}_questions_[0-9]{1,2}_answers_other_id'

    is_choice_text = flattened_details['detail_buckets'].str.contains(choice_text_pattern)
    is_choice_id = flattened_details['detail_buckets'].str.contains(choice_id_pattern)

    choices = (flattened_details.loc[is_choice_text, ['details']]
               .rename(columns = {'details':'possible_choices'})
               .reset_index(drop = True))

    choice_ids = (flattened_details.loc[is_choice_id, ['details']]
                  .rename(columns = {'details':'response_id'})
                  .reset_index(drop = True))

    # side by side: n-th text with n-th id
    return pd.concat([choices, choice_ids], axis = 1)

In [None]:
# Fetch the survey details to pass to get_row_text_and_row_ids below; replace
# the placeholder string with a real survey id.
# NOTE(review): `client` is not created in this cell - confirm it exists first.
json = client.get_survey_details('insert survey id')

def get_row_text_and_row_ids(json):
    
    '''
    This function takes a json object returned from a call to the SurveyMonkey API and returns sub-questions and their ids
    
    Argument: a json object called from the SurveyMonkey API
    
    Returns: a dataframe with the columns 'row_text' and 'row_id', one line per answer row that has
             both a text and an id. The index is the position of the row's text in the flattened json.
    
    Requirements: the SurveyMonkey client - https://github.com/GearPlug/surveymonkey-python
    '''
    
    from itertools import chain, starmap

    def flatten_json_iterative_solution(dictionary):
        """Flatten a nested json file into a single-level dict with '_'-joined keys"""
        def unpack(parent_key, parent_value):
            """Unpack one level of nesting in json file"""
            # Unpack one level only!!!
            if isinstance(parent_value, dict):
                for key, value in parent_value.items():
                    yield parent_key + '_' + key, value
            elif isinstance(parent_value, list):
                for i, value in enumerate(parent_value):
                    yield parent_key + '_' + str(i), value
            else:
                yield parent_key, parent_value

        # Keep iterating until the termination condition is satisfied
        while True:
            # Keep unpacking the json file until all values are atomic elements (not dictionary or list)
            dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
            # Terminate condition: not any value in the json file is dictionary or list
            if not any(isinstance(value, (dict, list)) for value in dictionary.values()):
                break

        return dictionary
    
    
    flattened_details = pd.Series(flatten_json_iterative_solution(json)).to_frame().reset_index()
    flattened_details.columns = ['key', 'value']

    # Split every "..._answers_rows_<n>_text" / "..._answers_rows_<n>_id" key into
    # the part naming the row and the field name. Text and id are told apart by
    # the KEY, never by what the value happens to contain.
    key_parts = flattened_details['key'].str.extract(
        r'^(?P<row_key>.*answers_rows_[0-9]+)_(?P<field>text|id)$')

    is_text = key_parts['field'] == 'text'
    is_id = key_parts['field'] == 'id'

    # id of every row, looked up by the key that names the row
    ids_by_row = pd.Series(flattened_details.loc[is_id, 'value'].values,
                           index = key_parts.loc[is_id, 'row_key'].values)

    # pair each text with the id stored under the same row key
    rows_df = pd.DataFrame({
        'row_text': flattened_details.loc[is_text, 'value'],
        'row_id': key_parts.loc[is_text, 'row_key'].map(ids_by_row),
    })

    # rows lacking either the text or the id are left out
    rows_df.dropna(inplace = True)
    
    return rows_df

In [1164]:
#json = client.get_survey_response_bulk('186927358')
#json_2 = client.get_survey_details('186927358')

def get_personid_choiceid_rowid_surveyid(json, json_2):
    
    '''
    
    Takes the bulk responses and the survey details of a SurveyMonkey survey and produces one line per answer
    (choice, text or "other" answer) together with its row_id, the question id, the respondent id and the survey id.
    
    Arguments: you must create 2 json objects from the client call first, 
        json = bulk responses from API
        json_2 = survey details

    Returns: a dataframe with the columns
        respondent_id, survey_id, answer_type, answer, row_id, question_id

    The function relies on the order of the flattened json: a row_id is attached to the line directly
    before it, a question id to the lines following it, and a respondent id to the lines following it.
        
    '''
    import pandas as pd
    
    #use function to flatten DEEPLY nested JSON, same source as above
    from itertools import chain, starmap

    def flatten_json_iterative_solution(dictionary):
        """Flatten a nested json file into a single-level dict with '_'-joined keys"""
        def unpack(parent_key, parent_value):
            """Unpack one level of nesting in json file"""
            # Unpack one level only!!!
            if isinstance(parent_value, dict):
                for key, value in parent_value.items():
                    yield parent_key + '_' + key, value
            elif isinstance(parent_value, list):
                for i, value in enumerate(parent_value):
                    yield parent_key + '_' + str(i), value
            else:
                yield parent_key, parent_value

        # Keep iterating until the termination condition is satisfied
        while True:
            # Keep unpacking the json file until all values are atomic elements (not dictionary or list)
            dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
            # Terminate condition: not any value in the json file is dictionary or list
            if not any(isinstance(value, (dict, list)) for value in dictionary.values()):
                break

        return dictionary
    
    #getting the bulk responses and flattening the json file
    bulk_responses = pd.Series(flatten_json_iterative_solution(json)).to_frame().reset_index()
    
    #renaming the columns in the dataframe
    bulk_responses.rename(columns = {'index':'answer_type', 0:'answer'}, inplace = True)
    
    #searching for ids within the bulk responses, looks like I am only getting 50 unique back at a time...
    bulk_responses = bulk_responses[bulk_responses['answer_type'].str.contains(r'text|data_[0-9]{1,2}_id|data_[0-9]{1,2}_pages_[0-9]' \
    '{1,2}_questions_[0-9]{1,2}_id|data_[0-9]{1,2}_pages_[0-9]{1,2}_questions_[0-9]{1,2}_answers_[0-9]{1,2}_row_id|data_[0-9]{1,2}_pages_[0-9]' \
    '{1,2}_questions_[0-9]{1,2}_answers_[0-9]{1,2}_choice_id|other')].reset_index(drop = True)
    
    #grabbing the survey details from json_2
    survey_details = pd.Series(flatten_json_iterative_solution(json_2)).to_frame()
    
    #get survey id (first flattened key starting with "id") and create column denoting the survey id
    #.iloc makes the "first entry" lookup explicitly positional; the index holds string keys
    survey_id_rows = survey_details.index.str.contains(r'^id') == True
    bulk_responses['survey_id'] = survey_details[survey_id_rows][0].iloc[0]

    #renaming the ids to normal names
    bulk_responses.replace(to_replace ='data_[0-9]{1,3}_id', value = 'respondent_id', regex = True, inplace = True)
    
    bulk_responses.replace(to_replace ='data_[0-9]{1,3}_pages_[0-9]{1,3}_questions_[0-9]{1,3}_answers_[0-9]{1,3}_text', 
                           value = 'text_answer', regex = True, inplace = True)
    
    bulk_responses.replace(to_replace ='data_[0-9]{1,3}_pages_[0-9]{1,3}_questions_[0-9]{1,3}_id', 
                           value = 'question_id', regex = True, inplace = True)
    
    bulk_responses.replace(to_replace ='data_[0-9]{1,3}_pages_[0-9]{1,3}_questions_[0-9]{1,3}_answers_[0-9]{1,3}_row_id', 
                           value = 'row_id', regex = True, inplace = True)
    
    bulk_responses.replace(to_replace ='data_[0-9]{1,3}_pages_[0-9]{1,3}_questions_[0-9]{1,3}_answers_[0-9]{1,3}_choice_id', 
                           value = 'choice_id', regex = True, inplace = True)
    
    #some surveys have "other" as an option, this covers those
    bulk_responses.replace(to_replace ='data_[0-9]{1,4}_pages_[0-9]{1,3}_questions_[0-9]{1,3}_answers_[0-9]{1,3}_other_id', 
                           value = 'other_id', regex = True, inplace = True)

    #forward fill any missing values
    bulk_responses.ffill(inplace = True)
    
    #row ids: copy the actual id into its own column on the row_id lines (missing elsewhere).
    #Built with where() instead of chained assignment, which pandas Copy-on-Write ignores.
    is_row_id = (bulk_responses['answer_type'] == 'row_id')
    bulk_responses['row_id'] = bulk_responses['answer'].where(is_row_id)

    #shift all row ids up one so each sits on the line directly before its row_id line
    bulk_responses['row_id'] = bulk_responses['row_id'].shift(-1)

    #drop all row ids rows from df so that row id is listed beside choice id
    bulk_responses = bulk_responses[~bulk_responses.answer_type.str.contains('row_id')]

    #rearrange columns (copy so the frame can be extended safely)
    bulk_responses = bulk_responses[['answer_type', 'answer', 'row_id', 'survey_id']].copy()
    
    #question ids: same approach, the actual id goes into its own column
    is_question_id = (bulk_responses['answer_type'] == 'question_id')
    bulk_responses['question_id'] = bulk_responses['answer'].where(is_question_id)

    #shift all of them down 1 to make sure answers line up next to actual questions
    bulk_responses['question_id'] = bulk_responses['question_id'].shift(1)

    #drop question id from answer type
    bulk_responses = bulk_responses[~bulk_responses.answer_type.str.contains('question_id')].copy()

    #forward fill the question ids so that each row id has a corresponding question id
    bulk_responses['question_id'] = bulk_responses['question_id'].ffill()

    #respondent ids: actual id into its own column, then carried down to the lines that follow
    is_respondent_id = (bulk_responses['answer_type'] == 'respondent_id')
    bulk_responses['respondent_id'] = bulk_responses['answer'].where(is_respondent_id).ffill()

    #drop the respondent id lines themselves
    bulk_responses = bulk_responses[~bulk_responses['answer_type'].str.contains('respondent_id')]
    
    #change column order
    bulk_responses = bulk_responses[['respondent_id', 'survey_id', 'answer_type', 'answer', 'row_id', 'question_id']].reset_index(drop = True)
    
    return bulk_responses

# NLP Functions

In [None]:
#cleans text for analysis
#https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """
    Map a POS tag to the wordnet constant for its word class.

    The tag is matched by its first letter: J -> ADJ, V -> VERB, N -> NOUN,
    R -> ADV. Every other tag falls back to NOUN.
    """
    # (tag prefix, name of the wordnet attribute) checked in this order
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if pos_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    # no known prefix
    return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """
    Normalise a piece of text for analysis.

    Steps: lower-case, split on single spaces, strip punctuation from the ends
    of each token, drop tokens containing digits, drop English stop words and
    empty tokens, lemmatize each token using its POS tag, drop one-letter
    results and join what is left with single spaces.

    Argument: text = the string to clean
    Returns: the cleaned string
    """
    # lower text
    text = text.lower()
    # tokenize text and remove punctuation
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stop words and empty tokens; a set makes each lookup constant time
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if word and word not in stop]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize text; one lemmatizer is shared by all tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]
    # remove words with only one letter
    tokens = [token for token in tokens if len(token) > 1]
    # join all
    return " ".join(tokens)

# Example: clean every entry of the 'text' column of the reviews dataframe
# and store the result in a new 'tidy_reviews' column
reviews['tidy_reviews'] = reviews['text'].apply(clean_text)

# Time Series Functions

In [None]:
#this function is a complete augmented Dickey-Fuller test for stationarity, p<.05 means the data is stationary, taken from Jose Portilla's time series class
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Run an augmented Dickey-Fuller test on a time series and print a report:
    the test statistic, p-value, lags used, number of observations, the
    critical values and a verdict based on p <= 0.05.

    Pass in a time series and an optional title; nothing is returned.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')

    # missing values are removed first (e.g. those left by differencing)
    outcome = adfuller(series.dropna(), autolag='AIC')
    p_value = outcome[1]
    critical_values = outcome[4]

    report = pd.Series(
        outcome[0:4],
        index=['ADF test statistic', 'p-value', '# lags used', '# observations'],
    )
    for level, threshold in critical_values.items():
        report[f'critical value ({level})'] = threshold

    # to_string() leaves out the trailing "dtype: ..." line
    print(report.to_string())

    if p_value <= 0.05:
        verdict = (
            "Strong evidence against the null hypothesis",
            "Reject the null hypothesis",
            "Data has no unit root and is stationary",
        )
    else:
        verdict = (
            "Weak evidence against the null hypothesis",
            "Fail to reject the null hypothesis",
            "Data has a unit root and is non-stationary",
        )
    for line in verdict:
        print(line)

# Stats Functions

In [None]:
#get Cronbach's alpha for subsets of measures
def cronbach_alpha(items):
    items = pd.DataFrame(items)
    items_count = items.shape[1]
    variance_sum = float(items.var(axis=0, ddof=1).sum())
    total_var = float(items.sum(axis=1).var(ddof=1))
    
    return (items_count / float(items_count - 1) *
            (1 - variance_sum / total_var))

In [None]:
#I got really tired of running separate lines of code for fifteen minutes just to run a few t-tests, so I created this function to automate the process,
# including checking the basic t-test assumptions
def t_test(df, x, y, paired = False): 
    
    '''
    Takes two samples of data, runs Levene's Test to decide whether equal variances can be assumed, 
    then runs either a dependent or an independent samples T-Test (researchpy.ttest).
    
    Argument format:
    df = data frame name 
    x = df['column1']
    y = df['column2']
    paired = False
    
    The cases are checked in this order:
    1. x has exactly two distinct values: x is treated as a grouping column coded 1 and 2, and the
       values of y in group 1 are compared with the values of y in group 2. If y is None, the second
       column of df is used instead. Levene's result and the t-test report are printed and None is
       returned. `paired` is not consulted in this case.
    2. paired is True: returns rp.ttest(x, y, paired = True).
    3. otherwise: prints the p-value of Levene's Test on x and y and returns the independent samples
       t-test result.
    
    In cases 1 and 3, equal_variances is False when Levene's p-value is <= .05 and True otherwise.
    
    IMPORTANT NOTE: Categorical column data must be recoded to 1 and 2 prior to using the function for it to work properly
    '''
    
    import researchpy as rp
    from scipy.stats import levene
    
    #if the first sample is categorical (2 categories) then the categorical groups are tested against each other 
    if x.nunique() == 2:
        # Compare the measurements in y between the groups; keep the old "second column of df"
        # behaviour only as a fallback when no y is supplied.
        measure = y if y is not None else df.iloc[:, 1]
        a = measure[x == 1]
        b = measure[x == 2]
        
        levene_result = levene(a, b)
        print(levene_result)
        levene_p = levene_result[1]
        
        #if Levene's Test is significant (p <= .05), equal variances cannot be assumed
        print('T-Test Categorical Comparison')
        print(rp.ttest(a, b, equal_variances = not (levene_p <= .05)))
        return
    
    #if the t-test is paired samples (dependent), run the test and exit the function
    if paired == True:
        return rp.ttest(x, y, paired = True)
    
    #independent samples: grab the p-value of Levene's Test and pick the variance assumption from it
    levene_stat, levene_p = levene(x, y)
    print(f"P-Value for Levene's Test: {levene_p}")
    
    # Return the t-test result in both cases (not the Levene statistic).
    return rp.ttest(x, y, equal_variances = not (levene_p <= .05))

# Modeling

In [None]:
#took this from Jeff Macaluso's github, but I use it all the time so I threw it in here, I need to create one for classification next
def get_score(model):
    '''
    Fits the model on the training data and scores it on the test data.

    Reads X_train, y_train, X_test and y_test (plus np and pd) from the
    enclosing module scope; they are not passed in.

    Returns a pandas Series with the entries 'R^2' (the value of
    model.score on the test data), 'RMSE', 'MAE' and 'TrainingTime(sec)'
    (wall-clock seconds spent in model.fit).
    '''
    from sklearn.metrics import mean_squared_error, mean_absolute_error
    import time

    # Time only the call to fit
    fit_started = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - fit_started

    y_pred = model.predict(X_test)

    metric_names = ['R^2', 'RMSE', 'MAE', 'TrainingTime(sec)']
    metric_values = [
        model.score(X_test, y_test),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        mean_absolute_error(y_test, y_pred),
        elapsed,
    ]
    return pd.Series(metric_values, index=metric_names)

# Frankenstein functions

In [3]:
def t_test(df, x, y, paired = False): 
    
    '''
    Takes two samples of data, runs Levene's Test to decide whether equal variances can be assumed, 
    then runs either a dependent or an independent samples T-Test (researchpy.ttest).
    
    Argument format:
    df = data frame name 
    x = df['column1']
    y = df['column2']
    paired = False
    
    Behaviour:
    - If x has exactly two distinct values it is treated as a grouping column coded 1 and 2: the
      values of y in group 1 are compared with those in group 2 (the second column of df is used
      when y is None). Levene's result and the t-test report are printed; None is returned.
    - Otherwise, if paired is True, rp.ttest(x, y, paired = True) is returned.
    - Otherwise the p-value of Levene's Test on x and y is printed and the independent samples
      t-test result is returned.
    
    Wherever Levene's Test is used, equal_variances is False when its p-value is <= .05.
    
    IMPORTANT NOTE: Categorical column data must be recoded to 1 and 2 prior to using the function for it to work properly
    '''
    
    import researchpy as rp
    from scipy.stats import levene
    
    def variances_look_equal(p_value):
        # A significant Levene's Test (p <= .05) means equal variances cannot be assumed.
        return not (p_value <= .05)
    
    #if the first sample is categorical (2 categories) then the categorical groups are tested against each other 
    # NOTE(review): the earlier draft had separate "paired" and "not paired" categorical branches with
    # identical bodies, so `paired` makes no difference here - confirm whether a paired comparison of
    # the two groups was intended.
    is_categorical = x.nunique() == 2
    if is_categorical:
        measure = y if y is not None else df.iloc[:, 1]
        a = measure[x == 1]
        b = measure[x == 2]
        
        levene_result = levene(a, b)
        print(levene_result)
        
        print('T-Test Categorical Comparison')
        print(rp.ttest(a, b, equal_variances = variances_look_equal(levene_result[1])))
        return
    
    #if the t-test is paired samples (dependent), run the test and exit the function
    if paired:
        return rp.ttest(x, y, paired = True)
    
    #independent samples: tuple unpacking to grab the p-value of Levene's Test
    levene_stat, levene_p = levene(x, y)
    print(f"P-Value for Levene's Test: {levene_p}")
    
    # Return the t-test result for either variance assumption (not the Levene statistic).
    return rp.ttest(x, y, equal_variances = variances_look_equal(levene_p))

In [83]:
def median(numbers):
    """
    Return the median of a sized collection of numbers.

    Returns None for an empty collection, the middle element for an odd
    number of values, and the mean of the two middle elements (as a float)
    for an even number of values. The input is not modified.
    """
    count = len(numbers)
    if count < 1:
        return None

    ordered = sorted(numbers)
    middle = count // 2
    if count % 2 == 1:
        return ordered[middle]
    return float(ordered[middle - 1] + ordered[middle]) / 2

## Visualization

In [None]:
#create a wordcloud from a column of text
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# `df.column` is a placeholder: replace `column` with the name of the text column
text = df.column.values

# Join the entries explicitly instead of calling str(text): assuming `text` is a NumPy array
# (as returned by a pandas column's .values), str() gives the array repr, which is abbreviated
# with "..." for long arrays, so most of the column would never reach the word cloud.
wordcloud = WordCloud(
    width = 2000,
    height = 1000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(" ".join(str(entry) for entry in text))

# Large black canvas with the axes hidden so that only the cloud is visible
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

## Modeling

In [None]:
def hierarchical_clustering(method, df, label_col, learning_rate, cluster_element, feature_cols = (1, 2)):
    '''
    Plots a hierarchical clustering dendrogram and a T-SNE scatter plot for the same features.

    Expects linkage, dendrogram, TSNE and plt to be available in the enclosing scope
    (presumably scipy.cluster.hierarchy, sklearn.manifold and matplotlib.pyplot - confirm
    against the notebook's imports).

    arguments:
    method = linkage method passed to linkage() (e.g. 'complete')
    df = data frame holding the features and the label column
    label_col = name of the df column used for the dendrogram leaf labels
    learning_rate = passed to TSNE(learning_rate = ...); may need tweaking to get the visual right
    cluster_element = passed as `c` to plt.scatter to colour the T-SNE points
    feature_cols = positional indices of the df columns used as features; defaults to (1, 2),
                   the columns that were previously hard-coded

    Shows the figures with plt.show() and returns nothing.
    '''

    X = df.iloc[:, list(feature_cols)].values

    # Calculate the linkage: mergings
    mergings = linkage(X, method = method) #choose linkage method
    labels = list(df[label_col])

    # Plot the dendrogram, using the label column as leaf labels
    plt.figure(figsize = (20, 8))
    dendrogram(mergings, labels = labels, leaf_rotation=90, leaf_font_size = 8)
    plt.title('Hierarchical Clustering Dendrogram', fontsize = 20)

    #instantiate T-SNE, learning_rate may need to be tweaked to get the visual right
    model = TSNE(learning_rate=learning_rate)

    #transform the data, T-SNE only has a fit_transform method! So, you cannot extend a T-SNE map to include new samples
    transformed = model.fit_transform(X)

    #define the axes
    xs = transformed[:,0]
    ys = transformed[:,1]

    #plot
    plt.figure(figsize = (15, 10))
    plt.scatter(xs, ys, c = cluster_element)  #cluster element is the variable you want to cluster on (i.e. type of plant in iris dataset)
    plt.title('T-SNE for Visualizing Clusters', fontsize = 16)

#     #if you wanted to annotate the points
#     for x, y, label in zip(xs, ys, label_name):
#         plt.annotate(label, (x, y), fontsize=8, alpha=0.75)

    plt.show()

    #example usage
#hierarchical_clustering('complete', df, 'country', 100, pairs['labels'])

# R Functions

## EDA

In [None]:
# taken from stackoverflow, sorry I dont have the exact link

#view data side by side
#
# Prints each object passed in `...` (with print(x, width = width)), strips
# double quotes from the printed lines, pads every line to at least `width`
# characters and pastes the outputs together line by line.
#
# Arguments:
#   ...   : objects to show next to each other
#   width : print width and minimum column width for each object
#
# Returns a character vector with one element per output line. Objects that
# print fewer lines than the longest one are padded with blank lines (they
# are not repeated). With no objects, character(0) is returned.
sidebyside <- function(..., width = 60) {
  objs <- list(...)
  if (length(objs) == 0L) {
    return(character(0))
  }

  # Capture the printed form of every object, without double quotes.
  captured <- lapply(objs, function(x) {
    out <- capture.output(print(x, width = width))
    gsub("\"", "", out)
  })

  # Pad shorter outputs with empty lines so cbind() does not recycle them.
  n_lines <- max(lengths(captured))
  columns <- lapply(captured, function(out) {
    out <- c(out, rep("", n_lines - length(out)))
    format(out, justify = "left", width = width)
  })

  p <- do.call(cbind, columns)
  vapply(
    seq_len(nrow(p)),
    function(i) paste(p[i, ], collapse = ""),
    character(1)
  )
}

In [None]:
#taken from stackoverflow, search for change column order

##arrange df vars by position
##'vars' must be a named vector, e.g. c("var.name"=1): the names are column
##names of 'data' and the values are the positions they should move to.
##All other columns keep their relative order and fill the remaining positions.
##Returns the re-ordered data.frame (always a data.frame, even with one column).
arrange.vars <- function(data, vars){
    ##stop if not a data.frame (but should work for matrices as well)
    stopifnot(is.data.frame(data))

    ##sort out inputs
    data.nms <- names(data)
    var.nr <- length(data.nms)
    var.nms <- names(vars)
    var.pos <- vars
    ##sanity checks
    stopifnot( !any(duplicated(var.nms)), 
               !any(duplicated(var.pos)) )
    stopifnot( is.character(var.nms), 
               is.numeric(var.pos) )
    stopifnot( all(var.nms %in% data.nms) )
    stopifnot( all(var.pos > 0), 
               all(var.pos <= var.nr) )
    ##positions are used as indices, so fractional values would be silently truncated
    stopifnot( all(var.pos == round(var.pos)) )

    ##prepare output
    out.vec <- character(var.nr)
    out.vec[var.pos] <- var.nms
    out.vec[-var.pos] <- data.nms[ !(data.nms %in% var.nms) ]
    stopifnot( length(out.vec)==var.nr )

    ##re-arrange vars by position; drop = FALSE keeps a one-column data.frame
    ##from collapsing into a plain vector
    data[ , out.vec, drop = FALSE]
}

In [None]:
# ++++++++++++++++++++++++++++
# flattenCorrMatrix
# ++++++++++++++++++++++++++++
# cormat : matrix of the correlation coefficients
# pmat : matrix of the correlation p-values
flattenCorrMatrix <- function(cormat, pmat) {
  # Only the entries strictly above the diagonal are kept, so every pair
  # of variables appears once.
  upper <- upper.tri(cormat)
  labels <- rownames(cormat)
  row_idx <- row(cormat)[upper]
  col_idx <- col(cormat)[upper]

  # One output row per pair: the two labels (both looked up in the row
  # names of cormat), the coefficient and the matching p-value.
  data.frame(
    row = labels[row_idx],
    column = labels[col_idx],
    cor = cormat[upper],
    p = pmat[upper]
  )
}
