# Strip PDFs and export text to csv file
- I wrote this function to automate the process of copying and pasting text out of PDF files for a business

In [None]:
# Directory that pdf_text_to_csv() changes into before it runs.
# This is a placeholder value: replace it with a real path before use.
working_directory = 'whatever your working directory needs to be '
def pdf_text_to_csv(rootdir):
    '''Extract the first-page text of every PDF under a folder and export it to csv.

    Changes into the module-level ``working_directory``, walks ``rootdir``
    recursively, reads the text of the first page of each ``.pdf`` file,
    collects the text in a dataframe and writes it to 'text_df.csv'
    (relative to ``working_directory``).

    arguments: rootdir = filepath to the folder you want the function to iterate through
               (a relative path is resolved after changing into ``working_directory``)
    returns:   dataframe with a single 'text' column, one row per PDF
               (empty when no PDF is found)
    '''
    #TODO: update with method for extracting specific page numbers
    import os

    import fitz
    import pandas as pd

    os.chdir(working_directory)

    text_list = []

    for subdir, _dirs, files in os.walk(rootdir):
        for file in files:
            # Only PDFs are wanted; this also skips a previously written text_df.csv.
            if not file.lower().endswith('.pdf'):
                continue

            # os.walk yields bare file names, so join with the folder being walked.
            doc = fitz.open(os.path.join(subdir, file))
            try:
                page = doc[0]
                # Newer PyMuPDF releases name this get_text; older ones only have getText.
                extract_text = getattr(page, 'get_text', None) or page.getText
                text_list.append(extract_text("text"))
            finally:
                # Release the file handle even if extraction fails.
                doc.close()

    # Build and export once, after all files are read; also keeps df defined
    # when the folder contains no PDFs.
    df = pd.DataFrame(text_list, columns=['text'])
    df.to_csv('text_df.csv')                         #change name of csv file if you wish
    return df

# EDA Functions

In [None]:
#this function moves a column to whatever position you want,
# can be useful https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
def change_column_order(df, col_name, index):
    '''
    Returns df with the column col_name moved to position index;
    the remaining columns keep their relative order
    '''
    remaining = df.columns.tolist()
    remaining.remove(col_name)
    # Slicing mirrors list.insert, including negative and out-of-range positions
    reordered = remaining[:index] + [col_name] + remaining[index:]
    return df[reordered]

In [None]:
#displays tables side-by-side https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
from IPython.display import display_html
def display_side_by_side(*args):
    '''
    Displays the given dataframes next to each other in a notebook output cell

    args : any number of objects exposing .to_html() (e.g. pd.DataFrame)
    '''
    html_str = ''.join(df.to_html() for df in args)
    # Only patch the opening <table ...> tags. Replacing every occurrence of
    # 'table' would also rewrite the closing </table> tags and any header or
    # cell text that happens to contain the word "table".
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

In [None]:
# Printing the percentage of missing values per column
def percent_missing(dataframe):
    '''
    Prints, for every column that has missing values, the share of rows
    that are missing as a percentage. Prints 'No missing values' when the
    dataframe has none. Returns nothing.
    '''
    # Per-column count of nulls, and the same count as a fraction of all rows
    missing_counts = dataframe.isnull().values.sum(axis=0)
    missing_share = missing_counts / dataframe.shape[0]

    if missing_counts.sum() == 0:
        print('No missing values')
        return

    print('Percent Missing Values:', '\n')
    for col, count, share in zip(dataframe.columns, missing_counts, missing_share):
        # Columns without missing values are left out of the report
        if count > 0:
            print(f'{col}: {share * 100:.2f}%')

In [None]:
#https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
def filter_df_corr(inp_data, corr_val):
    '''
    Returns an array or dataframe (based on type(inp_data)) adjusted to drop
    columns with high correlation to one another. Takes second arg corr_val
    that defines the cutoff.

    For every pair of columns whose absolute correlation is >= corr_val, the
    pair and its correlation are printed and the column that comes first in
    the data is dropped. Only numeric/bool columns are correlated; any other
    column is ignored by the check and kept in the output.

    ----------
    inp_data : np.array, pd.DataFrame
        Values to consider
    corr_val : float
        Value [0, 1] on which to base the correlation cutoff

    Returns
    -------
    np.array if inp_data was an np.array, otherwise pd.DataFrame
    '''
    import numpy as np
    import pandas as pd

    # Work on a dataframe internally; remember whether to hand back an array
    array_flag = isinstance(inp_data, np.ndarray)
    if array_flag:
        inp_data = pd.DataFrame(data=inp_data)

    # Creates Correlation Matrix. Restricting to numeric/bool columns keeps
    # mixed-type frames working: recent pandas versions raise when corr() is
    # called on a frame that contains non-numeric columns.
    corr_matrix = inp_data.select_dtypes(include=['number', 'bool']).corr()

    # Iterates through the lower triangle of the Correlation Matrix to find correlated columns
    drop_cols = []
    n_cols = len(corr_matrix.columns)

    for i in range(n_cols):
        col = corr_matrix.columns[i]
        for k in range(i + 1, n_cols):
            val = corr_matrix.iloc[k, i]
            # NaN correlations (e.g. constant columns) compare False and are skipped
            if abs(val) >= corr_val:
                row = corr_matrix.index[k]
                # Prints the correlated feature set and the corr val
                print(col, "|", row, "|", round(val, 2))
                drop_cols.append(col)

    # Drops the correlated columns (you can also just have this function print the highly correlated columns)
    # dict.fromkeys removes duplicates while keeping a deterministic order
    drop_cols = list(dict.fromkeys(drop_cols))
    inp_data = inp_data.drop(columns=drop_cols)

    # Return same type as inp
    if array_flag:
        return inp_data.values
    return inp_data

In [11]:
#mine
def outlier_detection(x, cutoff=3):
    '''
    Detects outliers in a column through using the z-score of the values based on a standard deviation cutoff

    x      : array-like of numeric values
    cutoff : absolute z-score above which a value counts as an outlier

    Prints the absolute z-scores and the outlier positions, and returns the
    positions as produced by np.where (a tuple of index arrays) so the result
    can be used programmatically instead of only being read off the printout.
    '''
    from scipy import stats
    import numpy as np

    z = np.abs(stats.zscore(x))
    print(z)
    # Strictly greater than the cutoff; NaN z-scores never qualify
    outliers = np.where(z > cutoff)
    print(outliers)
    return outliers

# Time Series Functions

In [None]:
#this function is a complete augmented Dickey-Fuller test for stationarity, p<.05 means the data is stationary, taken from Jose Portilla's time series class
from statsmodels.tsa.stattools import adfuller

def adf_test(series,title=''):
    """
    Run an augmented Dickey-Fuller test on a time series and print a report.

    series : time series to test; missing values are dropped first
    title  : optional label printed in the report header

    Prints the test statistic, p-value, lags used, number of observations
    and the critical values, followed by a verdict based on p <= 0.05.
    Returns nothing.
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data

    test_stat, p_value, lags_used, n_obs, critical_values = result[:5]
    report = {
        'ADF test statistic': test_stat,
        'p-value': p_value,
        '# lags used': lags_used,
        '# observations': n_obs,
    }
    report.update(
        {f'critical value ({level})': threshold for level, threshold in critical_values.items()}
    )

    print(pd.Series(report).to_string())          # .to_string() removes the line "dtype: float64"

    if p_value <= 0.05:
        verdict = (
            "Strong evidence against the null hypothesis",
            "Reject the null hypothesis",
            "Data has no unit root and is stationary",
        )
    else:
        verdict = (
            "Weak evidence against the null hypothesis",
            "Fail to reject the null hypothesis",
            "Data has a unit root and is non-stationary",
        )
    for line in verdict:
        print(line)

# Stats Functions

In [None]:
#I got really tired of spending fifteen minutes running separate lines of code just to run a few t-tests, so I created this function to automate the process,
# including checking the basic t-test assumptions
def t_test(df, x, y, paired = False):

    '''
    Takes two samples of data, runs Levene's Test to determine variance assumption veracity, then runs either
    a dependent or independent samples T-Test with a printout report of effect sizes and summary statistics

    Argument format:
    df = data frame name (kept for backward compatibility; the samples are read from x and y)
    x = df['column1']
    y = df['column2']
    paired = False

    Behaviour:
    - If x has exactly two distinct values it is treated as a grouping column:
      y is split by those two values (in sorted order) and the two groups are
      compared with an independent samples t-test.
    - Otherwise, if paired is true, a paired samples t-test of x against y is run.
    - Otherwise an independent samples t-test of x against y is run.
    For the independent tests, equal variances are assumed only when Levene's
    Test is not significant (p > .05).

    Returns whatever researchpy.ttest returns for the test that was run.
    '''

    import researchpy as rp
    from scipy.stats import levene

    #if the first sample is categorical (2 categories) then the categorical groups are tested against each other
    if x.nunique() == 2:
        # Any two category codes work; sorting keeps the group order stable (1 before 2)
        first_level, second_level = sorted(x.dropna().unique())
        a = y[x == first_level]
        b = y[x == second_level]
        levene_result = levene(a, b)
        print(levene_result)
        _, levene_p = levene_result

        #if Levene's Test is significant (p <= .05), equal_variances = False
        print('T-Test Categorical Comparison')
        result = rp.ttest(a, b, equal_variances = bool(levene_p > .05))
        print(result)
        return result

    #if the t-test is paired samples (dependent), run the test and exit the function
    if paired:
        return rp.ttest(x, y, paired = True)

    #If the t-test is independent samples, run the Levene's Test and the t-test with the matching
    #variance condition (Welch's t-test when the variances differ)
    _, levene_p = levene(x, y)
    print(f"P-Value for Levene's Test: {levene_p}")

    return rp.ttest(x, y, equal_variances = bool(levene_p > .05))

# Modeling

In [None]:
#took this from Jeff Macaluso's github, but I use it all the time so I threw it in here, I need to create one for classification next
def get_score(model):
    '''
    Fits the model on the training split, scores it on the test split and
    returns a series containing R^2, RMSE, MAE and the training time in seconds.

    Reads X_train, y_train, X_test and y_test (and np / pd) from the
    enclosing scope; they are not passed in as arguments.
    '''
    import time
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    # Only the fit call is timed
    fit_started = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - fit_started

    predicted = model.predict(X_test)

    metrics = {
        'R^2': model.score(X_test, y_test),
        'RMSE': np.sqrt(mean_squared_error(y_test, predicted)),
        'MAE': mean_absolute_error(y_test, predicted),
        'TrainingTime(sec)': elapsed,
    }
    return pd.Series(list(metrics.values()), index=list(metrics.keys()))