In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)

In [6]:
def dataQuality(df):
    """Print the frame's dimensions and return a per-column data quality report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with:
        ``Missing_Count`` - number of null entries,
        ``Scarcity`` - nulls as a percentage of the row count (2 decimals),
        ``Unique_Values`` - result of ``nunique()`` for the column,
        ``Data_Type`` - the column's dtype.
        Rows are ordered by ``Missing_Count``, largest first.
    """
    n_rows, n_cols = df.shape
    print('Dataframe contains {} observations across {} columns\n'.format(n_rows, n_cols))
    print('Data Quality Stats:\n')

    # Count nulls once and reuse the result for both the count and the percentage.
    null_counts = df.isnull().sum()

    report = pd.DataFrame(
        {
            'Missing_Count': null_counts,
            'Scarcity': (null_counts / n_rows * 100).round(2),
            'Unique_Values': [df[name].nunique() for name in df.columns],
            'Data_Type': [df[name].dtype for name in df.columns],
        },
        index=df.columns,
        columns=['Missing_Count', 'Scarcity', 'Unique_Values', 'Data_Type'],
    )

    return report.sort_values(by=['Missing_Count'], ascending=False)

In [9]:
def describeNumericCols(cols_list, df=None):
    """Print the given column names and display their ``describe()`` summary.

    Parameters
    ----------
    cols_list : list
        Column labels to summarise.
    df : pandas.DataFrame, optional
        Frame to read the columns from. When omitted, the function falls back
        to a module-level variable named ``df`` so that existing calls of the
        form ``describeNumericCols(cols)`` keep working.

    Returns
    -------
    None
        The summary is rendered through ``display`` rather than returned.

    Raises
    ------
    NameError
        If ``df`` is not passed and no module-level ``df`` exists.
    """
    if df is None:
        # Backward compatibility with the original implementation, which read
        # the notebook-global ``df`` directly. Passing the frame explicitly is
        # preferred because it does not depend on cell execution order.
        try:
            df = globals()['df']
        except KeyError:
            raise NameError(
                "name 'df' is not defined; pass the dataframe via the df argument"
            ) from None

    print('Summary stats for the following significant numeric columns:\n')
    for i in cols_list:
        print(i)
    # ``display`` is the rich-output helper that IPython/Jupyter injects into
    # the notebook namespace.
    display(df[cols_list].describe())

In [12]:
def createDuplicateTransactionDfs(df, max_minutes=5):
    """Split out double-swipe and reversal transactions from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``transactionAmountDiff``,
        ``transactionTimeDiff`` and ``transactionType``. The two ``*Diff``
        columns are the ones ``featureEngineering`` in this notebook creates
        (amount difference and absolute time difference in minutes relative to
        the previous row).
    max_minutes : number, default 5
        A row counts as a double swipe only when its ``transactionTimeDiff``
        is strictly below this threshold. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``doubleSwipe``: rows with a zero amount difference, a time difference
        below ``max_minutes`` and a type other than ``REVERSAL`` or
        ``ADDRESS_VERIFICATION``.
        ``reversal``: rows whose ``transactionType`` is ``REVERSAL``.
    """
    same_amount = df['transactionAmountDiff'] == 0
    within_window = df['transactionTimeDiff'] < max_minutes
    # Reversals are reported separately and address verifications are excluded
    # from the double-swipe set.
    excluded_type = df['transactionType'].isin(['REVERSAL', 'ADDRESS_VERIFICATION'])
    is_reversal = df['transactionType'] == 'REVERSAL'

    doubleSwipe = df[same_amount & within_window & ~excluded_type]
    reversal = df[is_reversal]

    return doubleSwipe, reversal

In [1]:
def histogram(df, col):
    """Plot the distribution of ``df[col]`` with mean, median and mode marked.

    Draws a seaborn distribution plot of the non-null values on a new 8x5
    figure and overlays three vertical lines: mean (red), median (green) and
    mode (blue), each identified in the legend.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    col : str
        Name of the column to plot; it is also used in the title.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()
    # Series.get_values() was removed in pandas 1.0; positional indexing gives
    # the same element (the first mode when there are several).
    mode_val = values.mode().iloc[0]

    plt.figure(figsize=[8, 5])
    ax = plt.axes()

    # NOTE(review): distplot is deprecated in recent seaborn releases in favour
    # of histplot/displot; kept so the notebook's seaborn version still works.
    sns.distplot(values.dropna(), ax=ax)  # kde=False, norm_hist=False, bins='auto'

    # Label each marker line directly. Passing only a list of labels to
    # legend() assigns them to the axes' artists in drawing order, which put
    # 'Mean' on the KDE curve and shifted the remaining labels by one.
    ax.axvline(mean_val, color='r', linestyle='-', label='Mean')
    ax.axvline(median_val, color='g', linestyle='-', label='Median')
    ax.axvline(mode_val, color='b', linestyle='-', label='Mode')

    ax.set_title('Distribution of ' + col, fontsize=15)
    ax.legend(fontsize=15)
    ax.set_xlabel('Amount', fontsize=15)
    ax.set_ylabel('Density', fontsize=15)
    # Ticks every 100 units from 0 up to (not including) the column maximum.
    plt.xticks(np.arange(0, values.max(), 100), rotation=45)

In [5]:
def featureEngineering(df):
    """Add 11 derived feature columns to ``df`` and return it.

    The frame is modified in place (the returned object is the same frame
    that was passed in). The created columns are:

    - ``transactionAmountDiff``: ``transactionAmount`` minus the previous row's.
    - ``transactionTimeDiff``: absolute gap to the previous row's
      ``transactionDateTime`` in minutes, rounded to 2 decimals.
    - ``diffCVV``: 0 when ``cardCVV`` equals ``enteredCVV``, else 1.
    - ``acctOpenAddressChangeDiff``: 0 when ``dateOfLastAddressChange`` equals
      ``accountOpenDate``, else 1.
    - ``daysSinceAddressChange`` / ``daysSinceAccountOpen``: whole days between
      the respective date and ``transactionDateTime``.
    - ``transactionDateYear`` / ``Month`` / ``Day`` / ``Hour`` / ``Minute``:
      components of ``transactionDateTime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``transactionAmount``, ``transactionDateTime``,
        ``cardCVV``, ``enteredCVV``, ``dateOfLastAddressChange`` and
        ``accountOpenDate``; the three date columns must be datetime-typed
        because the ``.dt`` accessor is used on them and on their differences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with the new columns attached.
    """
    txn_time = df['transactionDateTime']

    # The two *Diff features compare each row with the row directly above it,
    # so they depend on the frame's current row order.
    # NOTE(review): presumably the frame should be sorted (e.g. per account and
    # by time) before calling - confirm against the caller.
    features = {
        'transactionAmountDiff': df['transactionAmount'] - df['transactionAmount'].shift(1),
        'transactionTimeDiff': ((txn_time - txn_time.shift(1)).abs().dt.total_seconds() / 60).round(2),
        'diffCVV': np.where(df['cardCVV'] == df['enteredCVV'], 0, 1),
        'acctOpenAddressChangeDiff': np.where(df['dateOfLastAddressChange'] == df['accountOpenDate'], 0, 1),
        'daysSinceAddressChange': (txn_time - df['dateOfLastAddressChange']).dt.days.round(2),
        'daysSinceAccountOpen': (txn_time - df['accountOpenDate']).dt.days.round(2),
        'transactionDateYear': txn_time.dt.year,
        'transactionDateMonth': txn_time.dt.month,
        'transactionDateDay': txn_time.dt.day,
        'transactionDateHour': txn_time.dt.hour,
        'transactionDateMinute': txn_time.dt.minute,
    }

    for name, values in features.items():
        df[name] = values

    # Report the names straight from the mapping used to create the columns so
    # the printed list cannot drift from the real column names (it previously
    # printed 'transactionAmountdiff', which is not a column).
    print('Following features created:\n')
    for name in features:
        print(name)

    return df

In [6]:
def drop_columns(df, cols_list):
    """Return ``df`` without the columns in ``cols_list`` and with a fresh index.

    ``DataFrame.drop`` returns a new frame instead of modifying its input, so
    the result has to be captured; the previous version discarded it and
    handed back the original frame with every column still present.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    cols_list : list
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame lacking ``cols_list`` whose index is reset to 0..n-1.

    Raises
    ------
    KeyError
        Raised by pandas if a label in ``cols_list`` is not a column of ``df``.
    """
    trimmed = df.drop(cols_list, axis=1).reset_index(drop=True)

    print('Successfully dropped {} columns:\n'.format(len(cols_list)))
    for i in cols_list:
        print(i)
    return trimmed

In [None]:
# TODO: removeOutliers (not yet implemented)

In [None]:
# TODO: chi-square test (not yet implemented)