In [4]:
import pandas as pd
# Set max columns displayed
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load data
test_id = pd.read_csv('train_identity.csv')
test_trans = pd.read_csv('train_transaction.csv')

In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
test_id = reduce_mem_usage(test_id)
test_trans = reduce_mem_usage(test_trans)

Mem. usage decreased to 25.86 Mb (42.7% reduction)


Analyze train_identity

In [None]:
# Dimensions of data
test_id.shape

In [None]:
# Data types
test_id.info()

In [None]:
# Summary of columns
test_id.describe()

In [None]:
# Preview data
test_id.head()

Analyze train_transaction

In [None]:
test_trans.shape

In [None]:
test_trans.info()

In [None]:
test_trans.describe()

In [None]:
test_trans.head(20)

In [None]:
test_trans.tail()

In [None]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary

In [None]:
resumetable(test_trans)

In [None]:
resumetable(test_id)

# First goal: We are doing fraud detection, so let us visualize the fraudulent transactions first.

In [None]:
test_trans['TransactionAmt'] = test_trans['TransactionAmt'].astype(float)

In [None]:
total_trans = len(test_trans)

first step, compare the total transactions vs fraud transactions

In [None]:
# https://stackoverflow.com/questions/39922986/pandas-group-by-and-sum
total_transaction_amount = test_trans['TransactionAmt'].sum()
# https://stackoverflow.com/questions/1823058/how-to-print-number-with-commas-as-thousands-separators
f'{round(total_transaction_amount, 2):,}'

NOTE: It is odd that transactions have so many decimals, it is possible that they come from taxes or other forms of financial transactions.

In [None]:
fraud_transactions = test_trans[test_trans['isFraud'] == 1]['TransactionAmt'].sum()
f'{round(fraud_transactions, 2):,}'

In [None]:
non_fraud_transactions = test_trans[test_trans['isFraud'] == 0]['TransactionAmt'].sum()
f'{round(non_fraud_transactions, 2):,}'

In [None]:
fraud_transactions + non_fraud_transactions == total_transaction_amount

In [None]:
fraud_count = test_trans[test_trans['isFraud'] == 1]['TransactionAmt'].count()
non_fraud_count = test_trans[test_trans['isFraud'] == 0]['TransactionAmt'].count()

In [None]:
total_trans == fraud_count + non_fraud_count

In [None]:
plt.figure(figsize=(16,6))
# https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/
fraud_data = {'Fraudulent':['Fraudulent', 'non-Fraudulent'],
              'Transaction Amount':[fraud_transactions, non_fraud_transactions]}
fraud_data = pd.DataFrame(fraud_data)
# https://seaborn.pydata.org/generated/seaborn.barplot.html
# https://stackoverflow.com/questions/42404154/increase-tick-label-font-size-in-seaborn
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.barplot(x="Fraudulent", y="Transaction Amount", data=fraud_data)
ax.set_title('Barplot of Transaction Amount vs. Fraudulent or non-Fraudulent', fontsize=22)
ax.set_xlabel('') # Remove the 'Fraudulent' column name from the bottom
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total_transaction_amount * 100),
            ha="center", fontsize=15)

plt.figure(figsize=(16,6))
fraud_count_data = {'Fraudulent':['Fraudulent', 'non-Fraudulent'],
              'Fraud Count':[fraud_count, non_fraud_count]}
fraud_count_data = pd.DataFrame(fraud_count_data)
sns.set(style="whitegrid", font_scale=1.5)
ax = sns.barplot(x="Fraudulent", y="Fraud Count", data=fraud_count_data)
ax.set_title('Barplot of Transaction Count vs. Fraudulent or non-Fraudulent', fontsize=22)
ax.set_xlabel('') # Remove the 'Fraudulent' column name from the bottom
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total_trans*100),
            ha="center", fontsize=15) 

## Analyze the distributions of the transaction amounts for fraudulent vs. non-fraudulent charges

In [None]:
fraud_dist = test_trans[test_trans['isFraud'] == 1]['TransactionAmt']
non_fraud_dist = test_trans[test_trans['isFraud'] == 0]['TransactionAmt']

In [None]:
plt.figure(figsize=(16,6))
ax = sns.distplot(fraud_dist)
ax.set_title('Histogram of Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

plt.figure(figsize=(16,6))
ax = sns.distplot(non_fraud_dist)
ax.set_title('Histogram of non-Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

## Re-examine them both with certain outliers removed.

In [None]:
def subset_by_iqr(df, column, whisker_width=1.5):
#     https://stackoverflow.com/questions/34782063/how-to-use-pandas-filter-with-iqr
    """Remove outliers from a dataframe by column, including optional 
       whiskers, removing rows for which the column value are 
       less than Q1-1.5IQR or greater than Q3+1.5IQR.
    Args:
        df (`:obj:pd.DataFrame`): A pandas dataframe to subset
        column (str): Name of the column to calculate the subset from.
        whisker_width (float): Optional, loosen the IQR filter by a
                               factor of `whisker_width` * IQR.
    Returns:
        (`:obj:pd.DataFrame`): Filtered dataframe
    """
    # Calculate Q1, Q2 and IQR
    q1 = df[column].quantile(0.25)                 
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    # Apply filter with respect to IQR, including optional whiskers
    filter = (df[column] >= q1 - whisker_width*iqr) & (df[column] <= q3 + whisker_width*iqr)
    return df.loc[filter]                                                     

In [None]:
fraud_dist_outlier = subset_by_iqr(pd.DataFrame(fraud_dist), 'TransactionAmt', whisker_width=1.5)
non_fraud_dist_outlier = subset_by_iqr(pd.DataFrame(non_fraud_dist), 'TransactionAmt', whisker_width=1.5)

In [None]:
plt.figure(figsize=(16,6))
ax = sns.distplot(fraud_dist_outlier)
ax.set_title('Histogram of Fraudulent Transactions Amounts (Outliers Removed)', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

plt.figure(figsize=(16,6))
ax = sns.distplot(non_fraud_dist_outlier)
ax.set_title('Histogram of non-Fraudulent Transactions Amounts (Outliers Removed)', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

## Re-examine with log of original (outliers remain).

In [None]:
plt.figure(figsize=(16,6))
ax = sns.distplot(np.log(fraud_dist))
ax.set_title('log Histogram of Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

plt.figure(figsize=(16,6))
ax = sns.distplot(np.log(non_fraud_dist))
ax.set_title('log Histogram of non-Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')

## Re-examine them both with certain outliers removed (log version).

In [None]:
plt.figure(figsize=(16,6))
ax = sns.distplot(np.log(fraud_dist_outlier))
ax.set_title('log Histogram of Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')
ax.set_ylabel('Frequency')

plt.figure(figsize=(16,6))
ax = sns.distplot(np.log(non_fraud_dist_outlier))
ax.set_title('log Histogram of non-Fraudulent Transactions Amounts', fontsize=22)
ax.set_xlabel('Transaction Amount ($)')

### Results of histogram:
- Normal histogram (fraud)
    - heavily right skewed
- Normal histogram (non-fraud)
    - heavily right skewed
- log histogram (fraud)
    - distribution appears closer to normal
- log histogram (non-fraud)
    - distribution appears closer to normal
- Normal histogram (fraud) (w/o outliers)
    - less right skew, observations are bunched towards the lower end
- Normal histogram (non-fraud) (w/o outliers)
    - less right skew, observations have various peaks
- log histogram (fraud) (w/o outliers)
    - data appears left skewed, somewhat normal
- log histogram (non-fraud) (w/o outliers)
    - data appears left skewed, two sharp peaks

In [None]:
def CalcOutliers(df_num):
    whisker_width = 1.5
    q1 = df_num.quantile(0.25)                 
    q3 = df_num.quantile(0.75)
    iqr = q3 - q1

    #Calculating the higher and lower cut values
    lower, upper = q1 - whisker_width*iqr, q3 + whisker_width*iqr

    # creating an array of lower, higher and total outlier values 
    outliers_lower = [x for x in df_num if x < lower]
    outliers_higher = [x for x in df_num if x > upper]
    outliers_total = [x for x in df_num if x < lower or x > upper]

    # array without outlier values
    outliers_removed = [x for x in df_num if x > lower and x < upper]
    
    print('Identified lowest outliers: %d' % len(outliers_lower)) # printing total number of values in lower cut of outliers
    print('Identified upper outliers: %d' % len(outliers_higher)) # printing total number of values in higher cut of outliers
    print('Total outlier observations: %d' % len(outliers_total)) # printing total number of values outliers of both sides
    print('Non-outlier observations: %d' % len(outliers_removed)) # printing total number of non outlier values
    print("Total percentual of Outliers: ", round((len(outliers_total) / len(outliers_removed) )*100, 4)) # Percentual of outliers in points
    
    return

In [None]:
CalcOutliers(test_trans['TransactionAmt'])

# Logistic regression

The problem in this project is a binary classification problem and for that reason the primary tool for prediction will be logistic regression. The category of fraudulent vs. non-fraudulent is a categorical variable, where neither fraudulent or non-fraudulent can be considered higher or lower than the other. For that reason, it is nominal rather than ordinal.

The data we have is count data, so it will be following a Poisson distribution. Possibly Poisson, not independent?
https://www.youtube.com/watch?v=sv_KXSiorFk

There seems to be issues with the Poisson assumptions. It is definitely count data, but it is difficult to determine whether these events are occuring in a specific time or space. If it were time we could say from start to end date. If it were space or area we could say North America.

Approximately Poisson process? Instead it is possible that the random variable approximately follows a Poisson process and we can try to model it as such.

Fraudulent  | Frequency | Proportion
------------- | ------------- | -------------
Yes  | 20,663 | 0.03499
No  | 569,877 | 0.96501
Total | 590,540 |

In [None]:
total_trans == fraud_count + non_fraud_count

Independence? $P(A \cap B) = P(A)P(B)$ How to test? Monte Carlo?

Mean and variance should be similar for Poisson regression. Different for sample means/sample variance.

Possible sci-kit learn solution, use weights in logistic regression. https://chrisalbon.com/machine_learning/logistic_regression/handling_imbalanced_classes_in_logistic_regression/

In [1]:
# assume your data is df, and it's a pd dataframe
# "class" is either fradulent (0) and non-fradudulent (1)

frad_df = df[df['class'] == 0].reset_index(drop=True)
non_frad_df = df[df['class'] == 1].reset_index(drop=True)

random_sample_idx = np.random.choice(range(frad_df.shape[0]), size=non_frad_df.shape[0], replace=True)
frad_sample_df = frad_df.iloc[random_sample_idx]
balanced_df = pd.concat([non_frad_df, frad_sample_df]).reset_index(drop=True)

NameError: name 'df' is not defined