# Table of contents

[Libs](#Libs)

[Consts](#Consts)

[Overview](#Overview)
* [Train](#Train)
  * [Identity](#Identity)
      * [Numeric_features](#Numeric_features)
      * [Categorical_features](#Categorical_features)
  * [Transaction](#Transaction)
      * [Numeric_features](#Numeric_features)
      * [Categorical_features](#Categorical_features)
      * [Target](#Target)
      * [NaNs](#NaNs)
  * [Merge_data](#Merge_data)
      * [Save-merged-train-data](#Save-merged-train-data)
  * [Look-at-Pearson-correlation](#Look-at-Pearson-correlation)
  
[Pandas_profiling](#Pandas_profiling)

[Overview_corelated_features](#Overview_corelated_features)

# Libs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import pandas_profiling

# Consts

In [None]:
# Directory that holds the competition CSV files.
data_dir = "../input/ieee-fraud-detection"

# Overview

## Train

### Identity
Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
They're collected by Vesta’s fraud protection system and digital security partners.
(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

In [None]:
# Load the identity table and print pandas' summary of the resulting frame.
train_identity_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_identity.csv',
)
train_identity_data.info()

In [None]:
# Show the first three rows of the identity table.
train_identity_data.head(n=3)

#### Numeric_features

In [None]:
# Identity columns treated as numeric: the TransactionID key followed by the
# id_XX columns with the two-digit suffixes enumerated below.
identity_num_features = ['TransactionID'] + [
    f'id_{n:02d}'
    for n in (*range(1, 12), 13, 14, *range(17, 23), 24, 25, 26, 32)
]

# One 50-bin histogram per selected column, drawn on a single large figure.
train_identity_data[identity_num_features].hist(bins=50, figsize=(20, 20))
plt.show()

#### Categorical_features

In [None]:
# Identity columns treated as categorical: the id_XX columns enumerated below
# plus the two device columns.
cat_features = [
    f'id_{n}' for n in (12, 15, 16, 23, *range(27, 32), *range(33, 39))
] + ['DeviceType', 'DeviceInfo']

# Preview the first rows of the categorical identity columns.
train_identity_data[cat_features].head()

In [None]:
# Print the frequency table of every categorical identity feature.
for col in cat_features:
    print(train_identity_data[col].value_counts(), '\n')

# Horizontal bar chart of non-null counts, one bar per identity column.
# The counts are computed here from the identity table itself. The previous
# version plotted `values_count`, which is only assigned further down in the
# notebook (from the transaction table): running top to bottom raised
# NameError, and re-running afterwards paired transaction counts with
# identity columns.
y_pos = np.arange(len(train_identity_data.columns))
fig = plt.figure(figsize=(15,80))

# DataFrame.count() returns the number of non-NA cells per column.
plt.barh(y_pos, train_identity_data.count().to_numpy(), align='edge')
plt.yticks(y_pos, labels=train_identity_data.columns)
# Axis labels (Russian): x = "number of non-empty values in the data",
# y = "features".
plt.xlabel('Колчичество непустых значений в данных')
plt.ylabel('Признаки')
plt.grid(True)
plt.show()

## Transaction
* TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
* TransactionAMT: transaction payment amount in USD
* ProductCD: product code, the product for each transaction
* card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
* addr: address
* dist: distance
* P_emaildomain and R_emaildomain: purchaser and recipient email domains
* C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
* D1-D15: timedelta, such as days between previous transaction, etc.
* M1-M9: match, such as names on card and address, etc.
* Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

> Categorical Features:
* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9

In [None]:
# Load the transaction table and print pandas' summary of the resulting frame.
train_transaction_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_transaction.csv',
)
train_transaction_data.info()

In [None]:
# Transaction columns treated as categorical.
cat_features_tr = [
    'ProductCD',
    *(f'card{n}' for n in range(1, 7)),
    'addr1',
    'addr2',
    'P_emaildomain',
    'R_emaildomain',
    *(f'M{n}' for n in range(1, 10)),
]

# Every remaining transaction column, in the frame's own column order.
num_features_tr = [
    column
    for column in train_transaction_data.columns
    if column not in cat_features_tr
]

### Numeric_features

In [None]:
# Display the list of transaction columns that are not in cat_features_tr.
num_features_tr

In [None]:
# Preview the first rows of the non-categorical transaction columns.
train_transaction_data.loc[:, num_features_tr].head()

In [None]:
# Draw a separate 50-bin histogram, titled with the column name, for every
# non-categorical transaction column.
for feat in num_features_tr:
    fig = plt.figure()
    fig.gca().hist(train_transaction_data[feat], bins=50)
    fig.gca().set_title(feat)
    plt.show()

# Uninformative features:
# NOTE(review): the trailing '...' is a literal string in this list,
# presumably a placeholder for further names — confirm before using the list
# to select or drop columns.
not_usefull_features = [
    'C4', 'C7', 'C8', 'C10', 'C12',
    'V1', 'V14', 'V27', 'V28', 'V41', '...',
]

### Categorical_features

In [None]:
# Display the list of transaction columns treated as categorical.
cat_features_tr

### Target
**isFraud** is among the numeric features

## NaNs

In [None]:
# Number of non-null values in each transaction column, in column order.
# DataFrame.count() gives the same per-column totals as taking
# len(column.dropna()) in a loop, without copying every column first.
values_count = train_transaction_data.count().tolist()

In [None]:
# Vertical bar chart of the non-null count of every transaction column.
plt.bar(x=train_transaction_data.columns, height=values_count)

In [None]:
# Horizontal bar chart of non-null counts, one bar per transaction column,
# on a tall figure so that every column label gets its own row.
fig = plt.figure(figsize=(15, 80))
y_pos = np.arange(train_transaction_data.shape[1])

plt.barh(y=y_pos, width=values_count, align='edge')
plt.yticks(ticks=y_pos, labels=train_transaction_data.columns)
# Axis labels (Russian): y = "features",
# x = "number of non-empty values in the data".
plt.ylabel('Признаки')
plt.xlabel('Колчичество непустых значений в данных')
plt.grid(True)
plt.show()

In [None]:
# Number of transaction rows that have no missing value in any column.
train_transaction_data.dropna().shape[0]

In [None]:
# Print the frequency table of every categorical transaction feature.
for col in cat_features_tr:
    print(train_transaction_data[col].value_counts(), '\n')

# Merge_data

In [None]:
# Outer-join the identity and transaction tables on their shared key.
train_data = train_identity_data.merge(
    train_transaction_data,
    on='TransactionID',
    how='outer',
)

In [None]:
# Preview the first five rows of the merged frame.
train_data.head(n=5)

In [None]:
# Print the number of columns in the identity and transaction tables.
print(train_identity_data.shape[1], train_transaction_data.shape[1])

In [None]:
# Scratch arithmetic on two hard-coded numbers — presumably the column counts
# of the identity and transaction tables; TODO confirm against the printed
# values.
41+394

In [None]:
# Number of rows in the merged frame, measured on the key column.
len(train_data['TransactionID'])

In [None]:
# Count how many merged rows have a present versus missing TransactionID.
train_data['TransactionID'].notna().value_counts()

## Save-merged-train-data

In [None]:
# Write the merged frame to train_data.csv in the working directory.
train_data.to_csv(path_or_buf='train_data.csv')

In [None]:
# Write every categorical feature name, one per line: identity names first,
# then transaction names.
with open('categorical_features.txt', 'w') as f:
    for feature in [*cat_features, *cat_features_tr]:
        f.write(f'{feature}\n')

In [None]:
# Write every numeric feature name, one per line: identity names first,
# then transaction names.
with open('numeric_features.txt', 'w') as f:
    for feature in identity_num_features:
        f.write(f'{feature}\n')
    # TransactionID has already been written from the identity list, so it is
    # skipped here. The name is filtered out instead of being removed from
    # num_features_tr: mutating the shared list made a second run of this
    # cell fail with ValueError.
    # NOTE(review): num_features_tr holds every transaction column that is not
    # in cat_features_tr, so it presumably still contains the isFraud target —
    # confirm that consumers of this file expect it.
    for feature in num_features_tr:
        if feature != 'TransactionID':
            f.write(f'{feature}\n')

# Look-at-Pearson-correlation

In [None]:
# Pearson correlation of every numeric/boolean column with the isFraud target.
# The columns are selected explicitly because the merged frame also holds
# string columns: pandas >= 2.0 no longer drops those silently in corrwith
# (numeric_only defaults to False) and raises on them instead.
# NOTE(review): assumes isFraud itself is stored as a numeric column —
# otherwise the drop below raises KeyError.
correlation = (
    train_data
    .select_dtypes(include=['number', 'bool'])
    .drop(columns='isFraud')
    .corrwith(train_data['isFraud'])
)

In [None]:
# Features whose correlation with the target exceeds 0.2.
correlation.loc[correlation.gt(0.2)]

In [None]:
# Feature(s) with the largest correlation value.
correlation.loc[correlation.eq(correlation.max())]

In [None]:
# How many features have a correlation with the target above 0.05.
int((correlation > 0.05).sum())

In [None]:
# Keep the features whose correlation exceeds 0.05, strongest first, and
# print them as (name, value) pairs.
s = correlation.loc[correlation > 0.05].sort_values(ascending=False)
corr_features = list(zip(s.index, s.values))
for feat in corr_features:
    print(feat)

In [None]:
# Transpose the (name, value) pairs into two parallel tuples: the names go to
# `feat`, the correlation values to `vals`.
feat, vals = zip(*corr_features)

In [None]:
# Display the first element of `feat`.
feat[0]

# Pandas_profiling

In [None]:
# Reload both raw tables and print their (rows, columns) shapes.
train_identity_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_identity.csv',
)
train_transaction_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_transaction.csv',
)
print(train_identity_data.shape, train_transaction_data.shape)

In [None]:
# Outer-join the two tables on TransactionID, then drop the references to the
# source frames so that only the merged frame stays bound.
train_data = train_identity_data.merge(
    train_transaction_data,
    on='TransactionID',
    how='outer',
)
del train_identity_data, train_transaction_data

train_data.shape

In [None]:
# Build a pandas-profiling report of the merged frame (minimal=True is
# requested) and write it to profile.html in the working directory.
profile = pandas_profiling.ProfileReport(train_data, minimal=True)
profile.to_file('profile.html')

In [None]:
# Display the profiling report object as the cell output.
profile

# Overview_corelated_features

In [None]:
# Reload both raw tables, outer-join them on TransactionID, drop the
# references to the source frames and save the merged frame to train_data.csv.
train_identity_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_identity.csv',
)
train_transaction_data = pd.read_csv(
    filepath_or_buffer=f'{data_dir}/train_transaction.csv',
)
train_data = train_identity_data.merge(
    train_transaction_data,
    on='TransactionID',
    how='outer',
)
del train_identity_data, train_transaction_data
train_data.to_csv(path_or_buf='train_data.csv')

In [None]:
# Pearson correlation of every numeric/boolean column with the isFraud target.
# The columns are selected explicitly because the merged frame also holds
# string columns: pandas >= 2.0 no longer drops those silently in corrwith
# (numeric_only defaults to False) and raises on them instead.
# NOTE(review): assumes isFraud itself is stored as a numeric column —
# otherwise the drop below raises KeyError.
correlation = (
    train_data
    .select_dtypes(include=['number', 'bool'])
    .drop(columns='isFraud')
    .corrwith(train_data['isFraud'])
)
# Keep the features whose correlation exceeds 0.05, strongest first, as
# (name, value) pairs, and display the top five.
s = correlation[correlation > 0.05].sort_values(ascending=False)
corr_features = list(zip(s.index, s.values))
corr_features[:5]

In [None]:
# Transpose the first five (name, value) pairs: element 0 holds the names,
# element 1 the correlation values. Display the names.
most_corr_features = [*zip(*corr_features[:5])]
most_corr_features[0]

In [None]:
# For each selected feature, plot isFraud as dots against the feature's
# values (the feature is used as the index, i.e. the x axis).
for item in most_corr_features[0]:
    train_data.set_index(keys=item).loc[:, 'isFraud'].plot(style='.')
    plt.ylabel('isFraud')
    plt.show()