In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
plt.style.use('fivethirtyeight')

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, RepeatedKFold, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc

import xgboost as xgb

In [None]:
# Single place to change when the competition data lives somewhere else.
DATA_DIR = '../input/ieee-fraud-detection'

# Transaction tables (one row per transaction).
train_transaction = pd.read_csv(os.path.join(DATA_DIR, 'train_transaction.csv'))
test_transaction = pd.read_csv(os.path.join(DATA_DIR, 'test_transaction.csv'))

# Identity tables, joined to the transactions on TransactionID further down.
train_identity = pd.read_csv(os.path.join(DATA_DIR, 'train_identity.csv'))
test_identity = pd.read_csv(os.path.join(DATA_DIR, 'test_identity.csv'))

# Template whose isFraud column is overwritten with predictions at the end.
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Allow up to 500 columns in the rendered preview, then show the first rows.
pd.set_option('display.max_columns', 500)
train_transaction.head(n=5)

In [None]:
# First five rows of the test transaction table.
test_transaction.head(n=5)

In [None]:
# First five rows of the train identity table.
train_identity.head(n=5)

In [None]:
# First five rows of the test identity table.
test_identity.head(n=5)

In [None]:
# Left-join identity data onto the transactions so every transaction row is
# kept even when it has no identity record.
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')


In [None]:
# Report the size of each merged frame; shape unpacks to (rows, columns).
print('Train dataset has {} rows and {} columns.'.format(*train.shape))
print('Test dataset has {} rows and {} columns.'.format(*test.shape))

In [None]:
# The merged frames supersede the raw tables; drop the references so the
# memory can be reclaimed.
del train_transaction, train_identity
del test_transaction, test_identity

In [None]:
def is_integer_num(n):
    """Return True when ``n`` is a number holding a whole value.

    Accepts any integral type (Python ``int`` as well as NumPy integer
    scalars such as ``np.int64``) and any real type whose value has no
    fractional part (``2.0``, ``np.float32(4.0)``). Every other input,
    including strings and ``None``, yields False.

    Parameters
    ----------
    n : object
        Value to classify.

    Returns
    -------
    bool
    """
    import numbers  # local import keeps this helper self-contained

    # numbers.Integral covers int, bool and NumPy integer scalars, which a
    # plain isinstance(n, int) check would reject.
    if isinstance(n, numbers.Integral):
        return True
    # numbers.Real covers float and NumPy floating scalars.
    if isinstance(n, numbers.Real):
        return float(n).is_integer()
    return False

def missing_values_table_specified_value(df, value=0.5):
    """Summarise missing values per column and keep columns above a cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    value : int or float, default 0.5
        Cut-off. When ``is_integer_num(value)`` is true it is compared with
        the absolute number of missing entries; otherwise it is multiplied
        by 100 and compared with the percentage of missing entries.

    Returns
    -------
    pandas.DataFrame
        One row per retained column with 'Missing Values' and
        '% of Total Values'. A one-line summary is also printed.
    """
    null_counts = df.isnull().sum()
    null_pct = 100 * null_counts / len(df)
    summary = pd.concat(
        [null_counts, null_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Fractional cut-off: filter on the percentage column.
    if not is_integer_num(value):
        value = value * 100
        selected = summary[summary['% of Total Values'] >= value]
        print('The number of columns with {}% missing values is {}.'.format(value, len(selected)))
        return selected

    # Whole-number cut-off: filter on the raw count column.
    selected = summary[summary['Missing Values'] >= value]
    print('The number of columns with {} counts missing values is {}.'.format(value, len(selected)))
    return selected

def missing_values_table(data):
    """Build a transposed missing-value overview of ``data``.

    Returns a frame with one column per column of ``data`` and three rows:
    'Total' (number of nulls), 'Percent' (share of nulls, 0-100) and
    'Types' (the column dtype rendered as a string).
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return np.transpose(summary)

In [None]:
# Train columns where at least half of the values are missing (first rows).
missing_values_table_specified_value(train, value=0.5).head()

In [None]:
# Test columns where at least half of the values are missing (first rows).
missing_values_table_specified_value(test, value=0.5).head()

In [None]:
# Full missing-value overview, train first and test second.
for frame in (train, test):
    display(missing_values_table(frame))


In [None]:
# Number of rows per value of the isFraud target.
train['isFraud'].value_counts()

In [None]:
# Bar chart of the target distribution. The column is passed by keyword:
# current seaborn releases take `data` as the first positional parameter, so
# a positional Series is no longer interpreted as the x variable.
sns.countplot(x='isFraud', data=train)

In [None]:
# Side-by-side view of the target: share of each class (pie) and raw counts (bar).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# explode has two entries, one per value of isFraud; the second slice is offset.
train['isFraud'].value_counts().plot.pie(
    explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True
)
ax[0].set_title('isFraud')
ax[0].set_ylabel('')
# x= must be a keyword: a positional column name collides with data= in
# current seaborn releases.
sns.countplot(x='isFraud', data=train, ax=ax[1])
ax[1].set_title('isFraud')
plt.show()

In [None]:
# Drop columns whose share of missing values exceeds the threshold.
# Filtering train and test independently can leave the two frames with
# different feature sets, so a feature is kept only when it passes in BOTH.
# The target exists only in train and is judged on train alone.
max_missing_share = 0.70
train_ok = set(train.columns[train.isnull().mean() <= max_missing_share])
test_ok = set(test.columns[test.isnull().mean() <= max_missing_share])

# Iterate over train.columns so the original column order is preserved.
kept_columns = [
    col for col in train.columns
    if col in train_ok and (col == 'isFraud' or col in test_ok)
]
train = train[kept_columns]
test = test[[col for col in kept_columns if col != 'isFraud']]

In [None]:
# Every train column whose dtype is not 'object' is treated as quantitative.
quantitative = [name for name, kind in train.dtypes.items() if kind != 'object']
print(quantitative)
print('Counts: {}'.format(len(quantitative)))

In [None]:
# Every train column with dtype 'object' is treated as qualitative.
qualitative = [name for name, kind in train.dtypes.items() if kind == 'object']
print(qualitative)
print('Counts: {}'.format(len(qualitative)))

In [None]:
# Impute each qualitative train column with its most frequent value.
# The result is assigned back instead of calling
# train[column].fillna(..., inplace=True): that chained in-place form acts on
# a temporary object and does not update `train` under pandas Copy-on-Write.
train = train.fillna({column: train[column].mode()[0] for column in qualitative})

In [None]:
# Object-typed columns of the TEST frame. The dtype is looked up on `test`
# itself; reading train.dtypes here fails for any column present only in test
# and can misclassify a column whose dtype differs between the two frames.
qualitative_test = [f for f in test.columns if test.dtypes[f] == 'object']
print(qualitative_test)
print('Counts: {}'.format(len(qualitative_test)))

In [None]:
# Impute each qualitative test column with its most frequent value.
# Assigned back rather than using chained fillna(..., inplace=True), which
# does not write through to `test` under pandas Copy-on-Write.
test = test.fillna({column: test[column].mode()[0] for column in qualitative_test})

In [None]:
# Impute each quantitative train column with its mean.
# Assigned back rather than using chained fillna(..., inplace=True), which
# does not write through to `train` under pandas Copy-on-Write.
train = train.fillna({column: train[column].mean() for column in quantitative})

In [None]:
# Non-object columns, derived from the TRAIN frame's dtypes.
# NOTE(review): this list is built from train, not test, so it also contains
# train-only columns until they are removed in a later cell.
quantitative_test = [name for name, kind in train.dtypes.items() if kind != 'object']
print(quantitative_test)
print('Counts: {}'.format(len(quantitative_test)))

In [None]:
# Remove the target from the list of columns to impute in test.
# Removing it by name, not by position, keeps the cell correct whatever the
# column order is and makes it safe to re-run (a second `del ...[1]` would
# silently drop a real feature).
# NOTE(review): presumably index 1 was 'isFraud' — confirm against the column order.
quantitative_test = [column for column in quantitative_test if column != 'isFraud']

In [None]:
# Sanity check: show which column name currently sits at position 1 of the list.
quantitative_test[1]

In [None]:

# Impute each quantitative test column with its mean.
# Assigned back rather than using chained fillna(..., inplace=True), which
# does not write through to `test` under pandas Copy-on-Write.
test = test.fillna({column: test[column].mean() for column in quantitative_test})

In [None]:
# Shapes after column filtering and imputation, train first and test second.
for frame in (train, test):
    print(frame.shape)

Model Building

In [None]:
# Separate the target from the features; test has no target and is copied as is.
y_train = train['isFraud'].copy()
X_train = train.drop(columns=['isFraud'])
X_test = test.copy()

In [None]:
# Shapes of the train and test feature matrices, shown side by side for comparison.
X_train.shape, X_test.shape

In [None]:
# Label Encoding
# One encoder per qualitative column, fitted on the train and test values
# together so labels seen in either frame can be transformed.
for col in qualitative:
    encoder = preprocessing.LabelEncoder()
    train_values = list(X_train[col].values)
    test_values = list(X_test[col].values)
    encoder.fit(train_values + test_values)
    X_train[col] = encoder.transform(train_values)
    X_test[col] = encoder.transform(test_values)

In [None]:
# Check if it is encoded: count the object-typed columns left in each frame.
for frame in (X_train, X_test):
    print(frame.select_dtypes(include='object').shape[1])


In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=False):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64/float16/float32/float64 columns are
        inspected. Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the memory footprint before and after. When false,
        nothing is printed.
    use_float16 : bool, default False
        Allow float columns to be stored as float16. Off by default because
        half precision keeps only about three significant digits and changes
        stored values (for example 2049.0 becomes 2048.0).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # First (smallest) integer type whose inclusive range holds the data.
            target = None
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
        else:
            # Comparisons with NaN are false, so an all-NaN column (or one
            # containing inf) falls through to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        # Only ever shrink a column; never widen or copy it needlessly.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio for a frame that occupies no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the numeric columns of the train features and report the saving.
X_train = reduce_mem_usage(X_train, verbose=True)


In [None]:
# Downcast the numeric columns of the test features and report the saving.
X_test = reduce_mem_usage(X_test, verbose=True)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Mean cross-validated ROC AUC of a default logistic regression.
# The default scorer for classifiers is accuracy, which is uninformative on a
# target as imbalanced as isFraud: always predicting the majority class
# already scores highly.
score = cross_val_score(LogisticRegression(), X_train, y_train, scoring='roc_auc').mean()

In [None]:
# Mean cross-validated score of the logistic regression baseline.
print(score)

In [None]:
#decisiontree
from sklearn.tree import DecisionTreeClassifier
# Mean cross-validated ROC AUC of a decision tree (accuracy is uninformative
# on the imbalanced target). random_state pins the tree's tie-breaking so
# the score is reproducible across runs.
decision_score = cross_val_score(
    DecisionTreeClassifier(random_state=42), X_train, y_train, scoring='roc_auc'
).mean()
print(decision_score)

In [None]:
#randomforest
from sklearn.ensemble import RandomForestClassifier
# Mean cross-validated ROC AUC of a random forest (accuracy is uninformative
# on the imbalanced target). random_state fixes the bootstrap and feature
# sampling so the score is reproducible across runs.
random_score = cross_val_score(
    RandomForestClassifier(random_state=42), X_train, y_train, scoring='roc_auc'
).mean()
print(random_score)


In [None]:
# Fit the final random forest on all training rows; random_state makes the
# fitted model, and therefore the submission, reproducible.
rand_model = RandomForestClassifier(random_state=42)
rand_model.fit(X_train, y_train)
# Score test rows with the probability of the positive class (column 1 of
# predict_proba, matching classes_ == [0, 1]) instead of hard 0/1 labels.
# NOTE(review): assumes the submission is ranked by a probability-based
# metric such as ROC AUC — confirm against the competition rules.
rand_pred = rand_model.predict_proba(X_test)[:, 1]

In [None]:
# Put the predictions into the submission template and write it to disk
# without the index column.
# NOTE(review): relies on the template rows being in the same order as X_test.
sample_submission = sample_submission.assign(isFraud=rand_pred)
sample_submission.to_csv('IEEE_SUBMISSION.csv', index=False)


In [None]:
# Column names of the submission frame.
sample_submission.columns

In [None]:
# Distribution of the values written to the isFraud column.
sample_submission['isFraud'].value_counts()

In [None]:
# Distribution of the values in the isFraud column.
# NOTE(review): same expression as the preceding cell — likely a leftover duplicate.
sample_submission['isFraud'].value_counts()

In [None]:
# First rows of the submission frame as a final visual check.
sample_submission.head()