## Loading Libraries and Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os,gc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')

# train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
# test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')



In [None]:
sample = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [None]:
print(train_transaction.shape,test_transaction.shape)
# print(train_id.shape, test_id.shape)
print(sample.shape)

1. Size of both train and test data is comparable. 
2. Number of features is pretty high.

## DATA PREPROCESSING

In [None]:
train_transaction.head(10)

In [None]:
train_transaction.info(max_cols=400)

There are many columns with large number of missing values.
We can drop columns with more than 50% missing values.

In [None]:
# Collect every column in which at least half of the training rows are missing.
null_counts = train_transaction.isna().sum()
half_of_rows = train_transaction.shape[0] / 2
columns_to_drop = null_counts[null_counts >= half_of_rows].index.tolist()
del null_counts, half_of_rows

In [None]:
# Remove the columns listed in `columns_to_drop` from both frames.
# The list was built from the training data, so train and test keep the same features.
train_transaction = train_transaction.drop(columns=columns_to_drop)
test_transaction = test_transaction.drop(columns=columns_to_drop)

print("No of columns dropped {}".format(len(columns_to_drop)))
del columns_to_drop
gc.collect()

Let's preprocess the object type columns first.

In [None]:
object_columns = train_transaction.select_dtypes(include=object).columns
print("Number of categorical columns: {}".format(len(object_columns)))

In [None]:
# For each object-typed column, report how many values are missing and which values occur.
for col_name in object_columns:
    col = train_transaction[col_name]
    print("Column Name : {}".format(col_name))
    print("-------------> No of missing values: {}".format(col.isna().sum()))
    print("-------------> Unique values: {}".format(col.unique()))

Some insights:
1. Categorical columns with no missing values : ProductCD
2. Categorical columns with few missing values : card4,card6
3. Categorical columns with many missing values : P_emaildomain, M6
4. Categorical columns with huge number of missing values : M1,M2,M3,M4

Lets plot the value counts graphs for these columns and see if we can fill the missing values with the mode value.

In [None]:
# Bar chart of category frequencies for every object column (training data).
# The grid is sized from the number of columns instead of a fixed 3x3, and each
# chart is drawn on the Axes returned by plt.subplots() rather than on panels
# re-created through plt.subplot().
n_grid_cols = 3
n_grid_rows = max(1, -(-len(object_columns) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 6 * n_grid_rows), squeeze=False)
for ax, col in zip(axes.flat, object_columns):
    if col != 'P_emaildomain':
        train_transaction[col].value_counts().plot(kind='bar', ax=ax)
    else:
        # Too many e-mail domains to show individually: keep the ones whose
        # share exceeds the threshold and lump the rest into one 'other' bar.
        threshold = 0.02
        prob = train_transaction[col].value_counts(normalize=True)
        mask = prob > threshold
        tail_prob = prob.loc[~mask].sum()
        prob = prob.loc[mask]
        prob['other'] = tail_prob
        prob.plot(kind='bar', ax=ax)
    ax.set_title(col)
# Hide any panels left over in the last row.
for unused_ax in axes.flat[len(object_columns):]:
    unused_ax.set_visible(False)

There are many interesting things here,
1. 'W' is the ProductCD in over 400,000 transactions.
2. card4 type is 'visa' in over 350,000 transactions and 'mastercard' in 200,000 transactions. Other types are rare.
3. card6 value is 'debit' in approx. 430,000 transactions and 'credit' in approx. 150,000 transactions. Other values are extremely rare.
4. P_emaildomain is 'google.com' in over 40% of the transactions and 'yahoo.com' in 20% of the transactions. A few other domains are comparatively less frequent, and most are rare.
5. For M2 and M3, 'T' is the category for most transactions.
6. For M1, 'F' category is extremely rare.
7. In case of M4, 'M0' is the most occurred value.
8. In M6, both 'T' and 'F' occur almost equally. Still, we will fill the missing values with 'F' as it is the mode.

From these insights, we can safely fill the missing values with the mode of the object columns.

Now, plotting the object columns from test transactions.

In [None]:
# Bar chart of category frequencies for every object column (test data).
# The grid is sized from the number of columns instead of a fixed 3x3, and each
# chart is drawn on the Axes returned by plt.subplots() rather than on panels
# re-created through plt.subplot().
n_grid_cols = 3
n_grid_rows = max(1, -(-len(object_columns) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 6 * n_grid_rows), squeeze=False)
for ax, col in zip(axes.flat, object_columns):
    if col != 'P_emaildomain':
        test_transaction[col].value_counts().plot(kind='bar', ax=ax)
    else:
        # Too many e-mail domains to show individually: keep the ones whose
        # share exceeds the threshold and lump the rest into one 'other' bar.
        threshold = 0.02
        prob = test_transaction[col].value_counts(normalize=True)
        mask = prob > threshold
        tail_prob = prob.loc[~mask].sum()
        prob = prob.loc[mask]
        prob['other'] = tail_prob
        prob.plot(kind='bar', ax=ax)
    ax.set_title(col)
# Hide any panels left over in the last row.
for unused_ax in axes.flat[len(object_columns):]:
    unused_ax.set_visible(False)

* Above insights hold true for test data too.

In [None]:
# Impute missing categories with the most frequent value of the training column.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on `frame[col]` operates on a temporary Series:
# pandas deprecates that pattern, and under Copy-on-Write it leaves the frame
# unchanged.
#
# The training mode is used for both frames so the test set is imputed with
# the same value the model sees during training.
for col in object_columns:
    fill_value = train_transaction[col].mode()[0]
    train_transaction[col] = train_transaction[col].fillna(fill_value)
    test_transaction[col] = test_transaction[col].fillna(fill_value)

Now, let's see categorical features with numeric values.
[From the competition host](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203)

In [None]:
# Categorical features according to the competition host:
#   ProductCD
#   card1 - card6
#   addr1, addr2
#   Pemaildomain, Remaildomain
#   M1 - M9
#
# A few M* features were handled above; the others were dropped because of
# >50% missing values.
# The list below holds the categorical features that are stored as numbers.
cat_num_features = [
    'addr1', 'addr2',
    'card1', 'card2', 'card3', 'card5',
]


In [None]:
# For each numerically coded categorical column, report the number of missing
# values and how often the most frequent value occurs.
for col_name in cat_num_features:
    col = train_transaction[col_name]
    top_count = col.value_counts().values[0]
    print("Column Name : {}".format(col_name))
    print("-------------> No of missing values: {}".format(col.isna().sum()))
    print("Mode value {} occurred in {} transactions \n".format(col.mode()[0], top_count))

In [None]:
# Filling the missing values with the mode of the training column.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on `frame[col]` operates on a temporary Series:
# pandas deprecates that pattern, and under Copy-on-Write it leaves the frame
# unchanged.
#
# The training mode is used for both frames so the test set is imputed with
# the same value the model sees during training.
for col in cat_num_features:
    fill_value = train_transaction[col].mode()[0]
    train_transaction[col] = train_transaction[col].fillna(fill_value)
    test_transaction[col] = test_transaction[col].fillna(fill_value)
del cat_num_features
gc.collect()

Now let's create a list of numerical features with missing values.

In [None]:
# Names of the numeric training columns that contain at least one missing value.
numeric_columns = train_transaction.select_dtypes(include=np.number).columns
numeric_missing = [
    col for col in numeric_columns
    if train_transaction[col].isna().any()
]
del numeric_columns
print(len(numeric_missing))

In [None]:
train_transaction[numeric_missing].describe()

Here we can see that for most V_ columns, the median and mode values are the same.

In [None]:
# For every numeric column with gaps, print the missing count next to the mode
# and the median so the two candidate fill values can be compared.
for k, col_name in enumerate(numeric_missing):
    col = train_transaction[col_name]
    print(k)
    print("Column {} has {} missing values".format(col_name, col.isna().sum()))
    print("Mode value {} occurred in {} transactions".format(col.mode()[0], col.value_counts().values[0]))
    # Only the median is formatted here; the template has a single placeholder.
    print("Median value {} \n".format(col.median()))

In [None]:
# Filling the missing values with the median of the training column.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on `frame[col]` operates on a temporary Series:
# pandas deprecates that pattern, and under Copy-on-Write it leaves the frame
# unchanged.
#
# The training median is used for both frames so the test set is imputed with
# the same value the model sees during training.
for col in numeric_missing:
    fill_value = train_transaction[col].median()
    train_transaction[col] = train_transaction[col].fillna(fill_value)
    test_transaction[col] = test_transaction[col].fillna(fill_value)

# `numeric_missing` only lists columns with gaps in the training data. The test
# frame may have gaps in other numeric columns, so impute those as well (again
# with the training median) to leave no missing values behind.
test_only_gaps = [
    col for col in test_transaction.select_dtypes(include=np.number).columns
    if col in train_transaction.columns and test_transaction[col].isna().any()
]
for col in test_only_gaps:
    test_transaction[col] = test_transaction[col].fillna(train_transaction[col].median())
del test_only_gaps

# Number of columns that still contain missing values in train and test.
print(train_transaction.isna().any().sum(), test_transaction.isna().any().sum())
del numeric_missing
gc.collect()

Now, there are no missing values in our data. Let's start with handling categorical features.

In [None]:
# Label-encode every object column.
# Each encoder is fitted on the train and test values together, so a category
# that appears in only one of the frames still receives a code.
for column in object_columns:
    train_values = list(train_transaction[column].values)
    test_values = list(test_transaction[column].values)
    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)
    train_transaction[column] = encoder.transform(train_values)
    test_transaction[column] = encoder.transform(test_values)

In [None]:
train_transaction[object_columns].head()


In [None]:
test_transaction[object_columns].head()

In [None]:
# Report how many non-numeric columns remain in the training frame.
# The count is printed explicitly because a notebook cell only displays its
# last expression, which here is gc.collect(). The column count is read from
# the frame's shape, so no column sums have to be computed.
print(train_transaction.select_dtypes(exclude=np.number).shape[1])
del object_columns
gc.collect()

Now, all columns in the data have numeric values.

## EDA

Now that our data contains no missing values and no categorical values, we can start plotting some graphs to get intuition about the data.

In [None]:
train_transaction.head()

In [None]:
train_transaction.describe()

Here,
1. Mean of the 'isFraud' column is 0.034, this tells us that the no. of 0s in the columns is way greater than the number of 1s.
2. In most of V_ columns, the max value is way greater than the mean and median. Outliers are present.


In [None]:
# Let's plot the histogram of isFraud column.
train_transaction['isFraud'].plot(kind='hist')

The data is highly imbalanced. We can downsample the '0' class or upsample '1' class, but for now let's continue with the imbalanced data and check the score.

Number of features is over 200, we can't plot pairplots or heatmaps, It will take up all of kernel's ram. Let's skip to training and get a baseline score.

In [None]:
# X_train,X_val,y_train,y_val = train_test_split(train_transaction.drop(['isFraud'],axis=1), train_transaction['isFraud'], test_size=0.2)

In [None]:
# train_transaction.to_csv("train_transaction.csv",sep= ',',index=False)

In [None]:
# test_transaction.to_csv("test_transaction.csv",sep= ',',index=False)

In [None]:
Y_train = train_transaction['isFraud']

In [None]:
Y_train.shape

In [None]:
X_train = train_transaction.drop(['isFraud'],axis=1)

In [None]:
X_train.shape


In [None]:
X_test = test_transaction

In [None]:
X_test.shape


I prefer manual tuning for the baseline model; you can use Hyperopt for hyperparameter tuning.

In [None]:
def _lgbm_params(n_estimators, **extra):
    """Build one LightGBM parameter set for the binary fraud classifier.

    Every configuration tried below shares the objective and the learning
    rate; only the number of trees and the optional extra settings differ.

    Args:
        n_estimators: Number of boosting rounds.
        **extra: Additional LightGBM parameters, appended after the shared ones.

    Returns:
        A new dict of keyword arguments for LGBMClassifier.
    """
    return {
        'objective': 'binary',
        'n_estimators': n_estimators,
        'learning_rate': 0.1,
        **extra,
    }


# Tree-shape and sampling settings shared by params5, params6 and params7.
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# `subsample_freq` is non-zero -- confirm whether `subsample` is active here.
_tuned = {
    'num_leaves': 50,
    'max_depth': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
}

# Score= .7427
params = _lgbm_params(300, subsample=0.8)
# Score= .7306
params1 = _lgbm_params(200)
#Score= .7446
params2 = _lgbm_params(300)
# Score=.774
params3 = _lgbm_params(600)
#Score= .7666
params4 = _lgbm_params(500)
#Score= .7711
params5 = _lgbm_params(500, **_tuned)
#Score=.78109
params6 = _lgbm_params(600, **_tuned)
#Score=.7863
params7 = _lgbm_params(700, **_tuned)

In [None]:
clf = LGBMClassifier(**params7, random_state=108)
clf.fit(X_train,Y_train)

In [None]:
# preds = clf.predict(X_val)
# roc_auc_score(y_val, preds)

In [None]:
predictions = clf.predict_proba(X_test)
sample['isFraud'] = predictions[:,1]
sample.to_csv('submission_lgbm.csv', index=False)

In [None]:
from sklearn.svm import SVC


In [None]:
# Linear-kernel support vector classifier; probability=True is requested so
# that predict_proba can be called on the fitted model.
# NOTE(review): scikit-learn describes `gamma` as a coefficient for the 'rbf',
# 'poly' and 'sigmoid' kernels, and `decision_function_shape` as ignored for
# binary classification -- confirm whether these two arguments are intended.
svm_clf = SVC(
    kernel="linear",
    C=1000,
    gamma=100,
    probability=True,
    decision_function_shape='ovo',
    random_state=108,
)


In [None]:
svm_clf

In [None]:
svm_clf.fit(X_train,Y_train)


In [None]:
predictions = svm_clf.predict_proba(X_test)
sample['isFraud'] = predictions[:,1]
sample.to_csv('submission_svm.csv', index=False)

The code below can be used to download the .csv generated in your Kaggle kernel; this way you can submit the predictions without having to commit the kernel again and again. [Thanks to Rachel](https://www.kaggle.com/rtatman/download-a-csv-file-from-a-kernel)

In [None]:
# def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
#     csv = df.to_csv(index=False)
#     b64 = base64.b64encode(csv.encode())
#     payload = b64.decode()
#     html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
#     html = html.format(payload=payload,title=title,filename=filename)
#     return HTML(html)
# create_download_link(sample)

In [None]:
# train_transaction = pd.read_csv('../input/cleandata/train_transaction.csv')


In [None]:
# test_transaction = pd.read_csv('../input/cleandata/test_transaction.csv')

In [None]:
rf = RandomForestClassifier(n_estimators = 1000,random_state=121,min_samples_split = 2, bootstrap = False, max_depth = 5)


In [None]:
rf

In [None]:
rf.fit(X_train,Y_train)


In [None]:
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
X_test.fillna(999, inplace=True)


In [None]:
predictions = rf.predict_proba(X_test)


In [None]:
# sample = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [None]:
predictions[:,1]

In [None]:
sample['isFraud'] = predictions[:,1]
sample.to_csv('submission_rf.csv', index=False)

In [None]:
from sklearn import tree


In [None]:
DTree = tree.DecisionTreeClassifier(random_state=0, criterion='entropy',max_depth=8,splitter='best', min_samples_split=30)

In [None]:
DTree = DTree.fit(X_train,Y_train)

In [None]:
pred = DTree.predict(X_train)


In [None]:
pred

In [None]:
# Write the decision tree's fraud probability (positive-class column of
# predict_proba) for the test set.
# `predictions` cannot be used here: at this point it holds the random
# forest's two-column predict_proba output, not anything from the tree.
sample['isFraud'] = DTree.predict_proba(X_test)[:, 1]
sample.to_csv('submission_dt.csv', index=False)

In [None]:
X_train


In [None]:
from sklearn.preprocessing import MinMaxScaler


In [None]:
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

## REFERENCES

1. I haven't seen any kernels from this competition yet, but I would like to thank artgor for his amazing EDA kernels. I have done a basic analysis of some of his kernels and learned a lot from them.

If you think there are mistakes or improvements can be made, please comment :)