In [None]:
import numpy as np 
import pandas as pd
import math
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV, StratifiedKFold 
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
import tensorflow as tf
# Fix the TensorFlow and NumPy global random seeds.
tf.random.set_seed(221)
np.random.seed(221)

# Cargo las fuentes

In [None]:
# Read the four competition CSV files (transaction and identity tables for
# the train and test splits), in the same order as before.
train_T, train_I, test_T, test_I = map(
    pd.read_csv,
    (
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
        '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
        '/kaggle/input/ieee-fraud-detection/test_identity.csv',
    ),
)

In [None]:
# Overwrite the test identity column names with the train identity names so
# both tables share one schema. The assignment is positional: it assumes both
# files list the same columns in the same order — TODO confirm.
test_I.columns=train_I.columns

In [None]:
# Left-join the identity table onto every test transaction. The key is named
# explicitly instead of relying on pandas' default of joining on every column
# name the two frames happen to share.
mergedtestset = test_T.merge(test_I, how='left', on='TransactionID')

In [None]:
# Left-join the identity table onto every training transaction. The key is
# named explicitly instead of relying on pandas' default of joining on every
# column name the two frames happen to share.
mergedset = train_T.merge(train_I, how='left', on='TransactionID')

# Encoding

In [None]:
# Single LabelEncoder instance; the label-encoding cells below refit it for
# each column they encode.
labelencoder = LabelEncoder()

## ---------------------------TransactionDelta---------------------------

In [None]:
# Inspection only: show the training rows whose TransactionDT is null.
mergedset.loc[mergedset['TransactionDT'].isna()]

## ---------------------------Product---------------------------

## ONE HOT ENCODING

In [None]:
# One-hot encode ProductCD. Missing values are first filled with the
# placeholder 'none'. The binarizer learns its classes from the training
# frame and is reused unchanged on the test frame. The five hard-coded column
# names require the binarizer to emit exactly five columns.
product_columns = ['prod1','prod2','prod3','prod4','prod5']
binarizer = LabelBinarizer()

train_product = mergedset['ProductCD'].fillna('none')
train_product_onehot = pd.DataFrame(binarizer.fit_transform(train_product), columns=product_columns)
# Index-on-index join: the new columns are attached row by row.
mergedset = mergedset.merge(train_product_onehot, right_index=True, left_index=True)

test_product = mergedtestset['ProductCD'].fillna('none')
test_product_onehot = pd.DataFrame(binarizer.transform(test_product), columns=product_columns)
mergedtestset = mergedtestset.merge(test_product_onehot, right_index=True, left_index=True)

## ---------------------------Card---------------------------

## ONE HOT

In [None]:
# One-hot encode card4. Missing values are first filled with the placeholder
# 'none'. The binarizer is fitted on the training frame and reused unchanged
# on the test frame. The five hard-coded column names require the binarizer
# to emit exactly five columns.
card_name_columns = ['CardName1','CardName2','CardName3','CardName4','CardName5']
binarizer = LabelBinarizer()

train_card4 = mergedset['card4'].fillna('none')
train_card4_onehot = pd.DataFrame(binarizer.fit_transform(train_card4), columns=card_name_columns)
# Index-on-index join: the new columns are attached row by row.
mergedset = mergedset.merge(train_card4_onehot, right_index=True, left_index=True)

test_card4 = mergedtestset['card4'].fillna('none')
test_card4_onehot = pd.DataFrame(binarizer.transform(test_card4), columns=card_name_columns)
mergedtestset = mergedtestset.merge(test_card4_onehot, right_index=True, left_index=True)

In [None]:
# Inspection only: show the training rows whose card1 is null.
mergedset.loc[mergedset['card1'].isna()]

In [None]:
# Replace missing card2 values with 0 in both frames. The result is assigned
# back to the column: fillna(inplace=True) on a column selection mutates an
# intermediate object, which recent pandas versions warn about and which
# leaves the frame unchanged when copy-on-write is enabled.
mergedset['card2'] = mergedset['card2'].fillna(0)
mergedtestset['card2'] = mergedtestset['card2'].fillna(0)

In [None]:
# Replace missing card3 values with 0 in both frames. Assigning the result
# back avoids chained in-place mutation, which recent pandas versions warn
# about and which has no effect when copy-on-write is enabled.
mergedset['card3'] = mergedset['card3'].fillna(0)
mergedtestset['card3'] = mergedtestset['card3'].fillna(0)

In [None]:
# Replace missing card5 values with 0 in both frames. Assigning the result
# back avoids chained in-place mutation, which recent pandas versions warn
# about and which has no effect when copy-on-write is enabled.
mergedset['card5'] = mergedset['card5'].fillna(0)
mergedtestset['card5'] = mergedtestset['card5'].fillna(0)

## Label Encoding

In [None]:
# Integer-encode card6. The encoder is fitted on the training column (missing
# values filled with 'none') and then applied as-is to the test column.
card6_train = mergedset['card6'].fillna('none')
mergedset['card6'] = labelencoder.fit_transform(card6_train)

card6_test = mergedtestset['card6'].fillna('none')
mergedtestset['card6'] = labelencoder.transform(card6_test)

# Show any training rows whose encoded card6 is null.
mergedset[mergedset['card6'].isna()]

## ---------------------------Addr---------------------------

In [None]:
# Inspection only: list the distinct addr1 values in the training frame.
mergedset['addr1'].unique()

In [None]:
# Inspection only: list the distinct addr2 values in the training frame.
mergedset['addr2'].unique()

## ---------------------------Mail---------------------------

In [None]:
# Integer-encode P_emaildomain (missing values become 'none').
# The encoder is fitted once on the union of train and test values so that a
# given domain gets the same code in both frames. Fitting it separately on
# the test frame, as before, produced unrelated codes.
train_values = mergedset['P_emaildomain'].fillna('none')
test_values = mergedtestset['P_emaildomain'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['P_emaildomain'] = labelencoder.transform(train_values)
mergedtestset['P_emaildomain'] = labelencoder.transform(test_values)
# Inspect the resulting codes.
mergedset['P_emaildomain'].unique()

In [None]:
# Integer-encode R_emaildomain (missing values become 'none').
# The encoder is fitted once on the union of train and test values so that a
# given domain gets the same code in both frames. Fitting it separately on
# the test frame, as before, produced unrelated codes.
train_values = mergedset['R_emaildomain'].fillna('none')
test_values = mergedtestset['R_emaildomain'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['R_emaildomain'] = labelencoder.transform(train_values)
mergedtestset['R_emaildomain'] = labelencoder.transform(test_values)
# Inspect the resulting codes.
mergedset['R_emaildomain'].unique()

## ---------------------------Mx---------------------------

In [None]:
# Inspection only: list the distinct M1 values in the training frame.
mergedset['M1'].unique()

In [None]:
# Inspection only: list the distinct M2 values in the training frame.
mergedset['M2'].unique()

In [None]:
# Inspection only: list the distinct M3 values in the training frame.
mergedset['M3'].unique()

In [None]:
# Inspection only: list the distinct M4 values in the training frame.
mergedset['M4'].unique()

In [None]:
# Inspection only: list the distinct M5 values in the training frame.
mergedset['M5'].unique()

In [None]:
# Inspection only: list the distinct M6 values in the training frame.
mergedset['M6'].unique()

In [None]:
# Inspection only: list the distinct M7 values in the training frame.
mergedset['M7'].unique()

In [None]:
# Inspection only: list the distinct M8 values in the training frame.
mergedset['M8'].unique()

In [None]:
# Inspection only: list the distinct M9 values in the training frame.
mergedset['M9'].unique()

## ---------------------------Device---------------------------

## Count Vectorizer

In [None]:
# Token-count features for the free-text DeviceInfo column. max_features=10
# caps the vocabulary at ten terms, which the ten hard-coded column names
# rely on. The vocabulary is learned from the training frame and reused
# unchanged on the test frame. Missing values are filled with 'none' first.
word_columns = ['word1','word2','word3','word4','word5','word6','word7','word8','word9','word10']
vectorizer = CountVectorizer(max_features=10)

train_device_info = mergedset['DeviceInfo'].fillna('none')
train_word_counts = vectorizer.fit_transform(train_device_info).toarray()
# Index-on-index join: the new columns are attached row by row.
mergedset = mergedset.merge(
    pd.DataFrame(train_word_counts, columns=word_columns),
    right_index=True,
    left_index=True,
)

test_device_info = mergedtestset['DeviceInfo'].fillna('none')
test_word_counts = vectorizer.transform(test_device_info).toarray()
mergedtestset = mergedtestset.merge(
    pd.DataFrame(test_word_counts, columns=word_columns),
    right_index=True,
    left_index=True,
)

# ONE HOT

In [None]:
# One-hot encode DeviceType. Missing values are first filled with the
# placeholder 'none'. The binarizer is fitted on the training frame and
# reused unchanged on the test frame. The three hard-coded column names
# require the binarizer to emit exactly three columns.
device_type_columns = ['type1','type2','type3']
binarizer = LabelBinarizer()

train_device_type = mergedset['DeviceType'].fillna('none')
train_device_type_onehot = pd.DataFrame(binarizer.fit_transform(train_device_type), columns=device_type_columns)
# Index-on-index join: the new columns are attached row by row.
mergedset = mergedset.merge(train_device_type_onehot, right_index=True, left_index=True)

test_device_type = mergedtestset['DeviceType'].fillna('none')
test_device_type_onehot = pd.DataFrame(binarizer.transform(test_device_type), columns=device_type_columns)
mergedtestset = mergedtestset.merge(test_device_type_onehot, right_index=True, left_index=True)

## ---------------------------ID---------------------------

## Imputación de nulos

In [None]:
# Fill missing values in the id_* columns listed below. SimpleImputer is used
# with its default strategy; it is fitted on the training frame and the same
# fitted values are applied to the test frame.
id_columns_to_impute = ['id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20',
                        'id_21', 'id_22', 'id_24', 'id_25', 'id_26', 'id_32']
imputer = SimpleImputer()
mergedset[id_columns_to_impute] = imputer.fit_transform(mergedset[id_columns_to_impute])
mergedtestset[id_columns_to_impute] = imputer.transform(mergedtestset[id_columns_to_impute])

## Label Encoding

In [None]:
# Integer-encode id_12 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_12'].fillna('none')
test_values = mergedtestset['id_12'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_12'] = labelencoder.transform(train_values)
mergedtestset['id_12'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_12 is null.
mergedset.loc[mergedset['id_12'].isna()]

In [None]:
# Integer-encode id_15 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_15'].fillna('none')
test_values = mergedtestset['id_15'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_15'] = labelencoder.transform(train_values)
mergedtestset['id_15'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_15 is null.
mergedset.loc[mergedset['id_15'].isna()]

In [None]:
# Integer-encode id_16 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_16'].fillna('none')
test_values = mergedtestset['id_16'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_16'] = labelencoder.transform(train_values)
mergedtestset['id_16'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_16 is null.
mergedset.loc[mergedset['id_16'].isna()]

In [None]:
# Integer-encode id_23 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_23'].fillna('none')
test_values = mergedtestset['id_23'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_23'] = labelencoder.transform(train_values)
mergedtestset['id_23'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_23 is null.
mergedset.loc[mergedset['id_23'].isna()]

In [None]:
# Integer-encode id_27 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_27'].fillna('none')
test_values = mergedtestset['id_27'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_27'] = labelencoder.transform(train_values)
mergedtestset['id_27'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_27 is null.
mergedset.loc[mergedset['id_27'].isna()]

In [None]:
# Integer-encode id_28 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_28'].fillna('none')
test_values = mergedtestset['id_28'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_28'] = labelencoder.transform(train_values)
mergedtestset['id_28'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_28 is null.
mergedset.loc[mergedset['id_28'].isna()]

In [None]:
# Integer-encode id_29 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_29'].fillna('none')
test_values = mergedtestset['id_29'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_29'] = labelencoder.transform(train_values)
mergedtestset['id_29'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_29 is null.
mergedset.loc[mergedset['id_29'].isna()]

In [None]:
# Integer-encode id_30 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_30'].fillna('none')
test_values = mergedtestset['id_30'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_30'] = labelencoder.transform(train_values)
mergedtestset['id_30'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_30 is null.
mergedset.loc[mergedset['id_30'].isna()]

In [None]:
# Integer-encode id_34 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_34'].fillna('none')
test_values = mergedtestset['id_34'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_34'] = labelencoder.transform(train_values)
mergedtestset['id_34'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_34 is null.
mergedset.loc[mergedset['id_34'].isna()]

In [None]:
# Integer-encode id_35 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_35'].fillna('none')
test_values = mergedtestset['id_35'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_35'] = labelencoder.transform(train_values)
mergedtestset['id_35'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_35 is null.
mergedset.loc[mergedset['id_35'].isna()]

In [None]:
# Integer-encode id_36 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_36'].fillna('none')
test_values = mergedtestset['id_36'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_36'] = labelencoder.transform(train_values)
mergedtestset['id_36'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_36 is null.
mergedset.loc[mergedset['id_36'].isna()]

In [None]:
# Integer-encode id_37 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_37'].fillna('none')
test_values = mergedtestset['id_37'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_37'] = labelencoder.transform(train_values)
mergedtestset['id_37'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_37 is null.
mergedset.loc[mergedset['id_37'].isna()]

In [None]:
# Integer-encode id_38 (missing values become 'none'). The encoder is fitted
# once on the union of train and test values so a given category gets the
# same code in both frames; fitting separately on test gave unrelated codes.
train_values = mergedset['id_38'].fillna('none')
test_values = mergedtestset['id_38'].fillna('none')
labelencoder.fit(pd.concat([train_values, test_values], ignore_index=True))
mergedset['id_38'] = labelencoder.transform(train_values)
mergedtestset['id_38'] = labelencoder.transform(test_values)
# Show any training rows whose encoded id_38 is null.
mergedset.loc[mergedset['id_38'].isna()]

# ID_31

## Frequency Encoding (stored in the `Meanencodingid31` column: each browser's share of rows, not a target mean)

In [None]:
# Replace missing id_31 values with the placeholder 'none' in both frames.
# Assigning the result back avoids chained in-place mutation, which recent
# pandas versions warn about and which has no effect when copy-on-write is
# enabled.
mergedset['id_31'] = mergedset['id_31'].fillna('none')
mergedtestset['id_31'] = mergedtestset['id_31'].fillna('none')

In [None]:
def _tag_browser(frame):
    """Add a coarse 'Browser' label derived from the free-text id_31 column.

    The patterns are applied in order, so when several match a value the last
    one wins, exactly as in the original sequence of assignments. Rows that
    match no pattern but have a non-null id_31 are labelled 'Other'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with an 'id_31' column of strings; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the 'Browser' column (re)computed.
    """
    patterns = [
        ('chrome', 'Chrome'),
        ('samsung', 'Samsung'),
        # Whole word only: a bare "ie" substring also matches "webview".
        (r'\bie\b', 'ie'),
        ('safari', 'Safari'),
        ('firefox', 'Firefox'),
        ('opera', 'Opera'),
        ('edge', 'Edge'),
        ('none', 'None'),
    ]
    id31_lower = frame['id_31'].str.lower()
    # Start from an empty column so re-running the cell cannot keep stale labels.
    frame['Browser'] = pd.Series(np.nan, index=frame.index, dtype=object)
    for pattern, label in patterns:
        # na=False keeps the mask boolean even if id_31 still holds nulls.
        frame.loc[id31_lower.str.contains(pattern, na=False), 'Browser'] = label
    frame.loc[frame['Browser'].isna() & frame['id_31'].notna(), 'Browser'] = 'Other'
    return frame


# Apply the same rules to both frames so train and test labels cannot diverge.
mergedset = _tag_browser(mergedset)
mergedtestset = _tag_browser(mergedtestset)

In [None]:
# Count rows per Browser label (non-null TransactionID values per group) and
# keep the result as a {label: count} dict, separately for each frame.
# NOTE(review): the test frame gets its own counts, so the same browser can
# end up with a different encoded value in train and test — confirm this is
# intended.
train_browser_counts = mergedset.groupby(['Browser'])['TransactionID'].count()
meanencoding = train_browser_counts.to_dict()

test_browser_counts = mergedtestset.groupby(['Browser'])['TransactionID'].count()
meanencodingtest = test_browser_counts.to_dict()

In [None]:
# Replace each row's Browser label with the row count stored for that label.
# Each frame is mapped through its own dictionary.
mergedset['Meanencodingid31'] = mergedset['Browser'].map(meanencoding)
mergedtestset['Meanencodingid31'] = mergedtestset['Browser'].map(meanencodingtest)

In [None]:
# Turn the per-label counts into shares by dividing by the sum of all
# per-label counts, computed separately for each frame.
train_browser_total = mergedset.groupby(['Browser'])['TransactionID'].count().sum()
mergedset['Meanencodingid31'] = mergedset['Meanencodingid31'] / train_browser_total

test_browser_total = mergedtestset.groupby(['Browser'])['TransactionID'].count().sum()
mergedtestset['Meanencodingid31'] = mergedtestset['Meanencodingid31'] / test_browser_total


# ID_33

## Frequency Encoding (stored in the `Meanencodingid33` column: each value's share of rows, not a target mean)

In [None]:
# Replace missing id_33 values with the placeholder 'none' in both frames.
# Assigning the result back avoids chained in-place mutation, which recent
# pandas versions warn about and which has no effect when copy-on-write is
# enabled.
mergedset['id_33'] = mergedset['id_33'].fillna('none')
mergedtestset['id_33'] = mergedtestset['id_33'].fillna('none')

In [None]:
# Count rows per id_33 value (non-null TransactionID values per group) and
# keep the result as a {value: count} dict, separately for each frame.
# NOTE(review): the test frame gets its own counts, so the same id_33 value
# can end up with a different encoded value in train and test — confirm this
# is intended.
train_id33_counts = mergedset.groupby(['id_33'])['TransactionID'].count()
meanencoding = train_id33_counts.to_dict()

test_id33_counts = mergedtestset.groupby(['id_33'])['TransactionID'].count()
meanencodingtest = test_id33_counts.to_dict()

In [None]:
# Replace each row's id_33 value with the row count stored for that value.
# Each frame is mapped through its own dictionary.
mergedset['Meanencodingid33'] = mergedset['id_33'].map(meanencoding)
mergedtestset['Meanencodingid33'] = mergedtestset['id_33'].map(meanencodingtest)

In [None]:
# Turn the per-value counts into shares by dividing by the sum of all
# per-value counts, computed separately for each frame.
train_id33_total = mergedset.groupby(['id_33'])['TransactionID'].count().sum()
mergedset['Meanencodingid33'] = mergedset['Meanencodingid33'] / train_id33_total

test_id33_total = mergedtestset.groupby(['id_33'])['TransactionID'].count().sum()
mergedtestset['Meanencodingid33'] = mergedtestset['Meanencodingid33'] / test_id33_total

## ---------------------------Cx---------------------------

In [None]:
# Replace missing values with 0 in the selected C* columns of both frames.
c_columns = ['C1','C2','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']
for frame in (mergedset, mergedtestset):
    frame[c_columns] = frame[c_columns].fillna(0)

## ---------------------------Dx---------------------------

In [None]:
# Replace missing values with 0 in the selected D* columns of both frames.
d_columns = ['D2','D3','D4','D5','D6','D8','D10','D11','D13','D15']
for frame in (mergedset, mergedtestset):
    frame[d_columns] = frame[d_columns].fillna(0)

## ---------------------------Vx---------------------------

# Imputación de nulos

In [None]:
# Fill missing values in the selected V* columns. SimpleImputer is used with
# its default strategy; it is fitted on the training frame and the same
# fitted values are applied to the test frame.
v_columns = [
    'V44', 'V45', 'V48', 'V53', 'V55', 'V70', 'V72', 'V75', 'V83', 'V87',
    'V90', 'V91', 'V96', 'V129', 'V130', 'V131', 'V133', 'V140', 'V143',
    'V149', 'V156', 'V160', 'V165', 'V172', 'V187', 'V196', 'V201', 'V206',
    'V225', 'V258', 'V260', 'V266', 'V281', 'V283', 'V294', 'V308', 'V310',
    'V312', 'V314', 'V317', 'V320', 'V331',
]
imputer = SimpleImputer()
mergedset[v_columns] = imputer.fit_transform(mergedset[v_columns])
mergedtestset[v_columns] = imputer.transform(mergedtestset[v_columns])
# Author's note: strategy='most_frequent' lowers the score.

# Defino el data set final 

In [None]:
# Keep only the identifier, the target and the model features in the
# training frame. The groups are concatenated in the original column order.
key_and_target = ['TransactionID','isFraud']
transaction_features = ['TransactionDT','TransactionAmt','card1','card2','card3','card5','card6']
id_features = ['id_17','id_18','id_19','id_20','id_32']
c_features = ['C1','C2','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']
d_features = ['D2','D3','D4','D5','D6','D8','D10','D11','D13','D15']
v_features = [
    'V44', 'V45', 'V48', 'V53', 'V55', 'V70', 'V72', 'V75', 'V83', 'V87',
    'V90', 'V91', 'V96', 'V129', 'V130', 'V131', 'V133', 'V140', 'V143',
    'V149', 'V156', 'V160', 'V165', 'V172', 'V187', 'V196', 'V201', 'V206',
    'V225', 'V258', 'V260', 'V266', 'V281', 'V283', 'V294', 'V308', 'V310',
    'V312', 'V314', 'V317', 'V320', 'V331',
]
word_features = ['word1','word2','word3','word4','word5','word6','word7','word8','word9','word10']
product_features = ['prod1','prod2','prod3','prod4','prod5']
device_type_features = ['type1','type2','type3']
frequency_features = ['Meanencodingid31', 'Meanencodingid33']
card_name_features = ['CardName1','CardName2','CardName3','CardName4','CardName5']

train_columns = (
    key_and_target
    + transaction_features
    + id_features
    + c_features
    + d_features
    + v_features
    + word_features
    + product_features
    + device_type_features
    + frequency_features
    + card_name_features
)
mergedset = mergedset.loc[:, train_columns]

In [None]:
# Keep only the identifier and the model features in the test frame (there is
# no isFraud column here). The groups are concatenated in the original
# column order.
key_only = ['TransactionID']
transaction_features = ['TransactionDT','TransactionAmt','card1','card2','card3','card5','card6']
id_features = ['id_17','id_18','id_19','id_20','id_32']
c_features = ['C1','C2','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']
d_features = ['D2','D3','D4','D5','D6','D8','D10','D11','D13','D15']
v_features = [
    'V44', 'V45', 'V48', 'V53', 'V55', 'V70', 'V72', 'V75', 'V83', 'V87',
    'V90', 'V91', 'V96', 'V129', 'V130', 'V131', 'V133', 'V140', 'V143',
    'V149', 'V156', 'V160', 'V165', 'V172', 'V187', 'V196', 'V201', 'V206',
    'V225', 'V258', 'V260', 'V266', 'V281', 'V283', 'V294', 'V308', 'V310',
    'V312', 'V314', 'V317', 'V320', 'V331',
]
word_features = ['word1','word2','word3','word4','word5','word6','word7','word8','word9','word10']
product_features = ['prod1','prod2','prod3','prod4','prod5']
device_type_features = ['type1','type2','type3']
frequency_features = ['Meanencodingid31', 'Meanencodingid33']
card_name_features = ['CardName1','CardName2','CardName3','CardName4','CardName5']

test_columns = (
    key_only
    + transaction_features
    + id_features
    + c_features
    + d_features
    + v_features
    + word_features
    + product_features
    + device_type_features
    + frequency_features
    + card_name_features
)
mergedtestset = mergedtestset.loc[:, test_columns]

In [None]:
# Feature matrix: every training column except the identifier and the target.
non_feature_columns = ['TransactionID', 'isFraud']
points = mergedset.loc[:, ~mergedset.columns.isin(non_feature_columns)]

In [None]:
# Target vector: the isFraud column of the training frame.
labels = mergedset['isFraud']

In [None]:
# Names of the feature columns; used later to drop the features from the test
# frame when the submission file is built.
featurelist= points.columns

In [None]:
# Inspection only: (rows, columns) of the feature matrix.
points.shape

In [None]:
# Inspection only: length of the target vector.
labels.shape

# Genero Train y Test Sets del set de entrenamiento

In [None]:
# Hold out half of the labelled data for evaluation.
# random_state pins the split itself, so it does not depend on how much of
# NumPy's global random stream earlier cells have consumed. stratify keeps
# the fraud / non-fraud ratio the same in both halves.
X_train, X_test, y_train, y_test = train_test_split(
    points,
    labels,
    test_size=0.5,
    random_state=221,
    stratify=labels,
)

In [None]:
# Standardise the features: the scaler is fitted on the training half only
# and the same fitted parameters are applied to the held-out half.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspection only: shape of the scaled training features.
X_train.shape

In [None]:
# Inspection only: shape of the training labels.
y_train.shape

In [None]:
# Inspection only: shape of the scaled held-out features.
X_test.shape

In [None]:
# Inspection only: shape of the held-out labels.
y_test.shape

In [None]:
# Re-seed immediately before building the network.
tf.random.set_seed(221)
np.random.seed(221)

# Size the input layer from the data instead of a hard-coded 102, so the
# network keeps working if the feature list changes.
n_features = X_train.shape[1]

# Two hidden ReLU layers of 75 units and a single sigmoid output unit.
classifier = tf.keras.models.Sequential()
classifier.add(tf.keras.layers.Dense(75, activation='relu', kernel_initializer='glorot_uniform', input_dim=n_features))
classifier.add(tf.keras.layers.Dense(75, activation='relu'))
classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Alternative metric noted by the author: keras.metrics.AUC().
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 10% of the training half is set aside by Keras for validation.
classifier.fit(X_train, y_train, batch_size=64, epochs=20, validation_split=0.1)

In [None]:
# ROC AUC on the held-out half. The predicted probabilities are passed
# directly: thresholding them at 0.5 first collapses the ROC curve to a
# single operating point and misstates the ranking quality.
roc_auc_score(y_test, classifier.predict(X_test).ravel())

In [None]:
# Accuracy on the held-out half, with predictions thresholded at 0.5.
accuracy_score(y_test, classifier.predict( X_test)>0.5 )

In [None]:
# Per-class report on the held-out half, with predictions thresholded at 0.5.
print(classification_report(y_test,classifier.predict( X_test)>0.5 ))

# Genero Prediccion de la competencia

In [None]:
# Score the competition test set. Columns are selected through featurelist
# (the training feature names) so the scaler and the network receive the same
# features, in the same order, that they were fitted on. Previously this
# relied on the two hand-written column lists staying in sync.
Y_test = classifier.predict(sc.transform(mergedtestset[featurelist]))

In [None]:
# Build the submission: drop every feature column from the test frame, attach
# the predicted probabilities as isFraud, report the shape and write the CSV.
finalpredKeras = mergedtestset.drop(columns=featurelist)
finalpredKeras = finalpredKeras.assign(isFraud=Y_test)
print(finalpredKeras.shape)
finalpredKeras.to_csv("prediccionKeras.csv", index=False)

## Score en la competencia:
0.846155