In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Locations of the transaction tables for the training and test sets.
train_csv_path = '/kaggle/input/ieee-fraud-detection/train_transaction.csv'
test_csv_path = '/kaggle/input/ieee-fraud-detection/test_transaction.csv'

# Load both tables with pandas' default parsing options.
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Optional feature subset, currently disabled: the commented-out lines below would
# define a list of column names and restrict the test frame to those columns.
#features_to_use = ['TransactionDT','TransactionAmt','ProductCD','card1','card2','card3','card4','card5','card6','P_emaildomain','R_emaildomain','addr1','addr2','C1','C2','C4','C5','C6',
#                   'C8','C9','C11','C12','C13','C14','D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15','M4','M5']
# NOTE: this is a plain assignment, not a copy. research_data and df_train refer to
# the same DataFrame object, so in-place changes made through one name are visible
# through the other.
research_data=df_train
#df_test = df_test[features_to_use]

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Every column whose dtype is one of int16/int32/int64/float16/float32/float64
    is converted to the narrowest dtype of the same family (integer or float)
    whose representable range contains the column's minimum and maximum. The
    range check is inclusive, so a column whose extreme value equals a dtype's
    limit (for example a maximum of exactly 127) still gets that dtype.

    Caveat: the float check looks only at the value range, not at precision.
    float16 keeps roughly three significant decimal digits, so float columns
    converted to it may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            # Non-numeric columns (and dtypes not listed above) are left untouched.
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try integer widths from narrowest to widest and stop at the first fit.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(int_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            for float_type in (np.float16, np.float32):
                bounds = np.finfo(float_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # Reached when no narrower float fits, including the case where
                # min/max are NaN and every comparison above is False.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
# Shrink the numeric dtypes of the training frame. reduce_mem_usage modifies the
# frame it receives and returns that same frame, so research_data still refers to
# the same object after this line.
research_data=reduce_mem_usage(research_data)

In [None]:
# One LabelEncoder instance shared by the encoding code in the following cells;
# it is fitted again for each column it is applied to.
encoder = preprocessing.LabelEncoder()
# Labels of the training-frame columns selected by select_dtypes(include='object');
# these are the columns treated as categorical below.
categorial_columns = research_data.select_dtypes(include='object').columns

In [None]:
# Label-encode each categorical column with ONE mapping shared by train and test.
# Fitting the encoder separately on each frame derives the codes from that frame's
# own set of categories, so a category can get a different integer in train than
# in test whenever the two sets differ. Fitting once on the values of both frames
# keeps the codes consistent.
for col in categorial_columns:
    train_values = research_data[col].fillna('missing value')
    test_values = df_test[col].fillna('missing value')
    encoder.fit(train_values.tolist() + test_values.tolist())
    research_data[col] = encoder.transform(train_values)
    df_test[col] = encoder.transform(test_values)

In [None]:
def process_categorial_columns(data, categorical_columns):
    """Label-encode the listed columns of ``data`` in place and return ``data``.

    For each column name yielded by ``categorical_columns``, missing values are
    first replaced with the literal string 'missing value'; the column is then
    overwritten with the integer codes produced by the module-level ``encoder``,
    which is re-fitted on that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are encoded; modified in place.
    categorical_columns : iterable
        Column names to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for column_name in categorical_columns:
        # Give missing entries an explicit category of their own.
        data[column_name] = data[column_name].fillna('missing value')
        # Replace the column with the codes from a fit on this column only.
        data[column_name] = encoder.fit_transform(data[column_name])
    return data
def fill_nan_values(data):
    """Fill missing values in the numeric columns of ``data`` with the column median.

    Only numeric columns that actually contain missing values are touched, so
    the median is not computed for complete columns. Non-numeric columns are
    skipped and keep any missing values they have, because a median is not
    defined for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    for col in data.columns:
        column = data[col]
        if not pd.api.types.is_numeric_dtype(column):
            continue
        if column.isnull().any():
            data[col] = column.fillna(column.median())
    return data

In [None]:
# Impute missing values; each frame is filled from its own column medians.
research_data = fill_nan_values(research_data)
df_test = fill_nan_values(df_test)
# No further encoding happens here. The object-typed columns listed in
# categorial_columns are already label-encoded in an earlier cell. The previous
# calls process_categorial_columns(frame, frame) passed the DataFrame itself as
# the column list; iterating a DataFrame yields every column name, so every
# column (numeric features, TransactionID, TransactionDT, ...) was replaced by
# label codes fitted separately on train and on test. The same raw value could
# then map to different codes in the two frames.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Order the training rows by TransactionDT and build the feature matrix from them.
X = research_data.sort_values('TransactionDT').drop(['TransactionDT','isFraud'], axis=1)
# Take the labels from the same frame, selected by X's index, so features and
# labels are row-aligned by construction instead of relying on two separate sorts
# producing the same order.
y = research_data.loc[X.index, 'isFraud']
# NOTE(review): train_test_split shuffles by default, so the sort above does not
# make this a chronological hold-out. Confirm whether shuffle=False was intended.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
# Keep the test rows in file order. The predictions are later written positionally
# into the sample submission, and re-sorting with the default (non-stable) sort can
# reorder rows that share a TransactionDT value and misalign them.
df_test = df_test.drop(['TransactionDT'], axis=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random-forest regressor on the isFraud labels; its continuous predictions
# are used as fraud scores. random_state is fixed so that repeated runs build the
# same forest, in line with the seeded train/validation split.
model_rf = RandomForestRegressor(n_jobs=-1,n_estimators=400,max_features=0.3,verbose=1,random_state=0)
model_rf.fit(x_train, y_train)
# Scores for the held-out part of the training data.
predict_rf=model_rf.predict(x_test)

In [None]:
# Score the hold-out predictions with ROC AUC; the regressor's continuous outputs
# are passed directly as the scores.
score=roc_auc_score(y_test,predict_rf)
# The printed Russian label reads roughly "Accuracy of the Random Forest regression
# model ="; the value shown is the ROC AUC computed above, not classification accuracy.
print('Точность модели регрессии Рандомного леса =',score)

In [None]:
# Predict on the competition test frame. This rebinds predict_rf, replacing the
# hold-out predictions scored earlier with one value per row of df_test.
predict_rf=model_rf.predict(df_test)

In [None]:
# Start from the competition's sample submission and overwrite its isFraud column
# with the test-set predictions. The values are assigned positionally, so row i of
# the predictions goes to row i of the sample file.
submission = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/sample_submission.csv'
).assign(isFraud=predict_rf)
# Write the result without the DataFrame index column.
submission.to_csv('submission.csv', index=False)