In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import gc
pd.set_option('display.max_columns', 500)

In [0]:
train = pd.read_pickle('/content/drive/My Drive/train_features_1.pkl')

In [0]:
train_outliers = train[train['outliers'] == 1]
train_normal = train[train['outliers'] == 0]

In [0]:
train_outliers = preprocess(train_outliers)
train_outliers = rename(train_outliers)
train_outliers = rem_inf(train_outliers)
train_normal = preprocess(train_normal)
train_normal = rename(train_normal)
train_normal = rem_inf(train_normal)

In [0]:
train_outliers.drop(columns='outliers', axis=1, inplace=True)
train_normal.drop(columns='outliers', axis=1, inplace=True)

In [0]:
out_y = train_outliers['target']
train_outliers.drop(columns='target', axis=1, inplace=True)
out_X = train_outliers

norm_y = train_normal['target']
train_normal.drop(columns='target', axis=1, inplace=True)
norm_X = train_normal

In [0]:
def makeOverSamplesSMOTE(X,y):
    #input DataFrame
    #X →Independent Variable in DataFrame\
    #y →dependent Variable in Pandas DataFrame format
    from imblearn.over_sampling import SMOTE
    sm = SMOTE()
    X, y = sm.fit_sample(X, y)
    return X,y

In [0]:
def normalize(df):
    mu = df.mean()
    sigma = df.std()
    
    return (df -mu) / sigma

In [0]:
def preprocess(train_df):
    ''' preprocessing of train_df only withouting merge '''

    train_df.replace([np.inf, -np.inf], np.nan)
    
    # converting to datetime
    train_df['first_active_month'] = pd.to_datetime(train_df['first_active_month'])

    # sorting to find the most rescent active month card 
    train_df.sort_values(by='first_active_month')

    # 2018-02 is the most rescent month
    train_df['difference_between_most_recent_transaction'] = pd.to_datetime('2018-02-01') - train_df['first_active_month']

    # creating the thime delta with reference as most rescent active month 
    train_df['difference_between_most_recent_transaction'] = train_df['difference_between_most_recent_transaction'].apply(lambda x: x.days)

    # normalizing time delta around mean new_x = (x - mean)/std
    mu = train_df['difference_between_most_recent_transaction'].mean()
    sigma = train_df['difference_between_most_recent_transaction'].std()
    train_df['difference_between_most_recent_transaction'] = (train_df['difference_between_most_recent_transaction'] - mu) /sigma

    # creating dummies for feature_ 1 and 2
    train_df = train_df.join(pd.get_dummies(train_df['feature_1'], prefix='feature_1').join(pd.get_dummies(train_df['feature_2'], prefix='feature_2')))

    # removing original feature_ 1and 2 as they are not need anymore
    train_df.drop(columns=['feature_1', 'feature_2'], axis=1, inplace=True)

    # creating features for first active month and year
    train_df['month'] = train_df['first_active_month'].apply(lambda x: x.month)
    train_df['year'] = train_df['first_active_month'].apply(lambda x: x.year)

    # dropped un-necessary columns
    # 'new_card_id_'
    for col in train_df.columns:
        if train_df[col].dtype == 'object':
            train_df.drop(columns=col, axis=1, inplace=True)
            
    train_df.drop(inplace=True, axis=1, columns='first_active_month')
        
    # preprocessing dates    
    train_df['new_purchase_date_min'] = train_df['new_purchase_date_min'].astype(np.int64) * 1e-9
    train_df['new_purchase_date_max'] = train_df['new_purchase_date_max'].astype(np.int64) * 1e-9
    train_df['hist_purchase_date_min'] = train_df['hist_purchase_date_min'].astype(np.int64) * 1e-9
    train_df['hist_purchase_date_max'] = train_df['hist_purchase_date_max'].astype(np.int64) * 1e-9

    train_df['new_purchase_date_ptp'] = train_df['new_purchase_date_ptp'].apply(lambda x: x.total_seconds())
    train_df['hist_purchase_date_ptp'] = train_df['hist_purchase_date_ptp'].apply(lambda x: x.total_seconds())

    cols = ['new_purchase_date_min', 'new_purchase_date_max', 'hist_purchase_date_min', 'hist_purchase_date_max', 'new_purchase_date_ptp', 'hist_purchase_date_ptp']

    for col in cols:
        train_df[col] = normalize(train_df[col])
        
    # processing nan values
    # taking mean of each column for replacing nan
    means = train_df.describe().iloc[1, :]
    
    # replacing nan by mean of column
    for col in train_df.columns:
        if len(train_df[train_df[col].isna()]):
            train_df[col].fillna(value=means[col], inplace=True)
            
    return train_df

In [0]:
def rename(train_df):
    dic = {'feature_1_0.008058486333804104':'feature_1_1',
      'feature_1_0.010479387818900955':'feature_1_2',
      'feature_1_0.010609889420578167':'feature_1_3',
      'feature_1_0.010711591651998994':'feature_1_4',
      'feature_1_0.013144615384615385':'feature_1_5',
      'feature_2_0.008752121220219405':'feature_2_1',
      'feature_2_0.011384773985343224':'feature_2_2',
      'feature_2_0.014166402368115023':'feature_2_3'
      }
    train_df.rename(mapper=dic, axis=1, inplace=True)
    return train_df

In [0]:
def rem_inf(df):
    for col in df.columns:
        if len(df[df[col] == np.inf]):
            mean = df[col].mean()
            df[col] = df[col].replace(np.inf, mean)
    return df

In [0]:
train_y = train['outliers']
train.drop(columns='outliers', axis=1, inplace=True)
train_X = train
del train
gc.collect()

372

In [0]:
train_X = preprocess(train_X)

In [0]:
train_X = rename(train_X)

In [0]:
train_X.drop(columns='target', axis=1, inplace=True)

In [0]:
train_X.head()

Unnamed: 0,feature_3,new_category_1_ntrans_sum,new_category_1_ntrans_mean,new_category_1_merch_sum,new_category_1_merch_mean,new_category_2_ntrans_mean,new_category_2_merch_mean,new_category_3_mean,new_merchant_id_nunique,new_merchant_category_id_ntrans_nunique,new_merchant_category_id_merch_nunique,new_state_id_ntrans_nunique,new_state_id_merch_nunique,new_city_id_ntrans_nunique,new_city_id_merch_nunique,new_subsector_id_ntrans_nunique,new_subsector_id_merch_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_date_min,new_purchase_date_max,new_purchase_date_ptp,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean,hist_card_id_nunique,hist_authorized_flag_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_mean,hist_category_3_mean,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_max,hist_installments_min,hist_installments_std,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_month_std,hist_purchase_date_min,hist_purchase_date_max,hist_purchase_date_ptp,hist_month_lag_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_month_diff_mean,hist_card_id_count,new_card_id_count,hist_trans_freq,new_trans_freq,hist_time_gap,new_time_gap,difference_between_most_recent_transaction,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,month,year
0,0.011428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,14.0,14.0,1.0,1.0,3.0,2.0,10.0,10.0,-13.244202,-0.575835,-0.296112,-0.724368,0.135812,0.0,0.0,0.0,0.0,0.0,3.478261,4.0,3.0,0.510754,0.349674,0.350331,1.260966,1.478261,2.0,1.0,0.510754,11.652174,1,0.95,0.0,0.0,0.046154,0.015385,94,41,3,7,21,-165.968739,-0.638341,2.258395,-0.739395,0.212139,4,0.015385,1,0,0.123314,8.057692,12,1,3.474193,0.371112,0.572846,-0.011779,-3.911538,0,-8,2.397687,11.684615,260,23.0,1.070837,0.419036,0.933849,2.386428,-0.466375,0,0,0,0,1,1,0,0,6,2017
1,0.010283,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,5.0,5.0,1.0,1.0,1.0,1.0,4.0,4.0,-4.355735,-0.725956,-0.701858,-0.73941,0.014326,6.0,1.0,1.0,1.0,0.0,2.5,3.0,2.0,0.547723,0.34885,0.349551,1.344313,1.5,2.0,1.0,0.547723,12.666667,1,0.968571,31.0,0.088571,0.354286,1.211429,142,57,3,9,24,-210.006336,-0.600018,4.630299,-0.7424,0.384967,543,1.551429,10,-1,1.510777,6.22,12,1,3.848142,-1.071733,0.245666,1.234804,-5.031429,0,-12,3.804934,12.665714,350,6.0,0.896859,0.106064,1.115003,9.428302,0.047738,0,0,0,1,0,0,1,0,1,2017
2,0.010283,0.0,0.0,0.0,0.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.700326,-0.700326,-0.700326,-0.700326,0.231553,0.0,0.0,0.0,0.0,0.519733,4.0,4.0,4.0,0.524177,0.351074,0.350312,-1.458895,2.0,2.0,2.0,0.396778,11.0,1,0.953488,0.0,0.0,3.627907,0.0,13,8,2,5,7,-29.167391,-0.678311,-0.145847,-0.730138,0.08738,0,0.0,0,0,0.0,4.55814,12,1,3.275467,-1.032613,0.604962,1.422475,-8.604651,0,-13,3.842987,11.627907,43,1.0,0.104255,inf,9.591845,0.0,0.568659,0,0,1,0,0,1,0,0,8,2016
3,0.010283,1.0,0.142857,1.0,0.142857,3.142857,3.285714,1.142857,7.0,6.0,6.0,2.0,2.0,2.0,2.0,5.0,5.0,-4.654372,-0.66491,-0.56674,-0.734135,0.065882,5.0,0.714286,1.0,-1.0,0.755929,3.714286,4.0,3.0,0.48795,0.349723,0.350046,0.620444,1.714286,2.0,1.0,0.48795,11.857143,1,1.0,12.0,0.155844,2.688312,1.116883,50,25,5,7,13,-49.491364,-0.642745,1.445596,-0.740897,0.261624,84,1.090909,3,-1,0.588974,7.74026,12,1,3.904797,1.135607,0.614216,-0.7557,-2.831169,0,-5,1.802065,11.61039,77,7.0,0.497393,0.166818,2.010483,5.994552,-0.779609,0,0,0,1,0,0,0,1,9,2017
4,0.010283,2.0,0.055556,4.0,0.111111,2.694444,2.944444,1.055556,36.0,17.0,19.0,5.0,5.0,5.0,3.0,10.0,11.0,-19.926237,-0.553507,0.450886,-0.739395,0.223821,35.0,0.972222,2.0,-1.0,0.376913,3.555556,4.0,3.0,0.503953,0.349594,0.350313,1.379901,1.555556,2.0,1.0,0.503953,11.694444,1,0.962406,15.0,0.112782,2.894737,1.052632,66,26,6,6,17,-48.687656,-0.366073,7.193041,-0.746156,1.352094,182,1.368421,12,1,1.896862,5.406015,12,1,5.003086,1.524357,0.619196,-1.144136,-1.285714,0,-3,1.0267,11.571429,133,36.0,1.221735,0.628404,0.818508,1.591333,-0.987296,1,0,0,0,0,0,0,1,11,2017


In [0]:
def fun(x):
    if x == np.inf:
        return t_mean
    return x

In [0]:
temp_s = train_X[np.logical_not(np.isinf(train_X['new_trans_freq']))]
t_mean = temp_s['new_trans_freq'].mean()
print(t_mean)

3.157788633712763


In [0]:
train_X['new_trans_freq'] = train_X['new_trans_freq'].apply(lambda x: fun(x))

In [0]:
train_X, train_y = makeOverSamplesSMOTE(train_X, train_y)

In [0]:
len(train_X)

399420

In [0]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, shuffle=True, test_size=0.33, random_state=2019)


#n_estimatorss = [100, 110, 120]
#max_depths = [8, 9, 10, 11]
#earning_rates = [0.1, 0.3]
#subsamples = [0.5, 0.6, 0.7]

#pram = {'n_estimators':n_estimatorss, 'max_depth':max_depths, 'learning_rate':learning_rates, 'subsample':subsamples}

xgbb = XGBClassifier(n_estimators=110, max_depth=10, learning_rate=0.1, subsample=0.5)
#grid_search = GridSearchCV(xgb, pram, cv=2, verbose=5, scoring='f1')
#grid_search.fit(X_train, y_train)

xgbb.fit(X_train, y_train)

#pred = grid_search.predict(X_test)
pred = xgbb.predict(X_test)

cmat = confusion_matrix(y_test, pred)

print(cmat)

[[65708    82]
 [  660 65359]]


FROM HERE ON PREDICTING WHETHER ITEM IS OUTLIER OR NOT

In [0]:
sub_df = pd.read_pickle('/content/drive/My Drive/test_features_1.pkl')

In [0]:
sub_df = preprocess(sub_df)

In [0]:
sub_df.head()

Unnamed: 0,feature_3,new_category_1_ntrans_sum,new_category_1_ntrans_mean,new_category_1_merch_sum,new_category_1_merch_mean,new_category_2_ntrans_mean,new_category_2_merch_mean,new_category_3_mean,new_merchant_id_nunique,new_merchant_category_id_ntrans_nunique,new_merchant_category_id_merch_nunique,new_state_id_ntrans_nunique,new_state_id_merch_nunique,new_city_id_ntrans_nunique,new_city_id_merch_nunique,new_subsector_id_ntrans_nunique,new_subsector_id_merch_nunique,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_max,new_installments_min,new_installments_std,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_month_std,new_purchase_date_min,new_purchase_date_max,new_purchase_date_ptp,new_month_lag_mean,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_month_diff_mean,hist_card_id_nunique,hist_authorized_flag_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_mean,hist_category_3_mean,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_max,hist_installments_min,hist_installments_std,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_month_std,hist_purchase_date_min,hist_purchase_date_max,hist_purchase_date_ptp,hist_month_lag_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_month_diff_mean,hist_card_id_count,new_card_id_count,hist_trans_freq,new_trans_freq,hist_time_gap,new_time_gap,difference_between_most_recent_transaction,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,month,year
0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.333333,3.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,3.0,-1.777156,-0.592385,-0.383266,-0.722114,0.182843,5.0,1.666667,3.0,1.0,1.154701,2.0,2.0,2.0,0.0,0.351515,0.351412,-0.199291,2.0,2.0,2.0,0.0,13.666667,1,0.647059,23.0,0.338235,1.352941,1.323529,24,16,3,7,12,-40.733733,-0.599025,0.235676,-0.743902,0.192268,141,2.073529,12,1,2.061127,8.367647,12,4,2.454994,-0.332976,-0.216704,0.200276,-3.632353,0,-8,2.454994,13.632353,68,3.0,0.280065,0.054657,3.5706,18.295945,-0.256759,0,0,1,0,0,0,0,1,4.0,2017.0
1,0,2.0,0.222222,2.0,0.222222,3.222222,3.444444,1.111111,9.0,8.0,8.0,2.0,2.0,2.0,1.0,6.0,6.0,-5.944698,-0.660522,-0.506484,-0.740897,0.071147,11.0,1.222222,3.0,1.0,0.666667,3.444444,4.0,3.0,0.527046,0.352238,0.352721,0.935324,1.444444,2.0,1.0,0.527046,11.777778,1,0.987179,2.0,0.025641,3.012821,1.025641,27,16,3,4,12,-49.136513,-0.629955,0.318817,-0.731881,0.154999,83,1.064103,4,1,0.405794,3.282051,5,1,1.357016,-1.013055,0.483466,1.325397,-10.410256,0,-13,2.164866,11.846154,78,9.0,0.199871,0.159095,5.003218,6.285535,0.050871,0,1,0,0,0,0,0,1,1.0,2017.0
2,1,1.0,0.5,1.0,0.5,4.0,4.5,1.5,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,0.180138,0.090069,0.904506,-0.724368,1.151788,11.0,5.5,10.0,1.0,6.363961,3.0,3.0,3.0,0.0,0.352189,0.35174,-0.867881,1.0,1.0,1.0,0.0,12.0,1,0.692308,1.0,0.076923,3.692308,1.923077,9,8,4,4,6,4.52884,0.348372,2.525866,-0.536537,0.906547,44,3.384615,10,-1,3.686427,7.307692,12,1,4.819831,0.864696,0.265972,-0.706057,-2.076923,0,-6,1.754116,11.846154,13,2.0,0.031519,inf,31.726872,0.0,-0.673769,0,0,0,0,1,1,0,0,8.0,2017.0
3,0,1.0,0.1,3.0,0.3,0.8,1.5,1.5,10.0,10.0,10.0,3.0,2.0,3.0,2.0,8.0,9.0,-5.743674,-0.574367,-0.44788,-0.671775,0.073166,29.0,2.9,12.0,1.0,3.3483,3.3,4.0,3.0,0.483046,0.352262,0.352648,0.746127,1.3,2.0,1.0,0.483046,11.7,1,1.0,0.0,0.0,0.0,1.346154,23,18,1,1,11,-13.690715,-0.526566,0.087965,-0.731881,0.219162,38,1.461538,6,-1,1.475961,7.692308,12,1,5.136296,1.710754,0.504963,-1.410208,-1.230769,0,-2,0.951113,11.769231,26,10.0,0.167951,0.238312,5.954122,4.196186,-1.090779,0,1,0,0,0,1,0,0,12.0,2017.0
4,1,0.0,0.0,0.0,0.0,2.833333,3.0,1.333333,6.0,5.0,5.0,2.0,1.0,2.0,2.0,4.0,4.0,12.064997,2.010833,14.279604,-0.704082,6.028671,5.0,0.833333,2.0,-1.0,0.983192,3.166667,4.0,3.0,0.408248,0.352253,0.35251,0.495582,1.166667,2.0,1.0,0.408248,12.0,1,0.790909,0.0,0.0,2.845455,1.054545,47,31,4,5,15,25.139384,0.22854,15.782255,-0.746758,2.777764,120,1.090909,4,1,0.43988,4.827273,12,1,3.361572,-1.09818,0.598921,1.483658,-6.227273,0,-13,4.530547,11.763636,110,6.0,1.010457,0.104734,0.989651,9.547998,1.407863,0,0,0,0,1,1,0,0,12.0,2015.0


In [0]:
sub_df = rem_inf(sub_df)

In [0]:
sub_df['outlier'] = xgbb.predict(sub_df.values)

In [0]:
test_outlier = sub_df[sub_df['outlier'] == 1]
test_normal = sub_df[sub_df['outlier'] == 0]

In [0]:
test_outlier.drop(columns='outlier', axis=1, inplace=True)
test_normal.drop(columns='outlier', axis=1, inplace=True)

**MODEL FOR OUTLIERS**

In [0]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt

X_train, X_test, y_train, y_test = train_test_split(out_X, out_y, shuffle=True, test_size=0.33, random_state=2019)


xgdmat = xgb.DMatrix(X_train,y_train)
our_params = {'eta':0.5,
             'seed':0,
             'subsample':0.80,
             'colsample_bytree':0.9,
             'objective':'reg:linear',
             'max_depth':5,
             'min_child_weight':3
         }
final_gb = xgb.train(our_params,xgdmat)
tesdmat = xgb.DMatrix(X_test)
y_pred_xg = final_gb.predict(tesdmat)
print(f'the root mean squared error is {sqrt(mean_squared_error(y_test, y_pred_xg))}')

the root mean squared error is 0.03320764677734189


In [0]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from math import sqrt

X_train, X_test, y_train, y_test = train_test_split(norm_X, norm_y, shuffle=True, test_size=0.33, random_state=2019)


xgdmat = xgb.DMatrix(X_train,y_train)
our_params = {'eta':0.5,
             'seed':0,
             'subsample':0.80,
             'colsample_bytree':0.9,
             'objective':'reg:linear',
             'max_depth':5,
             'min_child_weight':3
         }
final_gb2 = xgb.train(our_params,xgdmat)
tesdmat = xgb.DMatrix(X_test)
y_pred_xg = final_gb2.predict(tesdmat)
print(f'the root mean squared error is {sqrt(mean_squared_error(y_test, y_pred_xg))}')

the root mean squared error is 1.5923172662696985


**FROM HERE PREDICTIONS**

In [0]:
out_dmat = xgb.DMatrix(test_outlier)
norm_dmat = xgb.DMatrix(test_normal)

out_pred = final_gb.predict(out_dmat)
norm_pred = final_gb2.predict(norm_dmat)

In [0]:
sub_df_1 = pd.read_pickle('/content/drive/My Drive/test_features_1.pkl')

In [0]:
temp_df = pd.DataFrame()
temp_df['card_id'] = sub_df_1['card_id']

In [0]:
temp_df['outlier'] = sub_df['outlier']

In [0]:
out_temp = temp_df[temp_df['outlier'] == 1]
norm_temp = temp_df[temp_df['outlier'] == 0]

In [0]:
out_temp['pred'] = out_pred
norm_temp['pred'] = norm_pred

In [0]:
t_df = pd.concat([norm_temp, out_temp])
t_df.head()

Unnamed: 0,card_id,outlier,pred
0,C_ID_0ab67a22ab,0,0.240491
1,C_ID_130fd0cbdd,0,-0.148135
2,C_ID_b709037bc5,0,-0.281877
3,C_ID_d27d835a9f,0,-0.002855
4,C_ID_2b5e3df5c2,0,-1.46871


In [0]:
t_df.drop(columns='outlier', axis=1, inplace=True)

In [0]:
final_sub = pd.read_csv('/content/drive/My Drive/pickle/sample_submission.csv')
final_sub.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0
1,C_ID_130fd0cbdd,0
2,C_ID_b709037bc5,0
3,C_ID_d27d835a9f,0
4,C_ID_2b5e3df5c2,0


In [0]:
temp_final_sub = final_sub.merge(t_df, on='card_id', how='left')

In [0]:
temp_final_sub.head()

Unnamed: 0,card_id,target,pred
0,C_ID_0ab67a22ab,0,0.240491
1,C_ID_130fd0cbdd,0,-0.148135
2,C_ID_b709037bc5,0,-0.281877
3,C_ID_d27d835a9f,0,-0.002855
4,C_ID_2b5e3df5c2,0,-1.46871


In [0]:
temp_final_sub.drop(columns='target', axis=1, inplace=True)

In [0]:
temp_final_sub.rename({'pred':'target'}, axis=1, inplace=True)

In [0]:
temp_final_sub.to_csv('submission15.csv', index_label=False, index=False)