Preprocessing part 1

In [3]:
from multiprocessing import Pool, cpu_count
import gc; gc.enable()
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn import *
import sklearn

train = pd.read_csv('train.csv')
train = pd.concat((train, pd.read_csv('train_v2.csv')), axis=0, ignore_index=True).reset_index(drop=True)
test = pd.read_csv('sample_submission_v2.csv')

transactions = pd.read_csv('transactions.csv', usecols=['msno'])
transactions = pd.concat((transactions, pd.read_csv('transactions_v2.csv', usecols=['msno'])), axis=0, ignore_index=True).reset_index(drop=True)
transactions = pd.DataFrame(transactions['msno'].value_counts().reset_index())
transactions.columns = ['msno','trans_count']
train = pd.merge(train, transactions, how='left', on='msno')
test = pd.merge(test, transactions, how='left', on='msno')
transactions = []; print('transaction merge...')

user_logs = pd.read_csv('user_logs_v2.csv', usecols=['msno'])
user_logs = pd.DataFrame(user_logs['msno'].value_counts().reset_index())
user_logs.columns = ['msno','logs_count']
train = pd.merge(train, user_logs, how='left', on='msno')
test = pd.merge(test, user_logs, how='left', on='msno')
user_logs = []; print('user logs merge...')

members = pd.read_csv('members_v3.csv')
train = pd.merge(train, members, how='left', on='msno')
test = pd.merge(test, members, how='left', on='msno')
members = []; print('members merge...') 
gender = {'male':1, 'female':2}
train['gender'] = train['gender'].map(gender)
test['gender'] = test['gender'].map(gender)

train = train.fillna(0)
test = test.fillna(0)

transactions = pd.read_csv('transactions_v2.csv') 
transactions = transactions.sort_values(by=['transaction_date'], ascending=[False]).reset_index(drop=True)
transactions = transactions.drop_duplicates(subset=['msno'], keep='first')

train = pd.merge(train, transactions, how='left', on='msno')
test = pd.merge(test, transactions, how='left', on='msno')
transactions=[]
train = train.loc[(train['bd'] > 10) & (train['bd'] < 100)]

transaction merge...
user logs merge...
members merge...


Preprocessing part 2

In [8]:
def transform_df(df):
    df = pd.DataFrame(df)
    df = df.sort_values(by=['date'], ascending=[False])
    df = df.reset_index(drop=True)
    df = df.drop_duplicates(subset=['msno'], keep='first')
    return df

def transform_df2(df):
    df = df.sort_values(by=['date'], ascending=[False])
    df = df.reset_index(drop=True)
    df = df.drop_duplicates(subset=['msno'], keep='first')
    return df

df_iter = pd.read_csv('user_logs.csv', low_memory=False, iterator=True, chunksize=10000000)
last_user_logs = []
i = 0 
for df in df_iter:
    if i>35:
        if len(df)>0:
            print(df.shape)
            p = Pool(cpu_count())
            df = p.map(transform_df, np.array_split(df, cpu_count()))   
            df = pd.concat(df, axis=0, ignore_index=True).reset_index(drop=True)
            df = transform_df2(df)
            p.close(); p.join()
            last_user_logs.append(df)
            print('...', df.shape)
            df = []
    i+=1
last_user_logs.append(transform_df(pd.read_csv('user_logs_v2.csv')))
last_user_logs = pd.concat(last_user_logs, axis=0, ignore_index=True).reset_index(drop=True)
last_user_logs = transform_df2(last_user_logs)

train = pd.merge(train, last_user_logs, how='left', on='msno')
test = pd.merge(test, last_user_logs, how='left', on='msno')
last_user_logs=[]

(10000000, 9)
... (1616917, 9)
(10000000, 9)
... (1533539, 9)
(10000000, 9)
... (1353720, 9)
(2106543, 9)
... (429234, 9)


Filling in the missing values

In [17]:
train = train.fillna(0)
test = test.fillna(0)

cols = [c for c in train.columns if c not in ['is_churn','msno']]

Scaling data for NN usage

In [31]:
from sklearn.preprocessing import MinMaxScaler
nlr = MinMaxScaler()
train_n = nlr.fit_transform(train[cols])
test_n = nlr.fit_transform(test[cols])

Creating and training autoencoder on the train_data

In [40]:
from keras.layers import Input, Dense
from keras.models import Model
from keras import optimizers

input_dim = 23  

Input_l = Input(shape=(input_dim,))

encoded = Dense(200, activation='relu')(Input_l)
encoded = Dense(200, activation='relu')(encoded)
decoded = Dense(200, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(Input_l, decoded)
encoder = Model(Input_l, encoded)


sgd = optimizers.SGD(lr=0.01)
autoencoder.compile(loss='mean_squared_error', optimizer=sgd)

autoencoder.fit(train_n, train_n,
                epochs=100,
                batch_size=1280,
                validation_data=(test_n, test_n),verbose = 2)



Train on 775599 samples, validate on 907471 samples
Epoch 1/100
 - 15s - loss: 0.1387 - val_loss: 0.1038
Epoch 2/100
 - 15s - loss: 0.0532 - val_loss: 0.0673
Epoch 3/100
 - 15s - loss: 0.0371 - val_loss: 0.0626
Epoch 4/100
 - 15s - loss: 0.0349 - val_loss: 0.0608
Epoch 5/100
 - 15s - loss: 0.0341 - val_loss: 0.0599
Epoch 6/100
 - 15s - loss: 0.0336 - val_loss: 0.0595
Epoch 7/100
 - 15s - loss: 0.0332 - val_loss: 0.0591
Epoch 8/100
 - 15s - loss: 0.0328 - val_loss: 0.0588
Epoch 9/100
 - 15s - loss: 0.0325 - val_loss: 0.0585
Epoch 10/100
 - 15s - loss: 0.0322 - val_loss: 0.0582
Epoch 11/100
 - 15s - loss: 0.0319 - val_loss: 0.0580
Epoch 12/100
 - 15s - loss: 0.0316 - val_loss: 0.0576
Epoch 13/100
 - 15s - loss: 0.0313 - val_loss: 0.0573
Epoch 14/100
 - 15s - loss: 0.0309 - val_loss: 0.0570
Epoch 15/100
 - 15s - loss: 0.0306 - val_loss: 0.0568
Epoch 16/100
 - 15s - loss: 0.0303 - val_loss: 0.0564
Epoch 17/100
 - 15s - loss: 0.0299 - val_loss: 0.0561
Epoch 18/100
 - 15s - loss: 0.0296 - va

<keras.callbacks.History at 0x7fcc0da33320>

Encoding our data

In [41]:
train_aut = encoder.predict(train_n)
test_aut = encoder.predict(test_n)

In [43]:
test_aut.shape

(907471, 200)

Modifying keras epochwise output(callback class)

In [57]:
from sklearn.metrics import log_loss

class roc_callback(keras.callbacks.Callback):
    
    def __init__(self,training_data,validation_data, display):
        self.display = display
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        self.seen = 0


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        if int(epoch) % self.display == 0:
            y_pred_val = self.model.predict_proba(self.x_val)
            ll = log_loss(self.y_val, y_pred_val)
            print('\logloss: %s ' % ll)
            return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

Creating and training out NN model

In [59]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()


model.add(Dense(500, activation='relu', input_dim=200))
model.add(Dropout(0.5))
model.add(Dense(500, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

sgd = SGD(lr=0.01)
model.compile(loss='binary_crossentropy',
              optimizer=sgd,metrics=['accuracy'])

model.fit(train_aut, train['is_churn'],
          epochs=100,
          batch_size=10000,callbacks=[roc_callback(training_data=(train_aut, train['is_churn']),validation_data=(train_aut, train['is_churn']),display = 10)] )



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

NameError: name 'x_test' is not defined

Adding some more training epochs

In [64]:
model.fit(train_aut, train['is_churn'],
          epochs=50,
          batch_size=10000,callbacks=[roc_callback(training_data=(train_aut, train['is_churn']),validation_data=(train_aut, train['is_churn']),display = 20 )])



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fcc0b705940>

Making prediction and saving the result

In [65]:
pred1 = model.predict(test_aut)
test['is_churn'] = pred1.clip(0.+1e-15, 1-1e-15)
test[['msno','is_churn']].to_csv('NNAEsub.csv.gz', index=False, compression='gzip')