In [1]:
import pandas as pd 
import re
import pylab as pl
import gc
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict, Counter
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.stats import pearsonr
from scipy.sparse import hstack
from multiprocessing import Pool


In [2]:
import numpy as np
np.random.seed(10001)
import random
import tensorflow as tf
random.seed(10002)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=6, inter_op_parallelism_threads=5)
from keras import backend
tf.set_random_seed(10003)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

Using TensorFlow backend.


In [3]:
train_raw_data=pd.read_csv("../../../Users/sreek/Documents/vg_donors_choose/train/train.csv")
test_raw_data=pd.read_csv("../../../Users/sreek/Documents/vg_donors_choose/test/test.csv",low_memory=False)
res_raw_data=pd.read_csv("../../../Users/sreek/Documents/vg_donors_choose/resources/resources.csv")


In [5]:
print (train_raw_data.shape)
print(test_raw_data.shape)
print (res_raw_data.shape)


(182080, 16)
(78035, 15)
(1541272, 4)


In [None]:
def cleanup_data(dataF):
    
    dataF.loc[dataF.project_essay_4.isnull(), ['project_essay_4','project_essay_2']] = \
    dataF.loc[dataF.project_essay_4.isnull(), ['project_essay_2','project_essay_4']].values

    dataF[['project_essay_2','project_essay_3']] = dataF[['project_essay_2','project_essay_3']].fillna('')

    dataF['project_essay_1'] = dataF.apply(lambda row: ' '.join([str(row['project_essay_1']), 
                                                     str(row['project_essay_2'])]), axis=1)
    dataF['project_essay_2'] = dataF.apply(lambda row: ' '.join([str(row['project_essay_3']),
                                                     str(row['project_essay_4'])]), axis=1)

    dataF = dataF.drop(['project_essay_3', 'project_essay_4'], axis=1,inplace=True)


cleanup_data(train_raw_data)
cleanup_data(test_raw_data)

In [6]:
res_raw_data['Total'] = res_raw_data['quantity']*res_raw_data['price']
res_data = res_raw_data.groupby('id').agg({'description':'count',
                            'quantity':'sum',
                            'price':'sum',
                            'Total':'sum'}).rename(columns={'description':'items'})
res_data['avgPrice'] = res_data.Total / res_data.quantity
numFeatures = ['items', 'quantity', 'price', 'Total', 'avgPrice']

for func in ['min', 'max', 'mean']:
    res_data = res_data.join(res_raw_data.groupby('id').agg({'quantity':func,
                                          'price':func,
                                          'Total':func}).rename(
                                columns={'quantity':func+'Quantity',
                                         'price':func+'Price',
                                         'Total':func+'Total'}).fillna(0))
    numFeatures += [func+'Quantity', func+'Price', func+'Total']

res_data = res_data.join(res_raw_data.groupby('id').agg(
    {'description':lambda x:' '.join(x.values.astype(str))}).rename(
    columns={'description':'resource_description'}))

train_data = train_raw_data.join(res_data, on='id')
test_data=test_raw_data.join(res_data,on='id')


train_data['price_category'] = pl.digitize(train_data.Total, [0, 50, 100, 250, 500, 1000, pl.inf])
test_data['price_category'] = pl.digitize(test_data.Total, [0, 50, 100, 250, 500, 1000, pl.inf])
numFeatures.append('price_category')

for c in ['Quantity', 'Price', 'Total']:
    train_data['max%s_min%s'%(c,c)] = train_data['max%s'%c] - train_data['min%s'%c]
    test_data['max%s_min%s'%(c,c)] = test_data['max%s'%c] - test_data['min%s'%c]
    numFeatures.append('max%s_min%s'%(c,c))

del res_data, train_raw_data, res_raw_data, test_raw_data
gc.collect()

212

In [7]:

train_data['teacher_id'] = LabelEncoder().fit_transform(train_data['teacher_id'])
train_data['teacher_gender_unknown'] = train_data.teacher_prefix.apply(lambda x:int(x not in ['Ms.', 'Mrs.', 'Mr.']))  
test_data['teacher_id'] = LabelEncoder().fit_transform(test_data['teacher_id'])
test_data['teacher_gender_unknown'] = test_data.teacher_prefix.apply(lambda x:int(x not in ['Ms.', 'Mrs.', 'Mr.']))  
aggFtrs=[]    
       
for col in ['school_state', 'teacher_id', 'teacher_prefix', 'teacher_gender_unknown', 'project_grade_category', 
                'project_subject_categories', 'project_subject_subcategories', 'teacher_number_of_previously_posted_projects']:
        Aggtr = train_data[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_agg'})
        Aggts = test_data[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_agg'})
        Aggtr /= Aggtr.sum()
        Aggts /= Aggts.sum()
        train_data = train_data.join(Aggtr, on=col)
        test_data = test_data.join(Aggts, on=col) 





In [8]:
numFeatures += ['teacher_number_of_previously_posted_projects','teacher_id','teacher_gender_unknown']
aggFtrs+=['school_state_agg', 'teacher_id_agg', 'teacher_prefix_agg', 'teacher_gender_unknown_agg', \
          'project_grade_category_agg', 
          'project_subject_categories_agg', 'project_subject_subcategories_agg',\
          'teacher_number_of_previously_posted_projects_agg']

In [None]:
def getTimeFeatures(data):
    data['year'] = data['project_submitted_datetime'].apply(lambda x: x.year)
    data['month'] = data['project_submitted_datetime'].apply(lambda x: x.month)
    data['day'] = data['project_submitted_datetime'].apply(lambda x: x.day)
    data['dow'] = data['project_submitted_datetime'].apply(lambda x: x.dayofweek)
    return data
train_data['project_submitted_datetime'] = pd.to_datetime(train_data['project_submitted_datetime'])
test_data['project_submitted_datetime'] = pd.to_datetime(test_data['project_submitted_datetime'])
getTimeFeatures(train_data)
getTimeFeatures(test_data)

for col in  ['year', 'month', 'day', 'dow']:
        Aggtr = train_data[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_agg'})
        Aggts = test_data[['id', col]].groupby(col).agg('count').rename(columns={'id':col+'_agg'})
        Aggtr /= Aggtr.sum()
        Aggts /= Aggts.sum()
        train_data = train_data.join(Aggtr, on=col)
        test_data = test_data.join(Aggts, on=col)
            

aggFtrs+=['year_agg', 'month_agg', 'day_agg', 'dow_agg']
numFeatures += ['year', 'month', 'day', 'dow']
numFeatures += aggFtrs
gc.collect()

In [11]:
cat_features = ['teacher_prefix','school_state','project_grade_category',\
                'project_subject_categories','project_subject_subcategories']
cat_features_hash = [col+"_hash" for col in cat_features]
max_size=15000#0
def feature_hash(df, max_size=max_size):
    for col in cat_features:
        df[col+"_hash"] = df[col].apply(lambda x: hash(x)%max_size)
    return df

train_data = feature_hash(train_data)
test_data = feature_hash(test_data)

train_cat_new = np.array(train_data[cat_features_hash], dtype=np.int)
test_cat_new = np.array(test_data[cat_features_hash], dtype=np.int)

In [13]:
def pprocess(string):

    string = re.sub(r'(\")', ' ', string)
    string = re.sub(r'(\r)', ' ', string)
    string = re.sub(r'(\n)', ' ', string)
    string = re.sub(r'(\r\n)', ' ', string)
    string = re.sub(r'(\\)', ' ', string)
    string = re.sub(r'\t', ' ', string)
    string = re.sub(r'\:', ' ', string)
    string = re.sub(r'\"\"\"\"', ' ', string)
    string = re.sub(r'_', ' ', string)
    string = re.sub(r'\+', ' ', string)
    string = re.sub(r'\=', ' ', string)

    return string

In [14]:
text_cols=['project_essay_1','project_essay_2','project_resource_summary','resource_description','project_title']
train_data['text'] = train_data.apply(lambda x: " ".join(x[col] for col in text_cols), axis=1)
test_data['text'] = test_data.apply(lambda x: " ".join(x[col] for col in text_cols), axis=1)                                   

In [15]:
train_data['text']=train_data['text'].apply(pprocess)
test_data['text']=test_data['text'].apply(pprocess)

In [17]:
stop_words = set(stopwords.words("english"))
porter = PorterStemmer()
def filter_stop_words(sentence):
    new_sent = [porter.stem(word.lower()) for word in sentence.split() if word not in stop_words]
    new_sent=  ["".join(sent) for sent in new_sent]
    return new_sent

In [18]:
train_data['text']=train_data['text'].apply(filter_stop_words)
test_data['text']=test_data['text'].apply(filter_stop_words)

In [19]:
max_features = 100000
from keras.preprocessing import text,sequence
tokenizer = text.Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
tokenizer.fit_on_texts(train_data['text'].str.join(' ').tolist()+test_data['text'].str.join(' ').tolist())
gc.collect()

0

In [20]:
tr_text_data_list=tokenizer.texts_to_sequences(train_data["text"].str.join(' ').tolist())
ta_text_data_list=tokenizer.texts_to_sequences(test_data["text"].str.join(' ').tolist())

In [21]:
train_words = sequence.pad_sequences(tr_text_data_list, maxlen=300)
test_words = sequence.pad_sequences(ta_text_data_list, maxlen=300)

train_target = train_data.project_is_approved


In [22]:
EMBEDDING_FILE = '../../../Users/sreek/Documents/vg_donors_choose/crawl-300d-2M.vec/crawl-300d-2M.vec'
embed_size=300
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index

In [23]:
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [24]:
from keras.layers import Input, Dense, Embedding, Flatten, concatenate, Dropout, Convolution1D, \
GlobalMaxPool1D,SpatialDropout1D,Bidirectional,PReLU,GRU
from keras.models import Model
from keras import optimizers

In [25]:
from sklearn.preprocessing import StandardScaler
train_data_num=StandardScaler().fit_transform(train_data[numFeatures].fillna(0))
test_data_num=StandardScaler().fit_transform(test_data[numFeatures].fillna(0))

In [26]:
gc.collect()

7

In [27]:
def build_NN_model():
    cat_input = Input((len(cat_features_hash), ))
    num_input = Input((len(numFeatures), ))
    text_input = Input((300, ))
    m_cat=Embedding(max_size,10)(cat_input)
    m_cat=SpatialDropout1D(0.3)(m_cat)
    m_cat = Flatten()(m_cat)
    m_words = Embedding(max_features, 300,
                            weights=[embedding_matrix],
                            trainable=False)(text_input)
    m_words = SpatialDropout1D(0.3)(m_words)
    m_words =Bidirectional(GRU(50, return_sequences=True))(m_words)
    m_words = Convolution1D(100, 3, activation="relu")(m_words)
    m_words = GlobalMaxPool1D()(m_words)
    m_cat = Dense(100, activation="relu")(m_cat)
    m_num = Dense(100, activation="relu")(num_input)
    m = concatenate([m_cat, m_num, m_words])
    m = Dense(50, activation="relu")(m)
    m = Dropout(0.25)(m)
    predictions = Dense(1, activation="sigmoid")(m)
    model = Model(inputs=[cat_input, num_input, text_input], outputs=predictions)
    model.compile(optimizer=optimizers.Adam(0.0005, decay=1e-6),
              loss='binary_crossentropy',
              metrics=['accuracy'])

    return model


In [27]:
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)

In [28]:
from keras.callbacks import *
from sklearn.metrics import roc_auc_score
file_path='RNN.h5'
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=2, save_best_only=True, save_weights_only=True,
                                     mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=4)
lr_reduced = ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=2,
                               verbose=1,
                               epsilon=1e-4,
                               mode='min')
callbacks_list = [checkpoint, early, lr_reduced]
model=build_NN_model()
history = model.fit([train_cat_new, train_data_num, train_words], train_target, validation_split=0.1,
                    verbose=2,callbacks=callbacks_list,
          epochs=5, batch_size=256)


Train on 163872 samples, validate on 18208 samples
Epoch 1/5
Epoch 00000: val_loss improved from inf to 0.37153, saving model to RNN.h5
448s - loss: 0.4015 - acc: 0.8468 - val_loss: 0.3715 - val_acc: 0.8509
Epoch 2/5
Epoch 00001: val_loss improved from 0.37153 to 0.35445, saving model to RNN.h5
442s - loss: 0.3717 - acc: 0.8514 - val_loss: 0.3545 - val_acc: 0.8573
Epoch 3/5
Epoch 00002: val_loss improved from 0.35445 to 0.34915, saving model to RNN.h5
444s - loss: 0.3568 - acc: 0.8564 - val_loss: 0.3492 - val_acc: 0.8591
Epoch 4/5
Epoch 00003: val_loss improved from 0.34915 to 0.34860, saving model to RNN.h5
442s - loss: 0.3507 - acc: 0.8593 - val_loss: 0.3486 - val_acc: 0.8591
Epoch 5/5
Epoch 00004: val_loss improved from 0.34860 to 0.34573, saving model to RNN.h5
448s - loss: 0.3456 - acc: 0.8612 - val_loss: 0.3457 - val_acc: 0.8603


In [29]:
del train_cat_new, train_data_num, train_words,train_target
model.load_weights(file_path)
pred_test = model.predict([test_cat_new, test_data_num, test_words], batch_size=2000)
test["project_is_approved"] = pred_test
test[['id', 'project_is_approved']].to_csv("submission.csv", index=False)

ResourceExhaustedError: OOM when allocating tensor with shape[2000,300,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: bidirectional_1/zeros_like = ZerosLike[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](spatial_dropout1d_2/cond/Merge)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: dense_4/Sigmoid/_219 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_598_dense_4/Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'bidirectional_1/zeros_like', defined at:
  File "C:\python\pyLibs\Anaconda\lib\runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\python\pyLibs\Anaconda\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "C:\python\pyLibs\Anaconda\lib\site-packages\traitlets\config\application.py", line 596, in launch_instance
    app.start()
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "C:\python\pyLibs\Anaconda\lib\site-packages\zmq\eventloop\ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tornado\ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\python\pyLibs\Anaconda\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\ipykernel\ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\python\pyLibs\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-28-087151803252>", line 16, in <module>
    model=build_NN_model()
  File "<ipython-input-27-c2513483fd55>", line 12, in build_NN_model
    m_words =Bidirectional(GRU(50, return_sequences=True))(m_words)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\keras\engine\topology.py", line 602, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\keras\layers\wrappers.py", line 294, in call
    y = self.forward_layer.call(inputs, **kwargs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\keras\layers\recurrent.py", line 315, in call
    initial_state = self.get_initial_state(inputs)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\keras\layers\recurrent.py", line 244, in get_initial_state
    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\keras\backend\tensorflow_backend.py", line 704, in zeros_like
    return tf.zeros_like(x, dtype=dtype, name=name)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1681, in zeros_like
    return gen_array_ops.zeros_like(tensor, name=name)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 11442, in zeros_like
    "ZerosLike", x=x, name=name)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 3392, in create_op
    op_def=op_def)
  File "C:\python\pyLibs\Anaconda\lib\site-packages\tensorflow\python\framework\ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[2000,300,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: bidirectional_1/zeros_like = ZerosLike[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](spatial_dropout1d_2/cond/Merge)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: dense_4/Sigmoid/_219 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_598_dense_4/Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
params_xgb = {
        'eta': 0.05,
        'max_depth': 4,
        'subsample': 0.85,
        'colsample_bytree': 0.25,
        'min_child_weight': 3,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 0,
        'silent': 1,
    }
params_lgb = {
        'boosting_type': 'dart',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 10,
        'learning_rate': 0.05,
        'feature_fraction': 0.25,
        'bagging_fraction': 0.85,
        'seed': 0,
        'verbose': 0,
    }
for i in range(21, 22):
    gc.collect()
    X_train, X_val, Tar_train, Tar_val = train_test_split(Xtr, Ttr_tar, test_size=0.15, random_state=i, stratify=Ttr_tar)

    # LGB
    dtrain = lgb.Dataset(X_train, Tar_train)
    dval   = lgb.Dataset(X_val, Tar_val)
    model = lgb.train(params_lgb, dtrain, num_boost_round=10000, valid_sets=[dtrain, dval], early_stopping_rounds=200, verbose_eval=200)
    Yvl2 = model.predict(X_val)
    Yts2 = model.predict(Xts)
