In [1]:
## NN

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import scipy.sparse as sparse

from keras.layers import Input, Dense, Embedding
from keras.layers import Flatten, concatenate, Dropout
from sklearn.metrics import roc_auc_score


from keras.models import Model
from keras import optimizers

import lightgbm as lgb
import pandas as pd
import numpy as np
import string
import gc

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Dataset preparation: load the pre-computed feature table, pull out the
# target and the test ids, label-encode the categorical columns and
# robust-scale the numeric feature columns used by the network.
target_column = "project_is_approved"
id_column = 'id'

print("dataset preparation")
normdf = pd.read_csv("../models/preprocessed/featured_2.csv")

# Grab the test ids and the training target before both columns are dropped.
testid = normdf.loc[normdf['is_test'] == 1, id_column]
Y = normdf.loc[normdf['is_train'] == 1, target_column]

drop_cols = [target_column, id_column]
normdf = normdf.drop(drop_cols, axis=1)

print("Label Encoding")
cat_feats = [
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'school_state',
    'teacher_id',
    'teacher_prefix',
]
for c in cat_feats:
    # Values are cast to str first, so a missing value becomes the string
    # 'nan' and receives its own integer code.
    le = LabelEncoder()
    normdf[c] = le.fit_transform(normdf[c].astype(str))


# Feature columns fed to the numeric input of the network (one per line).
relevant_cols = """teacher_id
project_essay_2_density
project_resource_summary_density
resource_description_density
project_essay_1_sub
project_essay_2_sub
project_essay_2_pol
project_essay_1_density
dayofyear_count
project_essay_2_char_count
project_essay_1_pol
total_price
price
project_essay_1_char_count
project_title_density
project_essay_2_stopword
project_essay_2_punctuation_count
min_price
min_total_price
project_subject_subcategories_count
resource_description_sub
project_resource_summary_char_count
resource_description_pol
project_essay_2_word_count
max_price""".split("\n")

print("Robust Scaling")
# Remaining missing values are replaced with the constant 99 before scaling.
normdf = normdf.fillna(99)
# The scaler is fitted on every row of normdf (train and test rows together).
std = RobustScaler()
normdf[relevant_cols] = pd.DataFrame(
    std.fit_transform(normdf[relevant_cols]), index=normdf.index
)


# Scaled numeric features, split back into the train and test rows.
traindf = normdf.loc[normdf['is_train'] == 1, relevant_cols]
testdf = normdf.loc[normdf['is_test'] == 1, relevant_cols]

dataset preparation
Label Encoding
Robust Scaling


In [12]:
# Load the pre-computed sparse text matrices for the training rows and stack
# them column-wise into two wide CSR matrices (one per vector family).
print("Loading Vectors")
textColumns = ['project_essay_1', 'project_essay_2', 'project_resource_summary', 'resource_description', 'project_title']

tr_vects = []        # matrices loaded from new_tr_<column>.npz
# Fix: this list was only "initialised" in a commented-out line, so the
# append in the loop below raised NameError on a fresh kernel.
tr_vects_char = []   # matrices loaded from new_char_tr_<column>.npz
pred_vects_tr = []   # only consumed by the disabled LightGBM stacking cell

for col in textColumns:
    tr_vects.append(sparse.load_npz("../models/vectors/new_tr_" + col + ".npz"))
    tr_vects_char.append(sparse.load_npz("../models/vectors/new_char_tr_" + col + ".npz"))

print("Stacking Vectors")
# Numeric features of the training rows, in the same row order as Y.
num_train = normdf[normdf['is_train'] == 1][relevant_cols]
tr_vect1 = hstack(tr_vects, 'csr')
tr_vect2 = hstack(tr_vects_char, 'csr')

# The per-column matrices are no longer needed once stacked; release them.
del tr_vects, tr_vects_char
gc.collect()

Loading Vectors
Stacking Vectors


220

In [14]:
# ## new 
# pred_vects_tr.append(csr_matrix(traindf))
# X_train_stack = hstack(pred_vects_tr, 'csr')

# train_preds = []
# predsdf = pd.DataFrame()
# for i in range(1,10):
#     print i 
#     model = lgb.Booster(model_file='../models/uplgb'+str(i)+'.txt')
#     predsdf["weak_"+str(i)] = model.predict(X_train_stack, num_iteration=model.best_iteration)

In [18]:
# Hold out 20% of the training rows for validation. All feature blocks and
# the target are split in ONE call so they are guaranteed to share the same
# row partition (the original repeated the call per block with the same
# random_state, which yields the same partition but re-assigns y_train /
# y_valid each time).
(X_train_tfidf1, X_valid_tfidf1,
 X_train_tfidf2, X_valid_tfidf2,
 X_train1, X_valid1,
 y_train, y_valid) = train_test_split(
    tr_vect1, tr_vect2, num_train, Y, test_size=0.20, random_state=42)

# Fix: `predsdf` is only built by the LightGBM stacking cell, which is
# commented out, so splitting it raised NameError on a fresh kernel. The
# resulting X_train2 / X_valid2 / size_numfeats2 are not used by the model
# defined below. Re-enable these lines together with that cell if needed.
# X_train2, X_valid2 = train_test_split(predsdf, test_size=0.20, random_state=42)
# size_numfeats2 = len(predsdf.columns)

X_train_target = y_train

# Input widths for the three network inputs.
size_tfidf1 = X_train_tfidf1.shape[1]
size_tfidf2 = X_train_tfidf2.shape[1]
size_numfeats1 = len(X_train1.columns)

# Complete Training Purposes

# X_train_tfidf1 = tr_vect1
# X_train_tfidf2 = tr_vect2
# X_train1 = num_train
# X_train_target = Y

# size_tfidf1 = X_train_tfidf1.shape[1]
# size_tfidf2 = X_train_tfidf2.shape[1]
# size_numfeats1 = len(X_train1.columns)

In [29]:
# Three-input network: one dense input for the scaled numeric features and
# two sparse inputs for the stacked text matrices. Each input goes through
# its own Dense layer; the branches are concatenated, passed through one
# more Dense layer and reduced to a single sigmoid output.
input_num1 = Input((size_numfeats1, ))
input_tfidf1 = Input((size_tfidf1, ), sparse=True)
input_tfidf2 = Input((size_tfidf2, ), sparse=True)

layer_num1 = Dense(512, activation='relu')(input_num1)

# Fix: the original first built Dense(256)(input_tfidf1) and then immediately
# overwrote the name with Dense(512)(input_tfidf1). The 256-unit layer never
# reached the output but still allocated a (size_tfidf1 x 256) weight matrix.
# Only the layer that is actually part of the graph is kept, so the trained
# architecture is unchanged.
# NOTE(review): if the two layers were meant to be stacked, apply the second
# Dense to the output of the first instead of to input_tfidf1 — confirm.
layer_tfidf1 = Dense(512, activation="relu")(input_tfidf1)

layer_tfidf2 = Dense(256, activation="relu")(input_tfidf2)

output = concatenate([layer_num1, layer_tfidf1, layer_tfidf2])
output = Dense(512, activation="relu")(output)

# Single probability for the binary target.
output = Dense(1, activation="sigmoid")(output)

# combine the model
model = Model(inputs=[input_num1, input_tfidf1, input_tfidf2], outputs=output)
model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the network on the training split, then score the held-out split.
from keras.callbacks import EarlyStopping

# Stop once val_loss has failed to improve for 2 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss',
                          min_delta=0,
                          patience=2,
                          verbose=0,
                          mode='auto')

# Input order must match the `inputs=` list given to Model(...).
train_inputs = [X_train1, X_train_tfidf1, X_train_tfidf2]
valid_inputs = [X_valid1, X_valid_tfidf1, X_valid_tfidf2]

# validation_split makes Keras set aside a further 20% of the training split
# to compute val_loss; the X_valid* / y_valid split is only used afterwards.
model.fit(
    train_inputs,
    X_train_target,
    validation_split=0.20,
    batch_size=512,
    epochs=5,
    callbacks=[earlystop]
)

val_pred = model.predict(valid_inputs)
print(roc_auc_score(y_valid, val_pred))

Train on 116531 samples, validate on 29133 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [60]:
## predicting on test data
# Loads the sparse text matrices for the test rows, stacks them the same way
# as the training matrices, runs the trained model and writes a submission.

print("Loading Vectors")
textColumns = ['project_essay_1', 'project_essay_2', 'project_resource_summary', 'resource_description', 'project_title']
ts_vects = []
ts_vects_char = []
# NOTE(review): these paths (and "sub/..." below) are relative to a different
# directory than the "../models/..." paths used by the earlier cells —
# confirm which working directory the notebook is meant to run from.
for col in textColumns:
    ts_vects.append(sparse.load_npz("models/vectors/new_" + col + ".npz"))
    ts_vects_char.append(sparse.load_npz("models/vectors/new_char_" + col + ".npz"))

print("stacking")
ts_vect1 = hstack(ts_vects, 'csr')
ts_vect2 = hstack(ts_vects_char, 'csr')
# The recorded run of this cell ended in a MemoryError. Release the
# per-column matrices as soon as the stacked copies exist, as the training
# cell does, so both copies are not held in memory during prediction.
del ts_vects, ts_vects_char
gc.collect()

num_test = normdf[normdf['is_test'] == 1][relevant_cols]

print("predicting")
test_preds = model.predict([num_test, ts_vect1, ts_vect2])

# The model ends in Dense(1), so predictions come back as an (n, 1) array;
# flatten it, and take the raw id values, so the frame is built from two
# plain 1-D arrays of equal length rather than relying on index alignment.
sub = pd.DataFrame(
    {'id': testid.values, 'project_is_approved': test_preds.ravel()},
    columns=['id', 'project_is_approved'],
)
sub.to_csv("sub/nn_ps.csv", index = False)

Loading Vectors
stacking


MemoryError: 

In [22]:
# Drop the notebook's reference to the Keras model and force a garbage
# collection pass. Raises NameError if `model` is not currently defined.
# gc.collect() is the last expression, so its return value (the number of
# unreachable objects found) is shown as the cell output.
import gc 
del model
gc.collect()

866

In [None]:
# 0.7791028157116862 - 512,256,512 (no dropout)
# 0.7811179176857501 - same + chars (train loss : 0.3711)