In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/donors-choose/preprocessed_data.csv
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt


In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
embeddings_index = dict()
f = open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [4]:
data=pd.read_csv('/kaggle/input/donors-choose/preprocessed_data.csv')

In [5]:
y=data['project_is_approved']

In [6]:
data=data.drop(['project_is_approved'],axis=1)

In [7]:
preproc=[]
for row in data['project_grade_category']:
    row=row.replace('grades','')
    row=row.replace('_prek_2','prek2')
    row=row.replace('_3_5','3to5')
    row=row.replace('_6_8','6to8')
    row=row.replace('_9_12','9to12')
    preproc.append(row)
    
data['project_grade_category']=preproc

In [8]:
preproc=[]
for row in data['clean_categories']:
    row=row.replace(' ','')
    row=row.replace('_','')
    preproc.append(row)
    
data['clean_categories']=preproc

In [9]:
preproc=[]
for row in data['clean_subcategories']:
    row=row.replace(' ','')
    row=row.replace('_','')
    preproc.append(row)
    
data['clean_subcategories']=preproc

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, stratify=y)

In [12]:
X_train.columns

Index(['school_state', 'teacher_prefix', 'project_grade_category',
       'teacher_number_of_previously_posted_projects', 'clean_categories',
       'clean_subcategories', 'essay', 'price'],
      dtype='object')

In [13]:
from keras.models import Sequential,Model
from keras.layers import Dense,BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM,Input,Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.layers import BatchNormalization
from keras.initializers import he_normal

Using TensorFlow backend.


# Model 2

### Essay

In [14]:
tfidf = TfidfVectorizer(min_df=10,ngram_range=(1,1))
tfidf.fit(X_train['essay'])
print("Total number of features ",len(tfidf.get_feature_names()))

Total number of features  14219


In [16]:
feature_names = np.asarray(tfidf.get_feature_names()) # getting all words
idf_score = tfidf.idf_
index = []
for i in range(len(idf_score)):
    if idf_score[i] >= 2 and idf_score[i] <=10:
        index.append(i)
        
important_words = []
for i in index:
    important_words.append(feature_names[i])

In [17]:
from tqdm import tqdm
x_train_essay_new = []
for sentence in tqdm(X_train['essay']):
    sen = []
    for word in sentence.split():
        if word in important_words:
            sen.append(word)      
    x_train_essay_new.append(' '.join(sen))

100%|██████████| 73196/73196 [32:50<00:00, 37.15it/s]


In [18]:
x_test_essay_new = []
for sentence in tqdm(X_test['essay']):
    sen = []
    for word in sentence.split():
        if word in important_words:
            sen.append(word)    
    x_test_essay_new.append(' '.join(sen))

100%|██████████| 36052/36052 [16:11<00:00, 37.10it/s]


In [19]:
text_essay = Tokenizer()
text_essay.fit_on_texts(x_train_essay_new)
vocab_size = len(text_essay.word_index) + 1
encoded_train_new = text_essay.texts_to_sequences(x_train_essay_new)
encoded_test_new = text_essay.texts_to_sequences(x_test_essay_new)

In [20]:
max_length = 100
padded_train_new = pad_sequences(encoded_train_new, maxlen=max_length, padding='post')
padded_test_new = pad_sequences(encoded_test_new, maxlen=max_length, padding='post')


In [21]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in text_essay.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
input_essay_tfidf = Input(shape=(100,))

embedding = Embedding(vocab_size,100,weights=[embedding_matrix],input_length=100,trainable=False)(input_essay_tfidf)
lstm=LSTM(100,return_sequences=True)(embedding)
flatten_21=Flatten()(lstm)

### School State

In [23]:
def self_token(column):
    unique = list(set(column))
    total = list(column)
    size = len(unique)
    count = []
    for category in unique:
        count.append([total.count(category),category])
    count.sort()
    rank = {}
    for i in range(1,len(count)+1):
        rank.update({count[i-1][1] : i})
    return (rank , unique,size)

In [24]:
state_rank, unique,size = self_token(X_train['school_state'])
print(state_rank)
state_size = size
encoded_state_train = []
encoded_state_test = []
for state in X_train['school_state']:
    encoded_state_train.append(state_rank[state])

for state in X_test['school_state']:
    if state in unique:
        encoded_state_test.append(state_rank[state]) 
    else:
        encoded_state_test.append(0) 
    
encoded_state_train = np.asarray(encoded_state_train)
encoded_state_test = np.asarray(encoded_state_test)

print(encoded_state_train.shape)
print(encoded_state_test.shape)

{'vt': 1, 'wy': 2, 'nd': 3, 'mt': 4, 'sd': 5, 'ne': 6, 'ri': 7, 'de': 8, 'ak': 9, 'nh': 10, 'wv': 11, 'dc': 12, 'hi': 13, 'me': 14, 'nm': 15, 'ks': 16, 'ia': 17, 'id': 18, 'ar': 19, 'mn': 20, 'co': 21, 'or': 22, 'ms': 23, 'ky': 24, 'nv': 25, 'md': 26, 'ct': 27, 'tn': 28, 'ut': 29, 'al': 30, 'wi': 31, 'va': 32, 'az': 33, 'nj': 34, 'ok': 35, 'wa': 36, 'la': 37, 'ma': 38, 'oh': 39, 'in': 40, 'mo': 41, 'pa': 42, 'mi': 43, 'sc': 44, 'ga': 45, 'il': 46, 'nc': 47, 'fl': 48, 'ny': 49, 'tx': 50, 'ca': 51}
(73196,)
(36052,)


In [25]:
input_state = Input(shape=(1,))
embedding=Embedding(state_size,10, input_length=1,trainable=False)(input_state)
flatten_1=Flatten()(embedding)

### Project Grade Category

In [26]:
p_grade_rank, unique,size = self_token(X_train['project_grade_category'])
print(p_grade_rank)
project_grade_categories_size = size
encoded_p_grade_train = []
encoded_p_grade_test = []
for grade in X_train['project_grade_category']:
    encoded_p_grade_train.append(p_grade_rank[grade])

for grade in X_test['project_grade_category']:
    if grade in unique:
        encoded_p_grade_test.append(p_grade_rank[grade]) 
    else:
        encoded_p_grade_test.append(0) 
    
encoded_p_grade_train = np.asarray(encoded_p_grade_train)
encoded_p_grade_test = np.asarray(encoded_p_grade_test)

print(encoded_p_grade_train.shape)
print(encoded_p_grade_test.shape)

{'9to12': 1, '6to8': 2, '3to5': 3, 'prek2': 4}
(73196,)
(36052,)


In [27]:
input_grade = Input(shape=(1,))
embedding=Embedding(project_grade_categories_size,10, input_length=1,trainable=False)(input_grade)
flatten_2=Flatten()(embedding)

### Clean Categories

In [28]:
cat_rank, unique,size = self_token(X_train['clean_categories'])
print(cat_rank)
categories_size = size
encoded_cat_train = []
encoded_cat_test = []
for category in X_train['clean_categories']:
    encoded_cat_train.append(cat_rank[category])

for category in X_test['clean_categories']:
    if category in unique:
        encoded_cat_test.append(cat_rank[category]) 
    else:
        encoded_cat_test.append(0) 
    
encoded_cat_train = np.asarray(encoded_cat_train)
encoded_cat_test = np.asarray(encoded_cat_test)

print(encoded_cat_train.shape)
print(encoded_cat_test.shape)

{'musicartswarmthcarehunger': 1, 'historycivicshealthsports': 2, 'mathsciencewarmthcarehunger': 3, 'appliedlearningwarmthcarehunger': 4, 'literacylanguagewarmthcarehunger': 5, 'musicartsappliedlearning': 6, 'specialneedswarmthcarehunger': 7, 'musicartshistorycivics': 8, 'musicartshealthsports': 9, 'healthsportswarmthcarehunger': 10, 'historycivicsappliedlearning': 11, 'healthsportshistorycivics': 12, 'specialneedshealthsports': 13, 'literacylanguagehealthsports': 14, 'musicartsspecialneeds': 15, 'healthsportsmusicarts': 16, 'appliedlearninghistorycivics': 17, 'healthsportsappliedlearning': 18, 'historycivicsspecialneeds': 19, 'healthsportsmathscience': 20, 'historycivicsmathscience': 21, 'specialneedsmusicarts': 22, 'historycivicsmusicarts': 23, 'mathsciencehealthsports': 24, 'appliedlearninghealthsports': 25, 'literacylanguageappliedlearning': 26, 'mathsciencehistorycivics': 27, 'literacylanguagehistorycivics': 28, 'appliedlearningmusicarts': 29, 'healthsportsliteracylanguage': 30, 'a

In [29]:
input_cat = Input(shape=(1,))
embedding=Embedding(categories_size,10, input_length=1,trainable=False)(input_cat)
flatten_3=Flatten()(embedding)

### Clean Subcategories

In [30]:
subcat_rank, unique,size = self_token(X_train['clean_subcategories'])
print(subcat_rank)
subcategories_size = size
encoded_subcat_train = []
encoded_subcat_test = []
for category in X_train['clean_subcategories']:
    encoded_subcat_train.append(subcat_rank[category])

for category in X_test['clean_subcategories']:
    if category in unique:
        encoded_subcat_test.append(subcat_rank[category]) 
    else:
        encoded_subcat_test.append(0) 
    
encoded_subcat_train = np.asarray(encoded_subcat_train)
encoded_subcat_test = np.asarray(encoded_subcat_test)

print(encoded_subcat_train.shape)
print(encoded_subcat_test.shape)

{'civicsgovernmentextracurricular': 1, 'civicsgovernmentforeignlanguages': 2, 'civicsgovernmenthealthwellness': 3, 'civicsgovernmentnutritioneducation': 4, 'civicsgovernmentparentinvolvement': 5, 'collegecareerprepwarmthcarehunger': 6, 'communityservicefinancialliteracy': 7, 'communityservicemusic': 8, 'earlydevelopmentforeignlanguages': 9, 'earlydevelopmenthistorygeography': 10, 'economicsforeignlanguages': 11, 'economicshealthlifescience': 12, 'economicsother': 13, 'environmentalscienceteamsports': 14, 'esleconomics': 15, 'eslteamsports': 16, 'extracurricularforeignlanguages': 17, 'extracurricularsocialsciences': 18, 'financialliteracyhealthwellness': 19, 'financialliteracyother': 20, 'financialliteracyperformingarts': 21, 'foreignlanguagesgymfitness': 22, 'foreignlanguageshealthlifescience': 23, 'gymfitnessparentinvolvement': 24, 'gymfitnesssocialsciences': 25, 'gymfitnesswarmthcarehunger': 26, 'literaturewritingnutritioneducation': 27, 'otherwarmthcarehunger': 28, 'parentinvolvemen

In [31]:
input_sub = Input(shape=(1,))
embedding=Embedding(subcategories_size,10, input_length=1,trainable=False)(input_sub)
flatten_4=Flatten()(embedding)

### Teacher Prefix

In [32]:
t_prefix_rank, unique,size = self_token(X_train['teacher_prefix'])
print(t_prefix_rank)
teacher_prefix_size =size
encoded_t_prefix_train = []
encoded_t_prefix_test = []
for prefix in X_train['teacher_prefix']:
    encoded_t_prefix_train.append(t_prefix_rank[prefix])

for prefix in X_test['teacher_prefix']:
    if prefix in unique:
        encoded_t_prefix_test.append(t_prefix_rank[prefix]) 
    else:
        encoded_t_prefix_test.append(0) 
    
encoded_t_prefix_train = np.asarray(encoded_t_prefix_train)
encoded_t_prefix_test = np.asarray(encoded_t_prefix_test)

print(encoded_t_prefix_train.shape)
print(encoded_t_prefix_test.shape)

{'dr': 1, 'teacher': 2, 'mr': 3, 'ms': 4, 'mrs': 5}
(73196,)
(36052,)


In [33]:
input_prefix = Input(shape=(1,))
embedding=Embedding(teacher_prefix_size, 10, input_length=1,trainable=False)(input_prefix)
flatten_5=Flatten()(embedding)

### Remaining Numerical Features

In [34]:
from scipy.sparse import hstack

train_price=X_train['price'].values.reshape(-1, 1)
train_number=X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)

test_price=X_test['price'].values.reshape(-1, 1)
test_number=X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)

rem_train=np.concatenate((train_price,train_number),axis=1)
rem_test=np.concatenate((test_price,test_number),axis=1)

In [35]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(rem_train)
rem_train_standard=scaler.transform(rem_train)
rem_test_standard=scaler.transform(rem_test)

In [36]:
from keras.regularizers import l2

In [70]:
rem_feat = Input(shape=(2,))
rem_feat_dense= Dense(128, activation='relu',kernel_regularizer=l2(0.0001))(rem_feat)

### Concatenating Data

In [71]:
train_2=[padded_train_new,encoded_state_train,encoded_p_grade_train,encoded_cat_train,encoded_subcat_train,encoded_t_prefix_train,rem_train_standard]
test_2=[padded_test_new,encoded_state_test,encoded_p_grade_test,encoded_cat_test,encoded_subcat_test,encoded_t_prefix_test,rem_test_standard]

In [73]:
# https://www.youtube.com/watch?v=2U6Jl7oqRkM
# https://www.tensorflow.org/tensorboard/r2/get_started

import tensorflow as tf
from keras.callbacks import TensorBoard,EarlyStopping
from time import time

earlystop_2 = EarlyStopping(monitor = 'val_loss', mode="max",min_delta = 0, patience = 10,verbose = 1,restore_best_weights = True)
tensorboard_2 = TensorBoard("logs932332")

In [74]:
# https://github.com/pranaya-mathur/Donors-Choose-LSTM/blob/master/DonorsChoose_Model_1_13_Aug_19.ipynb
import tensorflow as tf
from sklearn.metrics import roc_auc_score

def auc_score(y_true, y_pred):
    if len(np.unique(y_true[:])) == 1:
        return 0.5
    else:
        return roc_auc_score(y_true, y_pred)

def auc_sc(y_true, y_pred):
    return tf.py_func(auc_score, (y_true, y_pred), tf.double)   

In [75]:
from keras.layers import concatenate
from keras.layers import Dropout
from keras.initializers import glorot_normal
concat = concatenate([flatten_21,flatten_1,flatten_2,flatten_3,flatten_4,flatten_5,rem_feat_dense])

x = Dense(256, activation='relu',kernel_regularizer=l2(0.001),kernel_initializer='glorot_normal')(concat)
x=Dropout(0.5)(x)

x = Dense(128, activation='relu',kernel_regularizer=l2(0.001),kernel_initializer='glorot_normal')(x)
x=Dropout(0.5)(x)
x = BatchNormalization()(x)

x = Dense(64, activation='relu',kernel_regularizer=l2(0.001),kernel_initializer='glorot_normal')(x)

# And finally we add the main logistic regression layer
main_output = Dense(2, activation='softmax')(x)

In [76]:
from keras.utils import np_utils
Y_train=np_utils.to_categorical(y_train)
Y_test=np_utils.to_categorical(y_test)

In [77]:
model2 = Model(inputs=[input_essay_tfidf, input_state, input_grade, input_cat, input_sub, input_prefix,rem_feat], outputs=[main_output])

In [78]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_sc])

In [79]:
model2.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 100)     1419500     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
____________________________________________________________________________________________

In [80]:
model2.fit(train_2,Y_train,batch_size=512, epochs=20,validation_data=(test_2,Y_test),verbose=1,class_weight='balanced',callbacks=[tensorboard_2,earlystop_2])

Train on 73196 samples, validate on 36052 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Restoring model weights from the end of the best epoch
Epoch 00011: early stopping


<keras.callbacks.callbacks.History at 0x7fb1507e6f60>

In [82]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Model", "train_auc", "test_auc"]
x.add_row(["Model_2", 0.7701,0.7343])
print(x)

+---------+-----------+----------+
|  Model  | train_auc | test_auc |
+---------+-----------+----------+
| Model_2 |   0.7701  |  0.7343  |
+---------+-----------+----------+


In [83]:
%load_ext tensorboard.notebook
%tensorboard --logdir logs932332