In [14]:
import pandas as pd
from importlib import reload
import feature_generators as fg
import preproc as pp
import os
import numpy as np

from sklearn.metrics import roc_auc_score
reload(pp)
reload(fg)

<module 'feature_generators' from 'C:\\Users\\ryanz\\OneDrive\\code base\\python\\machine learning\\question pairs\\feature_generators.py'>

#### settings

In [12]:
data_needed = 40000

raw_file = "data/train.csv"
pp_file  = "data/pp.csv"
mat_file = "data/mat.csv"
raw_drop = ["id", "question1", "question2"]

# pre-processing

In [13]:
raw = pd.read_csv(raw_file, nrows = data_needed)
try:
    pproc = pd.read_csv(pp_file)
    row_count = pproc.shape[0]
    if row_count < data_needed:
        print("Pre-Processing %d data..."%(data_needed - row_count))
        pproc = pproc.append(pp.stemmer(raw.loc[row_count:]))
        pproc.to_csv(pp_file, index=False)

except:
    # don't have preproessed data yet.
    print("Pre-Processing %d data..."%(data_needed))
    new_pp = pp.stemmer(raw)
    new_pp.to_csv(pp_file, index=False)
    
print("pre-processed data shape:", pd.read_csv(pp_file).shape)
pd.read_csv(pp_file).head(5)

Pre-Processing 35000 data...
pre-processed data shape: (40000, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guid to invest in sha...,what is the step by step guid to invest in sha...,0
1,1,3,4,what is the stori of kohinoor ( koh-i-noor ) d...,what would happen if the indian govern stole t...,0
2,2,5,6,how can i increas the speed of my internet con...,how can internet speed be increas by hack thro...,0
3,3,7,8,whi am i mental veri lone ? how can i solv it ?,find the remaind when [ math ] 23^ { 24 } [ /m...,0
4,4,9,10,"which one dissolv in water quik sugar , salt ,...",which fish would surviv in salt water ?,0


In [8]:
#os.remove(mat_file)

# generating feature matrix

In [15]:
def add_feature(feat_func, pp_file, mat_file):
    ff_name = pp.get_method_abbre(feat_func.__name__)
    try:
        df_pp = pd.read_csv(pp_file)
        mat = pd.read_csv(mat_file)
        
        try:
            # check if feature already exists
            start_idx = mat[ff_name].last_valid_index()
            print(start_idx+1 ,"vs", df_pp.shape[0])
            if start_idx + 1 < df_pp.shape[0]:
                print("extending %s col from start_idx: %d..."%(ff_name,start_idx+1))
                df_pp = df_pp.loc[start_idx+1:]
                sub = '_sub_'
                mat = mat.rename(index=str, columns={ff_name:sub})
                cp_col = mat[sub][:start_idx+1].append(df_pp.apply(feat_func, axis=1, raw=True))
                cp_col = cp_col.to_frame(ff_name)
                cp_col.reset_index(drop=True, inplace=True)
                mat.reset_index(drop=True, inplace=True)
                new_df = cp_col.join(mat)
                new_df = new_df.drop([sub], axis = 1)
                new_df.to_csv(mat_file, index=False)
                
        # if feature doesn't alreay exist -> add a new col
        except:
            print("adding a new feature col")
            new_col = df_pp.apply(feat_func, axis=1, raw=True)
            new_col = new_col.to_frame(ff_name)
            new_df = pd.concat([mat,new_col],axis=1)
            new_df.to_csv(mat_file, index=False)

    # mat_file does not exist
    except:
        print("creating mat...")
        mat = pd.DataFrame()
        mat[ff_name] = df_pp.apply(feat_func, axis=1, raw=True)
        mat.to_csv(mat_file, index=False)

    print(ff_name,"applied, Mat shape: ", pd.read_csv(mat_file).shape)
    

In [16]:
add_feature(fg.shared_percentage, pp_file, mat_file, )
print("---------------------------------------")
print("---------------------------------------")

add_feature(fg.longest_common_substr_prop, pp_file, mat_file)
print("---------------------------------------")
print("---------------------------------------")
add_feature(fg.is_first_word_identical, pp_file, mat_file)
print("---------------------------------------")
print("---------------------------------------")
add_feature(fg.is_in_same_cat, pp_file, mat_file)
pd.read_csv(mat_file).head(5)

5000 vs 40000
extending sp col from start_idx: 5000...
sp applied, Mat shape:  (40000, 4)
---------------------------------------
---------------------------------------
5000 vs 40000
extending lcsp col from start_idx: 5000...
lcsp applied, Mat shape:  (40000, 4)
---------------------------------------
---------------------------------------
5000 vs 40000
extending ifwi col from start_idx: 5000...
ifwi applied, Mat shape:  (40000, 4)
---------------------------------------
---------------------------------------
5000 vs 40000
extending iisc col from start_idx: 5000...
iisc applied, Mat shape:  (40000, 4)


Unnamed: 0,iisc,ifwi,lcsp,sp
0,0.0,1.0,0.910569,0.5
1,0.0,1.0,0.468085,0.444444
2,0.0,1.0,0.162602,0.307692
3,0.0,0.0,0.033898,0.125
4,1.0,1.0,0.123894,0.25


In [17]:
pd.read_csv(mat_file)

Unnamed: 0,iisc,ifwi,lcsp,sp
0,0.0,1.0,0.910569,0.500000
1,0.0,1.0,0.468085,0.444444
2,0.0,1.0,0.162602,0.307692
3,0.0,0.0,0.033898,0.125000
4,1.0,1.0,0.123894,0.250000
5,1.0,0.0,0.170455,0.292683
6,0.0,0.0,0.050000,0.117647
7,0.0,0.0,0.328767,0.222222
8,1.0,1.0,0.400000,0.272727
9,1.0,0.0,0.192982,0.347826


# split data for training, validation, testing

In [72]:
from sklearn.model_selection import train_test_split

data = pd.read_csv(mat_file)
targ = pd.read_csv(pp_file)

Y = targ['is_duplicate']
X = data.join(targ.drop(raw_drop, axis = 1))
X = X.drop(['is_duplicate'],axis = 1)

xtr, xte, ytr, yte = train_test_split(X, Y, test_size=0.4, random_state=4242)

#### Logistic regression

In [55]:
xtr, xte, ytr, yte = train_test_split(X, Y, test_size=0.4, random_state=4242)

In [57]:
from sklearn.linear_model import LogisticRegression as lr

model = lr().fit(xtr,ytr)
predicted_LR = model.predict(xte)
print('AUC:', roc_auc_score(yte, predicted_LR))
print('accuracy:', sum([int(int(l>0.5)==r) for l,r in zip(predicted_LR,yte)]), "/" ,len(predicted_LR), "=", \
      sum([int(int(l>0.5)==r) for l,r in zip(predicted_LR,yte)])/len(predicted_LR))

AUC: 0.5
accuracy: 9959 / 16000 = 0.6224375


#### extreme gradient boost tree

In [58]:
import xgboost as xgb
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 6

d_train = xgb.DMatrix(xtr, label=ytr)
d_valid = xgb.DMatrix(xte, label=yte)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=90, verbose_eval=10)

[0]	train-logloss:0.683964	valid-logloss:0.686693
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 90 rounds.
[10]	train-logloss:0.608765	valid-logloss:0.63598
[20]	train-logloss:0.555219	valid-logloss:0.603131
[30]	train-logloss:0.515726	valid-logloss:0.58183
[40]	train-logloss:0.486162	valid-logloss:0.567844
[50]	train-logloss:0.463297	valid-logloss:0.558896
[60]	train-logloss:0.445663	valid-logloss:0.553618
[70]	train-logloss:0.43161	valid-logloss:0.550867
[80]	train-logloss:0.420488	valid-logloss:0.549788
[90]	train-logloss:0.41164	valid-logloss:0.550029
[100]	train-logloss:0.404272	valid-logloss:0.551221
[110]	train-logloss:0.398279	valid-logloss:0.552808
[120]	train-logloss:0.393327	valid-logloss:0.554908
[130]	train-logloss:0.389122	valid-logloss:0.557323
[140]	train-logloss:0.385519	valid-logloss:0.559782
[150]	train-logloss:0.382407	valid-logloss:0.562065
[160]	train-logloss:0.379771	val

In [59]:
d_te = xgb.DMatrix(xte)
p_te = bst.predict(d_te)
print('Original AUC:', roc_auc_score(yte, p_te))
pp_train = []
for i in p_te[:]:
    if i > 0.5:
        pp_train.append(1)
    else:
        pp_train.append(0)
print('accuracy:', sum([int(l==r) for l,r in zip(pp_train,yte)]), "/" ,len(pp_train), "=", \
      sum([int(l==r) for l,r in zip(pp_train,yte)])/len(pp_train))

Original AUC: 0.7710869157154663
accuracy: 11116 / 16000 = 0.69475


#### neural network

In [61]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1)
clf.fit(xtr, ytr)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [62]:
p_nn = clf.predict(xte)
print('AUC:', roc_auc_score(yte, p_nn))
print('accuracy:', sum([int(int(l>0.5)==r) for l,r in zip(p_nn,yte)]), "/" ,len(p_nn), "=", \
      sum([int(int(l>0.5)==r) for l,r in zip(p_nn,yte)])/len(p_nn))

AUC: 0.49994979415603974
accuracy: 9958 / 16000 = 0.622375


#### recurrent neural network

In [63]:
import keras
from keras.models import Sequential

from keras.layers import LSTM
from keras.layers import Dense, Embedding, SimpleRNN

xtr_rnn, xva_rnn, ytr_rnn, yva_rnn = train_test_split(xtr, ytr, test_size=0.2, random_state=4242)

# ----------------training data----------------------

x_arr = xtr_rnn.values
y_arr = ytr_rnn.values

row = x_arr.shape[0]
col = x_arr.shape[1]

x_arr = x_arr.reshape(row,col,1)

# ----------------validation data----------------------

xva_arr = xva_rnn.values
yva_arr = yva_rnn.values

row = xva_arr.shape[0]
col = xva_arr.shape[1]
 
x_va = xva_arr.reshape(row,col,1)
y_va = yva_arr

In [64]:
model=Sequential()

# model.add(SimpleRNN(input_dim=1, output_dim=50))

model.add(LSTM(input_dim=1, output_dim=50))
model.add(Dense(output_dim=1, activation = "sigmoid"))
model.compile(loss="mse", optimizer="rmsprop")

adam = keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(x_arr, y_arr, validation_data=(x_va, y_va), nb_epoch=10, batch_size=32)

  """
  """
  


Train on 19200 samples, validate on 4800 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23147fe8438>

In [67]:
xte_arr = xte.values
yte_arr = yte.values

row = xte_arr.shape[0]
col = xte_arr.shape[1]
 
xte_arr = xte_arr.reshape(row,col,1)
yte_arr = yte_arr    
    
scores = model.evaluate(xte_arr, yte_arr, verbose=1)
print("Elman RNN Test Accuracy: %.3f%%" % (scores[1]*100))

Elman RNN Test Accuracy: 69.525%
