In [97]:
#import library

import numpy as np 
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
%matplotlib inline

In [98]:
#===Get Data====

#Get train data
df_train = pd.read_csv('../data/given_data/train.csv')

#Get test data
df_test= pd.read_csv("../data/given_data/test.csv")

In [101]:
#===Processing===

#Create stopword List
stops = set(stopwords.words("english"))

In [102]:
#===word_match_share===

#Calculate Jaccard　coefficient between two questions with word count.
def word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

In [103]:
#Apply df_train data to word_match_function
word_match_train = df_train.apply(word_match_share, axis=1, raw=True)

In [104]:
#===tfidf_word_match_share====

#Create word list
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

#Calculate weight
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

eps = 5000 
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [105]:
#Calculate Jaccard　coefficient between two questions with tfidf word count.
def tfidf_word_match_share(row):
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    
    shared_weights = [weights.get(w, 0) for w in q1words.keys() if w in q2words] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]
    
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [106]:
#Apply df_train data to word_match_function
tfidf_word_match_train = df_train.apply(tfidf_word_match_share, axis=1, raw=True)



In [63]:
#===word_count_distance===

#Count word_count_distance
def word_count_distance(row):
    count_word = len(str(row["question1"]).lower().split()) - len(str(row["question2"]).lower().split())
    return np.linalg.norm(count_word)

In [64]:
#Apply df_train data to word_match function
word_count_distance_train = df_train.apply(word_count_distance, axis=1, raw=True)

In [69]:
#Initialize train dataframe
x_train = pd.DataFrame()
y_train = pd.DataFrame()

#Add the column of  predictor valiable(explained valiable)
x_train["word_match"] = word_match_train
x_train["tfidf_word_match"] = tfidf_word_match_train
x_train["word_count_distance"] = word_count_distance_train

#Add the column of objective valiable
y_train["is_duplicate"] = df_train.is_duplicate

In [73]:
#Save x_train
x_train.to_csv("../data/train_data/x_train.csv",index = False)

#Save y_train
y_train.to_csv("../data/train_data/y_train.csv",index = False)

In [87]:
#Load x_train and y_train(from the second time)
x_train = pd.read_csv("../data/train_data/x_train.csv")

y_train = pd.read_csv("../data/train_data/y_train.csv")

In [116]:
#===test data====

#Apply df_test data to word_match_share function
word_match_test= df_test.apply(word_match_share, axis=1, raw=True)

#Apply df_test data to tfidf_word_match_share function
tfidf_word_match_test = df_test.apply(tfidf_word_match_share, axis=1, raw=True)

#Apply df_test data to word_count_distance function
word_count_distance_test= df_test.apply(word_count_distance, axis=1, raw=True)



In [117]:
#Initialize test dataframe
x_test = pd.DataFrame()
y_test = pd.DataFrame()

#Add the column of  predictor valiable(explained valiable)
x_test["word_match"] = word_match_test
x_test["tfidf_word_match"] = tfidf_word_match_test
x_test["word_count_distance"] = word_count_distance_test

In [118]:
#Save x_test
x_test.to_csv("../data/test_data/x_test.csv",index = False)

In [119]:
#Load x_test(from the second time)
x_test = pd.read_csv("../data/test_data/x_test.csv")

In [121]:
y_train = df_train['is_duplicate'].values
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
p = 0.165
scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
while scale > 1:
    neg_train = pd.concat([neg_train, neg_train])
    scale -=1
neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
print(len(pos_train) / (len(pos_train) + len(neg_train)))

x_train = pd.concat([pos_train, neg_train])
y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
del pos_train, neg_train

0.19124366100096607


In [122]:
# Finally, we split some of the data off for validation
from sklearn.cross_validation import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

In [124]:
import lightgbm as lgb

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)


# LightGBM parameters
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': {'multi_logloss'},
        'num_class': 3,
        'learning_rate': 0.1,
        'num_leaves': 23,
        'min_data_in_leaf': 1,
        'num_iteration': 100,
        'verbose': 0
}

# train
gbm = lgb.train(params,
            lgb_train,
            num_boost_round=50,
            valid_sets=lgb_eval,
            early_stopping_rounds=10)



[1]	valid_0's multi_logloss: 0.639237
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 0.600759
[3]	valid_0's multi_logloss: 0.568751
[4]	valid_0's multi_logloss: 0.54181
[5]	valid_0's multi_logloss: 0.518944
[6]	valid_0's multi_logloss: 0.499353
[7]	valid_0's multi_logloss: 0.482507
[8]	valid_0's multi_logloss: 0.467958
[9]	valid_0's multi_logloss: 0.455318
[10]	valid_0's multi_logloss: 0.444285
[11]	valid_0's multi_logloss: 0.434646
[12]	valid_0's multi_logloss: 0.426172
[13]	valid_0's multi_logloss: 0.41875
[14]	valid_0's multi_logloss: 0.412223
[15]	valid_0's multi_logloss: 0.406433
[16]	valid_0's multi_logloss: 0.401353
[17]	valid_0's multi_logloss: 0.396861
[18]	valid_0's multi_logloss: 0.392865
[19]	valid_0's multi_logloss: 0.389295
[20]	valid_0's multi_logloss: 0.386168
[21]	valid_0's multi_logloss: 0.383343
[22]	valid_0's multi_logloss: 0.38082
[23]	valid_0's multi_logloss: 0.378579
[24]	valid_0's multi_logloss: 0.37658
[25]	valid_0's 

In [125]:
#predict y_test with x_test
y_test = gbm.predict(x_test, num_iteration=gbm.best_iteration)

In [128]:
#Initialize submission DataFrame
submission=pd.DataFrame()

#Add the column of test_id
submission['test_id']=df_test['test_id']

#Add the column of 0-class probablity
submission['is_duplicate']=y_test[:,1]

#Save submission
submission.to_csv("../submission/submission_DL.csv", index=False)