In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

import regex as re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model,Sequential
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Competition CSVs: training rows, test rows, and the sample submission
# that is later filled in with predictions and written back out.
train = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv')
test = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv')
sub = pd.read_csv('/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv')
# Path (not yet opened) to the 50-dimensional GloVe text file used to
# initialise the embedding matrix further down.
embedding  = '/kaggle/input/glove-embeddings/glove.6B.50d.txt'

In [None]:
def basic_exploration(df):
    """Print a quick textual overview of a DataFrame.

    Prints, in order: the first five rows, the shape, the
    ``DataFrame.info()`` report, and a per-column flag telling whether the
    column contains any null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    print(f"-----STARTED--EXPLORING----\n")
    print(f"\n {df.head(5)}\n")
    print(f"\n the size of dataframe is {df.shape}\n")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so embedding the call in an f-string printed a stray "None" after the
    # report. Call it directly and emit the surrounding blank lines here.
    print()
    df.info()
    print()
    print(f"\n{df.isnull().any()}\n")
    print(f"-----ENDED EXPLORING-------\n")

# Summarise the training frame first, then the test frame.
for frame in (train, test):
    basic_exploration(frame)

In [None]:
def hist(df):
    """Show a count plot of the values held in the ``id`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing an ``id`` column; each distinct value becomes one bar
        whose height is the number of rows carrying that value.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Pass the data by keyword: seaborn >= 0.12 made every argument after
    # ``data`` keyword-only, so a bare positional Series is no longer
    # interpreted as ``x``.
    sns.countplot(x=df.id)
    plt.xlabel("Frequency")
    # Tick labels are rotated so that many distinct values stay readable.
    plt.xticks(rotation = 90)
    plt.show()
    
def eda(df):
    """Print headline statistics about anchors and scores, then plot counts.

    Reports the number of distinct ``anchor`` values, the min/max of
    ``score``, and the 20 anchors with the most rows. Finally hands the 50
    most frequent and the 50 least frequent anchors (per-anchor row counts)
    to ``hist``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``anchor``, ``score`` and ``id`` columns.
    """
    distinct_anchors = np.unique(df.anchor).size
    print(f"Total no. of unique anchors {distinct_anchors}\n")
    lowest, highest = np.min(df.score), np.max(df.score)
    print(f"Min value of score : {lowest} and Max value of score : {highest}\n")

    # One group-by feeds every ranking below: per-anchor non-null row counts.
    per_anchor = df.groupby('anchor').count()
    most_common_first = per_anchor.sort_values('score', axis = 0, ascending = False)
    print(f"Top 20 anchor categories are {most_common_first['id'][:20]}")

    top = most_common_first[:50]
    bottom = per_anchor.sort_values('score', axis = 0, ascending = True)[:50]
    for subset in (top, bottom):
        hist(subset)
# Run the anchor/score overview on the training frame.
eda(train)
    
    

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stops = set(stopwords.words("english"))
def clean_content(table):
    """Turn each entry of ``table.target`` into a list of cleaned tokens.

    Every string is lower-cased, stripped of non-alphanumeric characters,
    tokenised with ``word_tokenize`` and filtered against ``stops``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame exposing a string column named ``target``.

    Returns
    -------
    pandas.Series
        One list of tokens per input row.
    """
    def _to_tokens(text):
        lowered = text.lower()
        # Replace a leading and/or trailing run of non-word characters by one space
        trimmed = re.sub(r'^\W+|\W+$',' ',lowered)
        # Map every whitespace character (tab, newline, ...) to a plain space
        spaced = re.sub(r'\s',' ',trimmed)
        # Blank out anything that is not an ASCII letter or digit
        alnum_only = re.sub(r'[^a-zA-Z0-9]',' ',spaced)
        # Tokenise, then drop stop words
        return [token for token in word_tokenize(alnum_only) if token not in stops]

    return table.target.apply(_to_tokens)
# Attach the cleaned token lists to both frames as a new 'words' column.
for frame in (train, test):
    frame['words'] = clean_content(frame)

In [None]:
def wordcloud(df):
    """Render a word cloud built from every token in ``df.words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``words`` column holds an iterable of string tokens per
        row.

    Returns
    -------
    None
        The image is displayed with ``plt.show()``.
    """
    # One leading blank, then each token preceded by a blank.
    corpus = " " + "".join(" " + token for tokens in df.words for token in tokens)
    plt.figure(figsize=(8,10))
    cloud = WordCloud(background_color= 'white', max_words=1000,random_state=1)
    rendered = cloud.generate(corpus)
    plt.imshow(rendered)
    plt.show()
# One cloud per dataset: training tokens first, then test tokens.
for frame in (train, test):
    wordcloud(frame)

In [None]:
def Stemming(df):
    """Lemmatise every token of ``df['words']`` in place.

    Despite its name, this uses ``WordNetLemmatizer`` (not a stemmer) and
    treats each token as a verb (``pos='v'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``words`` column holds a list of tokens per row. The
        column is overwritten with the lemmatised lists.

    Returns
    -------
    None
    """
    lemmatizer = WordNetLemmatizer()

    def _as_verbs(tokens):
        return [lemmatizer.lemmatize(token, pos='v') for token in tokens]

    df['words'] = df['words'].apply(_as_verbs)
# Lemmatise the token lists of both frames (each frame is modified in place).
for frame in (train, test):
    Stemming(frame)

In [None]:
# NumPy array view of the per-row token lists of the training frame.
list_of_words_train = train.words.values

In [None]:
# Preview only the first few token lists instead of dumping the whole array.
list_of_words_train[:5]

In [None]:
# First five rows of the test frame (head() defaults to n=5).
test.head()

In [None]:
# Shared settings for tokenisation, padding and the embedding layer.
embed_size = 50          # width of one embedding vector (the GloVe file used is the 50d variant)
max_features = 10_000    # vocabulary cap handed to the tokenizer
maxlen = 100             # length every token sequence is padded/truncated to

In [None]:
# Inputs are the cleaned token lists; the regression target is the `score` column.
x_train = train['words']
y_train = train['score']
# Fit the vocabulary on the training tokens only; num_words limits the
# tokenizer to max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
# Convert each token list to a list of integer word indices ...
seq_train = tokenizer.texts_to_sequences(x_train)
# ... and bring every sequence to exactly maxlen entries.
X_train = pad_sequences(seq_train, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients as a float32 vector.

    Parameters
    ----------
    word : str
        The token (first field of an embedding line).
    *arr
        The remaining fields; anything NumPy can convert to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 ndarray.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into {token: float32 vector}.
# The context manager closes the file handle (the bare open() leaked it) and
# the explicit encoding avoids depending on the platform default when
# decoding non-ASCII tokens. Blank lines are skipped so they cannot reach
# get_coefs without a token.
with open(embedding, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in glove_file if line.strip()
    )
# np.stack requires a real sequence; a dict view is rejected by recent NumPy
# releases, so materialise the vectors into a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / standard deviation over all coefficients; used to draw random
# vectors for words that have no pre-trained embedding.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index
# Keras Tokenizer indices start at 1 (0 is left for padding), so a vocabulary
# of V words produces indices 1..V and needs V + 1 rows. Without the "+ 1" the
# highest-index word was skipped below and, whenever V < max_features, fell
# outside the Embedding layer's input range.
nb_words = min(max_features, len(word_index) + 1)
# Start from random vectors that follow the GloVe coefficient distribution;
# rows for words found in GloVe are overwritten with the pre-trained vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Indices at or beyond nb_words have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding (initialised from embedding_matrix) -> bidirectional LSTM ->
# max over time -> small dense head with a single linear output,
# trained as a regression with mean squared error.
model = Sequential([
    Embedding(nb_words, embed_size, weights=[embedding_matrix]),
    Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    GlobalMaxPool1D(),
    Dense(50, activation='relu'),
    Dropout(0.1),
    Dense(1),
])
model.compile(optimizer='rmsprop', loss='mse')

In [None]:
# Train for two epochs, holding out 10% of the training rows for validation.
# The trailing ';' keeps the returned History object out of the cell output.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
);

In [None]:
# First five rows of the test frame (head() defaults to n=5).
test.head()

In [None]:
# Encode the test token lists with the tokenizer fitted on the training data,
# then pad them to the same fixed length as the training sequences.
x_test = tokenizer.texts_to_sequences(test['words'])
X_test = pad_sequences(x_test, maxlen=maxlen)

In [None]:
# Score every padded test sequence with the trained model.
# NOTE(review): the network ends in Dense(1), so this is presumably an
# (n_rows, 1) array - confirm before relying on its shape.
predict = model.predict(X_test)
# Store the predictions next to the test rows they belong to.
test['label'] = predict
test.head()

In [None]:
# Copy the predicted labels into the submission frame's 'score' column and
# write the result to disk without the index column.
ans = test['label']
sub['score'] = ans
sub.to_csv("submission.csv", index=False)

In [None]:
# First five rows of the submission frame (head() defaults to n=5).
sub.head()