## 1. Reading Data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [3]:
# Load the training data; "train.csv" is resolved relative to the current working directory.
train = pd.read_csv("train.csv")

In [4]:
# Preview the first rows to check the available columns.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [5]:
# Label matrix: one column per subject tag, in the order listed here.
label_columns = [
    'Computer Science', 'Physics', 'Mathematics',
    'Statistics', 'Quantitative Biology', 'Quantitative Finance',
]
y = train[label_columns].values

In [6]:
# Number of label columns per row.
len(y[0])

6

## 2. Number of Tags per Question

In [7]:
# Number of tags assigned to each row: sum the label columns across axis 1
# in a single vectorised NumPy call instead of a Python-level loop, then
# convert to a plain list so `tags_sum` keeps its original type.
tags_sum = y.sum(axis=1).tolist()

In [8]:
# Store the per-row tag count as a new column.
train["no of tags"]=tags_sum

In [9]:
# Distribution of rows by how many tags they carry.
print(train["no of tags"].value_counts())

1    15928
2     4793
3      251
Name: no of tags, dtype: int64


In [10]:
# Combine title and abstract (separated by one space) into a single text column.
train['text'] = train['TITLE']+' '+train['ABSTRACT']

In [11]:
import nltk
# Fetch the NLTK stop-word corpus (no re-download if it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 3. Sentence Preprocessing

In [12]:
from nltk.corpus import stopwords
# English stop words held in a set for fast membership tests during filtering.
stop_words = set(stopwords.words('english'))

In [13]:
def clean_text(x):
    """Lower-case ``x`` and replace each run of non-alphanumeric characters with one space.

    Args:
        x: Input string.

    Returns:
        The lower-cased string in which every maximal run of characters
        outside ``A-Z``, ``a-z`` and ``0-9`` has been replaced by a single
        space. Leading/trailing runs also become a space.
    """
    lowered = x.lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

def text_normalization(text, stopword_set=None):
    """Remove stop words from whitespace-tokenised ``text``.

    Args:
        text: String to filter; it is split on whitespace.
        stopword_set: Optional container of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        The kept words in their original order, each followed by a single
        space. A non-empty result therefore ends with a trailing space, and
        input with no surviving words yields ``""``.

    Note:
        Only stop-word removal is performed; no stemming is applied.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Join once instead of growing a string inside the loop; the
    # "word + space" format of the original output is preserved.
    return "".join(word + " " for word in text.split() if word not in stopword_set)

In [None]:
# Clean and stop-word-filter every combined title/abstract string, then
# overwrite the "text" column with the processed versions.
title = [text_normalization(clean_text(raw_text)) for raw_text in train["text"]]
train["text"] = title

In [17]:
# Preview the frame again, now including the "no of tags" and processed "text" columns.
train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,no of tags,text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,1,reconstructing subject specific effect maps pr...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,1,rotation invariance neural network rotation in...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,1,spherical polyharmonics poisson kernels polyha...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,1,finite element approximation stochastic maxwel...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,2,comparative study discrete wavelet transforms ...


In [23]:
# Re-extract the label matrix (the same six columns used for the tag counts earlier).
y = train[['Computer Science', 'Physics', 'Mathematics','Statistics', 'Quantitative Biology', 'Quantitative Finance']].values

In [25]:
# Inspect the label matrix (NumPy abbreviates the display of large arrays).
y

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0],
       [0, 0, 1, 1, 0, 0]])

## 4. Splitting into Train, Validation and Test Sets

In [26]:
# Processed texts as a NumPy array; this is the feature input to the split below.
list_sentences_train=train['text'].values

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Step 1: hold out 30% of all rows as the test set.
x_tr, x_test, y_tr, y_test = train_test_split(
    list_sentences_train, y, test_size=0.30, random_state=42
)
# Step 2: hold out 30% of the remaining rows as the validation (cv) set,
# leaving roughly 49% train / 21% cv / 30% test overall.
x_train, x_cv, y_train, y_cv = train_test_split(
    x_tr, y_tr, test_size=0.30, random_state=42
)

In [29]:
# Row counts of the train / test / cv text arrays.
x_train.shape,x_test.shape,x_cv.shape

((10276,), (6292,), (4404,))

In [30]:
# Shapes of the matching label arrays for train / test / cv.
y_train.shape,y_test.shape,y_cv.shape

((10276, 6), (6292, 6), (4404, 6))

## 5. Deep Learning Model

### 5.1 Tokenization

In [31]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [32]:
# Keras tokenizer with default settings (no vocabulary size limit is passed).
tokenizer = Tokenizer()

In [33]:
# Build the vocabulary from the training split only; cv and test texts are not passed here.
tokenizer.fit_on_texts(x_train)

In [34]:
# Vocabulary size used for the embedding layer: number of distinct words in
# the fitted tokenizer plus one (presumably to account for index 0, which
# Keras reserves and never assigns to a word — confirm against the Tokenizer docs).
vocab_size=len(tokenizer.word_index)+1
print(vocab_size)

38576


In [36]:
# Fixed sequence length: used as `maxlen` when padding/truncating and as the embedding input length.
max_len=300

### 5.2. Padding and Text to Sequence

In [37]:
# Encode each training text as a list of word indices, then force every
# sequence to exactly `max_len` entries by truncating or padding at the end.
train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_len,
    padding="post",
    truncating="post",
)

In [38]:
# Encode the validation texts with the tokenizer fitted on the training split,
# then truncate or pad each sequence at the end to exactly `max_len` entries.
cv_sequences = tokenizer.texts_to_sequences(x_cv)
cv_padded = pad_sequences(
    cv_sequences,
    maxlen=max_len,
    padding="post",
    truncating="post",
)

In [39]:
# Encode the test texts with the tokenizer fitted on the training split,
# then truncate or pad each sequence at the end to exactly `max_len` entries.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_len,
    padding="post",
    truncating="post",
)

In [41]:
# Re-check the label array shapes before training.
y_train.shape,y_test.shape,y_cv.shape

((10276, 6), (6292, 6), (4404, 6))

### 5.3. Training the Deep Learning Model

In [47]:
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [52]:
embedding_vector_features = 100

# Architecture: embedding -> two stacked LSTMs -> max pooling over the time
# axis -> small dense head. The output layer has one sigmoid unit per label
# column of `y`, so every tag gets its own independent score.
model = Sequential([
    Embedding(vocab_size, embedding_vector_features, input_length=max_len),
    LSTM(64, dropout=0.2, return_sequences=True),
    # return_sequences=True keeps the per-timestep outputs that the pooling layer reduces.
    LSTM(128, dropout=0.2, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [53]:
# Train on the training split; the cv split is passed as validation data
# and is evaluated after each epoch.
history = model.fit(
    train_padded,
    y_train,
    epochs=15,
    batch_size=32,
    validation_data=(cv_padded, y_cv),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### 5.4. Testing Model

In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Predict per-label scores for the test set, then round them to 0/1 label decisions.
y_prob = model.predict(test_padded)
y_pred = np.round(y_prob).astype(int)

# Micro-averaged F1 on the held-out test set.
test_f1 = f1_score(y_test, y_pred, average='micro')
print("F1-Score on test set: %0.3f"%(test_f1))

F1-Score on test set: 0.765


In [55]:
from sklearn.metrics import jaccard_score

In [57]:
# Micro-averaged Jaccard score on the same test predictions.
jaccard_score(y_test, y_pred,average='micro')

0.618934544703696