# HackLive 3: Guided Hackathon - NLP (Analytics Vidhya)

<h2> Performance metric </h2>

<h4> Micro F1 score </h4>

## Importing the data and necessary libraries

In [None]:
import re
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the training set; the `id` column is not used as a feature, so it is
# dropped right away.
df = pd.read_csv('../input/training-set/Train.csv').drop(columns='id')
df.head()

In [None]:
# Show (n_rows, n_columns) of the training frame after `id` has been dropped.
print(df.shape)

<h3> There are 14004 research papers (rows) in total. The ABSTRACT column gives the gist of each research paper; the columns Computer Science, Mathematics, Physics and Statistics give its primary domain; and the remaining 25 columns are the target columns (labels). </h3>

<h4> Above is a sample of the abstracts of the research papers </h4>

In [None]:
# Load the test set; predictions for these rows are written to the submission
# file at the end of the notebook.
df_test = pd.read_csv('../input/test-set/Test.csv')
df_test.head()

In [None]:
# The 25 label columns to predict (one binary indicator per topic).
TARGET_COLS = [
    'Analysis of PDEs',
    'Applications',
    'Artificial Intelligence',
    'Astrophysics of Galaxies',
    'Computation and Language',
    'Computer Vision and Pattern Recognition',
    'Cosmology and Nongalactic Astrophysics',
    'Data Structures and Algorithms',
    'Differential Geometry',
    'Earth and Planetary Astrophysics',
    'Fluid Dynamics',
    'Information Theory',
    'Instrumentation and Methods for Astrophysics',
    'Machine Learning',
    'Materials Science',
    'Methodology',
    'Number Theory',
    'Optimization and Control',
    'Representation Theory',
    'Robotics',
    'Social and Information Networks',
    'Statistics Theory',
    'Strongly Correlated Electrons',
    'Superconductivity',
    'Systems and Control',
]


<h3> Text preprocessing </h3>

In [None]:
# Stop-word removal is not done in this step (NLTK could be used for that).

# Single-pass translation table: every listed punctuation mark is deleted,
# except that / - ' become a space and & is padded with spaces.
_PUNCT_TABLE = {ord(ch): None for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’' + '…'}
_PUNCT_TABLE.update({ord(ch): ' ' for ch in "/-'"})
_PUNCT_TABLE[ord('&')] = ' & '


def remove_punctuations(x):
    """Return ``str(x)`` with punctuation stripped.

    ``/``, ``-`` and ``'`` are turned into spaces, ``&`` is surrounded by
    spaces, and the remaining ASCII punctuation plus curly quotes and the
    ellipsis character are removed. Non-string input is converted with
    ``str`` first.
    """
    return str(x).translate(_PUNCT_TABLE)

In [None]:
def clean_numbers(x):
    """Mask runs of ASCII digits with ``#`` characters.

    A run of 2, 3 or 4 digits becomes the same number of ``#``; a run of five
    or more digits becomes exactly ``#####``. Single digits are left as-is.
    """
    def _mask(match):
        # Cap the mask at five characters for long numbers.
        return '#' * min(len(match.group(0)), 5)

    return re.sub('[0-9]{2,}', _mask, x)

In [None]:
# Generic spelling / abbreviation normalisation reused across projects, so some
# entries may never occur in the abstracts.
#
# Each rule is (regex pattern, replacement) and rules run in the listed order.
# Most patterns deliberately match inside longer words so inflected forms are
# covered ('colours' -> 'colors'). Two kinds of rule are guarded because the
# bare substring corrupts ordinary words:
#   * the short abbreviations btech / Btech / bcom / mtech are anchored with
#     \b ('bcom' inside 'subcomponent' used to yield 'suB.Component');
#   * 'programme' is not rewritten when followed by 'd' ('programmed' used to
#     become 'programd').
# 'programr' -> 'programmer' repairs the 'programmer' -> 'programr' side effect
# of the 'programme' rule; a separate 'programrs' rule could never match after
# it and is therefore omitted.
_SPELLING_RULES = [(re.compile(_pattern), _replacement) for _pattern, _replacement in (
    ('colour', 'color'),
    ('centre', 'center'),
    ('didnt', 'did not'),
    ('doesnt', 'does not'),
    ('isnt', 'is not'),
    ('shouldnt', 'should not'),
    ('favourite', 'favorite'),
    ('travelling', 'traveling'),
    ('counselling', 'counseling'),
    ('theatre', 'theater'),
    ('cancelled', 'canceled'),
    ('labour', 'labor'),
    ('organisation', 'organization'),
    ('wwii', 'world war 2'),
    ('citicise', 'criticize'),
    ('instagram', 'social medium'),
    ('whatsapp', 'social medium'),
    ('WeChat', 'social medium'),
    ('snapchat', 'social medium'),
    ('Snapchat', 'social medium'),
    (r'\bbtech\b', 'B.Tech'),
    ('Quorans', 'Quora'),
    ('cryptocurrency', 'crypto currency'),
    ('cryptocurrencies', 'crypto currency'),
    ('behaviour', 'behavior'),
    ('analyse', 'analyze'),
    ('licence', 'license'),
    (r'programme(?!d)', 'program'),
    ('grey', 'gray'),
    ('realise', 'realize'),
    (r'\bbcom\b', 'B.Com'),
    ('defence', 'defense'),
    (r'\bmtech\b', 'M.Tech'),
    (r'\bBtech\b', 'B.Tech'),
    ('honours', 'honors'),
    ('recognise', 'recognize'),
    ('programr', 'programmer'),
    ('hasnt', 'has not'),
    ('litre', 'liter'),
    ('Isnt', 'is not'),
    ('learnt', 'learn'),
    ('favour', 'favor'),
    ('neighbour', 'neighbor'),
    ('demonetisation', 'demonetization'),
    ('₹', ''),
    ('&', 'and'),
)]


def misspelled_words(x):
    """Normalise British spellings, a few contractions and abbreviations.

    Parameters
    ----------
    x : str
        Text to normalise (matching is case-sensitive).

    Returns
    -------
    str
        ``x`` after applying every rule in ``_SPELLING_RULES`` in order. The
        rupee sign is removed and ``&`` is replaced by ``and``.
    """
    for pattern, replacement in _SPELLING_RULES:
        x = pattern.sub(replacement, x)
    return x

In [None]:
# Run the three cleaning steps, in this order, over the ABSTRACT column of the
# training frame and then of the test frame.
for frame in (df, df_test):
    for step in (remove_punctuations, clean_numbers, misspelled_words):
        frame["ABSTRACT"] = frame["ABSTRACT"].apply(step)

<h3> Splitting the data into train and validation (80:20) </h3>

In [None]:
# Hold out 20% of the rows of `df` as a validation set; random_state=0 makes
# the split reproducible.
train, val = train_test_split(df, test_size=0.2, random_state=0)
train.shape, val.shape

<h3> Vectorizing train, validation and test dataset using Tfidf vectorizer</h3>

In [None]:
# Unigram + bigram TF-IDF features with English stop words removed and accents
# stripped; a term is kept only if it appears in at least 3 documents.
tfidfvec = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1, 2), strip_accents='unicode', stop_words='english')

# Learn the vocabulary and IDF weights from the training split only. Fitting on
# the full frame would let the validation abstracts influence the features and
# make the validation score optimistic.
train_vec = tfidfvec.fit_transform(train['ABSTRACT'])
val_vec = tfidfvec.transform(val['ABSTRACT'])
test_vec = tfidfvec.transform(df_test['ABSTRACT'])
train_vec.shape, val_vec.shape, test_vec.shape

<h5> After vectorizing, we stack the remaining 4 features onto the TF-IDF matrix and keep the result in CSR (sparse) format. A dense NumPy array of the same data would not fit in RAM, so it is important to pass the data to the model in CSR format. </h5>

In [None]:
# The four primary-domain indicator columns appended to the text features.
DOMAIN_COLS = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']


def _append_domain_flags(text_vec, frame):
    """Stack ``frame[DOMAIN_COLS]`` to the right of ``text_vec`` as a float64 CSR matrix."""
    return hstack((text_vec, frame[DOMAIN_COLS]), format="csr", dtype='float64')


train_data = _append_domain_flags(train_vec, train)
val_data = _append_domain_flags(val_vec, val)
test_data = _append_domain_flags(test_vec, df_test)
train_data.shape, val_data.shape, test_data.shape

<h3> Using Grid search to find best hyperparameters </h3>
<h5> Note: Since there is only a single hyperparameter to tune, I used GridSearchCV. If there are more hyperparameters, it is wise to choose RandomizedSearchCV. </h5>

In [None]:
# Candidate values for C: 0.01, 0.1, 1, 10, 100.
c_grid = [10 ** power for power in range(-2, 3)]
parameters = {'estimator__C': c_grid}

# One logistic regression per target column, wrapped in a one-vs-rest model.
estimator = OneVsRestClassifier(LogisticRegression(max_iter=500, n_jobs=-1))

# 5-fold grid search scored with micro-averaged F1. refit=False: only the
# winning C is read back here; no final model is fitted by the search itself.
model = GridSearchCV(estimator, parameters, scoring='f1_micro', cv=5, n_jobs=-1, refit=False)
model.fit(train_data, train[TARGET_COLS])

best_C = model.best_params_['estimator__C']
print('The best value of C is', best_C)

<h3> Applying ML model using best hyperparameter and predicting on validation data </h3>

In [None]:
# Train the final one-vs-rest model with the C selected by the grid search.
base_lr = LogisticRegression(C=best_C, max_iter=500, n_jobs=-1)
clf = OneVsRestClassifier(base_lr)
clf.fit(train_data, train[TARGET_COLS])

# Micro-F1 on the validation split using the classifier's default decisions.
pred = clf.predict(val_data)
f1_score(val[TARGET_COLS], pred, average='micro')

In [None]:
# Per-label threshold search: for every label, pick the probability cut-off
# that gives the best F1 score on the supplied data.
def get_best_thresholds(true, preds):
    """Find, for each label column, the probability threshold with the best F1.

    Parameters
    ----------
    true : array-like of shape (n_samples, n_labels)
        Binary ground-truth indicator matrix.
    preds : array-like of shape (n_samples, n_labels)
        Predicted probabilities, column-aligned with ``true``.

    Returns
    -------
    list of float
        One threshold per label column, chosen from 0.00, 0.01, ..., 0.99.
        A sample counts as positive when its probability is strictly greater
        than the threshold. On ties the lowest threshold wins.

    Raises
    ------
    ValueError
        If ``true`` and ``preds`` are not 2-D arrays of the same shape.
    """
    true = np.asarray(true)
    preds = np.asarray(preds)
    if true.ndim != 2 or true.shape != preds.shape:
        raise ValueError(
            f"true and preds must be 2-D with the same shape, got {true.shape} and {preds.shape}"
        )

    thresholds = [i / 100 for i in range(100)]
    best_thresholds = []
    # Iterate over however many label columns were passed in rather than
    # assuming a fixed count.
    for idx in range(true.shape[1]):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1) for thresh in thresholds]
        # np.argmax returns the first maximum, i.e. the lowest tied threshold.
        best_thresholds.append(thresholds[np.argmax(f1_scores)])
    return best_thresholds

In [None]:
# Class probabilities for the validation split and the per-label thresholds
# derived from them.
val_preds = clf.predict_proba(val_data)
best_thresholds = get_best_thresholds(val[TARGET_COLS].values, val_preds)

# Binarise every column against its own threshold in one broadcast comparison;
# the result keeps the probability array's dtype (values 0.0 / 1.0).
val_preds = (val_preds > np.asarray(best_thresholds)).astype(val_preds.dtype)
f1_score(val[TARGET_COLS], val_preds, average='micro')

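<h4> As you can see above, the F1 score after finding the optimal thresholds has improved drastically, from 0.73 to 0.78. Note that the thresholds were tuned on this same validation set, so the 0.78 figure is an optimistic estimate.
Such improvements can lead to the creation of more real-life-based ML models. </h4>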

<h3> Submitting the predictions </h3>

In [None]:
# Fill the sample-submission template with thresholded test-set predictions.
# NOTE(review): assumes the rows of submission.csv are in the same order as
# Test.csv — confirm.
ss = pd.read_csv('../input/topic-modeling-for-research-articles-20/submission.csv')
preds_test = clf.predict_proba(test_data)

# Apply the per-label thresholds found on the validation split.
for i, thresh in enumerate(best_thresholds):
    preds_test[:, i] = (preds_test[:, i] > thresh) * 1

# The thresholded array is still floating point; cast so the labels are written
# as 0/1 rather than 0.0/1.0.
ss[TARGET_COLS] = preds_test.astype(int)

# Write with an explicit .csv extension so the output is recognised as a CSV file.
ss.to_csv('submission_hacklive_nlp.csv', index=False)