**Importing the required libraries.**

In [None]:
!pip install texthero

In [None]:
import pandas as pd
import numpy as np
import texthero as hero
from texthero import preprocessing as ppe
from texthero import visualization as viz
import spacy
from spacy import displacy
import re
import warnings
warnings.filterwarnings("ignore")
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

**Reading the training and test dataset**

In [None]:
# Read the training split from the competition input directory and preview it.
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
train_df.head()

In [None]:
# Read the test split from the competition input directory and preview it.
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
test_df.head()

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
train_df.info()

In [None]:
# Summarise the test frame's columns, dtypes and non-null counts.
test_df.info()

# Class Distribution

In [None]:
# Outlined (transparent-fill) bar chart of how many training rows carry each target value.
class_ax = sns.countplot(
    data=train_df,
    x='target',
    facecolor=(0, 0, 0, 0),
    edgecolor=sns.color_palette("dark", 3),
    linewidth=5,
)
class_ax.set_xlabel('Class Names')
class_ax.set_ylabel('Count')
class_ax.set_title('Distribution of classes in the training dataset')
plt.show()

# Location column value distribution

In [None]:
# Ten most frequent values of the `location` column, drawn as horizontal bars.
top_locations = train_df['location'].value_counts()[:10]

plt.figure(figsize=(10,5))
sns.barplot(
    x=top_locations,
    y=top_locations.index,
    orient='h',
    facecolor=(0, 0, 0, 0),
    linewidth=3,
    edgecolor=sns.color_palette("dark", 3),
)
plt.xlabel('Location Count')
plt.title('Top 10 locations with the maximum occurence in the training dataset')
plt.show()

# Keyword column value distribution

In [None]:
# Ten most frequent values of the `keyword` column, drawn as horizontal bars.
top_keywords = train_df['keyword'].value_counts()[:10]

plt.figure(figsize=(10,5))
sns.barplot(
    x=top_keywords,
    y=top_keywords.index,
    orient='h',
    facecolor=(0, 0, 0, 0),
    linewidth=2,
    edgecolor=sns.color_palette("dark", 3),
)
plt.xlabel('Keyword Count')
plt.title('Top 10 keywords with the maximum occurence in the training dataset')
plt.show()

In [None]:
# Ten most frequent keywords in the test split.
test_df['keyword'].value_counts()[:10]

In [None]:
# Ten most frequent locations in the test split.
test_df['location'].value_counts()[:10]

**Dropping the Keyword and location columns from both training and test set.**

In [None]:
# Drop by reassignment rather than in place, and tolerate already-missing
# columns, so re-running this cell is a no-op instead of a KeyError.
train_df = train_df.drop(columns=['keyword', 'location'], errors='ignore')
train_df.head()

In [None]:
# Drop by reassignment rather than in place, and tolerate already-missing
# columns, so re-running this cell is a no-op instead of a KeyError.
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')
test_df.head()

Getting the word count of the text column

In [None]:
# Count whitespace-separated tokens per tweet. Applying `len` to the raw
# string would give the number of characters, not the number of words.
train_df['word count'] = train_df['text'].str.split().str.len()
train_df.head()

The **describe()** method is used for calculating statistical data like percentile, mean and std of the numerical values of the Series or DataFrame. We're using this method below on the word count column.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the 'word count' column.
train_df['word count'].describe()

# Most common and uncommon words in the text column

In [None]:
#most common words
freq = pd.Series(''.join(train_df['text']).split()).value_counts()[:10]
freq

In [None]:
#uncommon words
not_freq = pd.Series(''.join(train_df['text']).split()).value_counts()[-10:]
not_freq

# Data Preprocessing
For data pre-processing, we're mainly gonna use the awesome **TextHero** library. Under the hood, Texthero builds on various NLP and machine-learning toolkits such as **Gensim, NLTK, SpaCy** and **scikit-learn**. We're mainly gonna use the pre-processing toolkit of this library. 

So, with this library, we can create a custom pipeline in which we list various text-cleaning steps such as **removing whitespace, stop words, punctuation**, etc., and then apply it to the text column.

In [None]:
# Order matters:
#  * URLs are removed BEFORE punctuation; stripping punctuation first breaks a
#    URL into separate fragments that the URL remover can no longer match.
#  * Whitespace is normalised LAST, because the earlier removal steps can
#    leave extra spaces behind.
custom_pipeline = [
    ppe.fillna,
    ppe.lowercase,
    ppe.remove_urls,
    ppe.remove_punctuation,
    ppe.remove_stopwords,
    ppe.remove_digits,
    ppe.remove_whitespace,
]

# Apply the same cleaning steps to both splits so their vocabularies match.
train_df['cleaned_text'] = hero.clean(train_df['text'], custom_pipeline)
test_df['cleaned_text'] = hero.clean(test_df['text'], custom_pipeline)

In [None]:
# Preview the training frame with the new 'cleaned_text' column.
train_df.head()

In [None]:
# Preview the test frame with the new 'cleaned_text' column.
test_df.head()

# Lemmatization
We use a lemmatizer to convert words into their root form (lemma). For example, if the data contains two forms of the same word, one in the past tense (**believed**) and another as a present participle (**believing**), both will be converted to the same root word (**believe**). 

This particularly helps us while training our model, as the model doesn't need to learn 2 different words which basically have the same meaning.

In [None]:
# Built once: the stop-word set and the lemmatizer do not change between
# calls, so rebuilding them for every token / every row is wasted work.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_WORDNET_LEMMATIZER = WordNetLemmatizer()


def lemmatizer(r):
    """Lemmatize the tokens of a text and drop English stop words.

    Parameters
    ----------
    r : str
        Text to process; it is tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    str
        The tokens that are not NLTK English stop words, each passed through
        ``WordNetLemmatizer.lemmatize`` (default part of speech), joined by
        single spaces.
    """
    tokens = nltk.word_tokenize(r)
    return " ".join(
        _WORDNET_LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    )

# Lemmatize the cleaned text of both splits with the same function.
for frame in (train_df, test_df):
    frame['lemma_cleaned_text'] = frame['cleaned_text'].apply(lemmatizer)

# Special Character removal
We're gonna use the below code to remove special characters like @, #, $ , etc. 
This step is really important as these characters can really hamper our model's performance while training.


In [None]:
def remove_special_characters(text):
    """Return ``text`` with every character that is not an ASCII letter replaced by a space."""
    return re.sub(r'[^a-zA-Z]', ' ', text)

# Strip non-letter characters from the lemmatized text of both splits.
for frame in (train_df, test_df):
    frame['special_char_cleaned_text'] = frame['lemma_cleaned_text'].apply(remove_special_characters)

In [None]:
# Preview the training frame with the 'special_char_cleaned_text' column added.
train_df.head()

In [None]:
# Preview the test frame with the 'special_char_cleaned_text' column added.
test_df.head()

# WordCloud 
This Wordcloud displays the most frequent words in the training dataset.

In [None]:
# Concatenate every cleaned training tweet into one corpus string and build the cloud from it.
training_corpus = ' '.join(train_df['special_char_cleaned_text'])
wordcloud = WordCloud().generate(training_corpus)

In [None]:
# Draw the word cloud on a single 10x5 figure. An extra plt.figure(1) here
# would only add an empty figure to the output, and printing the WordCloud
# object would only show its repr, so neither is done.
plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Named Entity Recognition (NER)
Named-entity recognition is a subtask of information extraction that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc. We're using the **SpaCy** library for that.

In [None]:
nlp = spacy.load('en_core_web_sm')

# Collect one flat record per recognised entity, then build the DataFrame
# once. Rebuilding the frame inside the loop for every entity is quadratic,
# and leaves `df1` undefined when no entity is found at all.
ner_records = [
    {
        'Text': ent.text,
        'NER Label': ent.label_,
        'Label explaination': str(spacy.explain(ent.label_)),
    }
    # nlp.pipe streams the texts through the pipeline in batches.
    for doc in nlp.pipe(train_df['special_char_cleaned_text'])
    for ent in doc.ents
]

# Explicit columns keep the schema stable even if there are no records.
df1 = pd.DataFrame(ner_records, columns=['Text', 'NER Label', 'Label explaination'])
            

**Visualizing the NER labels and the frequency of their occurrence**

In [None]:
# Frequency of each NER label among the extracted entities, as horizontal bars.
ner_label_counts = df1['NER Label'].value_counts()

plt.figure(figsize=(10,5))
sns.barplot(
    x=ner_label_counts,
    y=ner_label_counts.index,
    orient='h',
    facecolor=(0, 0, 0, 0),
    linewidth=2,
    edgecolor=sns.color_palette("dark", 11),
)
plt.xlabel('NER Label Count')
plt.title('NER Labels and frequency of their occurence')
plt.show()

# Vectorization
In basic terms, vectorization is the **process of converting text into a numerical representation**, also called an **embedding**. 
Since computers are not as intelligent as us (at least not yet ;), they can't understand textual data directly, so to make our data understandable to a computer, we first convert it to a numerical format.

There are various techniques for text vectorization like:-
* **Bag of Words**
* **Count Vectorizer**
* **TF-IDF Vectorizer**

Here, we're gonna use the **TF-IDF Vectorizer** approach. 
So, TF-IDF is an acronym for **Term Frequency - Inverse Document Frequency**. 
* TF makes sure to give high score to the word that appears frequently.
* IDF makes sure to give low score to the word if it appears pretty frequently in documents (not a unique identifier).

So, the amalgamation of **TF * IDF** is how the score is calculated for this vectorizer.

In [None]:
# Fit the TF-IDF vocabulary on the training text only, then reuse the fitted
# vectorizer to encode the test text with that same vocabulary.
final_text_col = 'special_char_cleaned_text'
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(train_df[final_text_col])
test_tfidf = tfidf_vectorizer.transform(test_df[final_text_col])

# Training our model
So, here we're gonna use the **XGBClassifier**, which basically harnesses the power of **boosting trees**. 
So, boosting trees are a little different from your normal decision trees. A plain decision tree is a single model that makes the prediction on its own, whereas boosting takes a smarter approach to training a model efficiently: rather than training all of the models in isolation from one another, boosting trains models in succession, with each new model being trained to correct the errors made by the previous ones. 
Models are added sequentially until no further improvements can be made.

The main advantage of this iterative approach is that the new models being added are focused on correcting the mistakes which were caused by other models. In a standard ensemble method, like random forest,  where models are trained in isolation, all of the models might simply end up making the same mistakes.

Since we have imbalanced data in our classes, we're gonna use **scale_pos_weight**. So, here we have defined a definite set of weights which we're gonna use in this hyper parameter.

Then, we'll be using **RepeatedStratifiedKFold** method which is going to repeat Stratified K-Fold 3 times with different randomization in each repetition. We're putting number of splits as 5, so it'll be a 5 fold cross validation.

After that, we're using **GridSearchCV** to iterate through the weights mentioned above and any other hyperparameters. We're using **ROC AUC** (the area under the ROC curve) to score our model's performance. ROC AUC measures the ability of a classifier to distinguish between classes: the higher the AUC, the better the model is at distinguishing one class from another.

Lastly, we're summarizing the best configuration and printing them out.

In [None]:
# Candidate positive-class weights to try for the XGBoost classifier.
weights = [1, 10, 15, 20]
param_grid = {'scale_pos_weight': weights}
model = XGBClassifier()

# 5-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# Grid search scored by ROC AUC, using all available cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid_result = grid.fit(train_tfidf, train_df.target)

# Best configuration first, then the mean/std score of every configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

**Predicting on the test set**

In [None]:
# Predict a label for every TF-IDF-encoded test tweet with the fitted grid search.
pred = grid_result.predict(test_tfidf)

**Final Output :)**

In [None]:
sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

# Ids expected in the submission, in file order.
submission_ids = sample['id'].unique()

# Pairing ids and predictions with zip() would silently drop the tail of the
# longer sequence; fail loudly rather than write a misaligned submission.
if len(submission_ids) != len(pred):
    raise ValueError(
        "Number of predictions (%d) does not match number of submission ids (%d)"
        % (len(pred), len(submission_ids))
    )

# Build the frame in one step instead of appending row by row.
output = pd.DataFrame({"id": submission_ids, "target": pred}, columns=["id", "target"])
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame built above.
output

# References:-
Some of the literature and learnings are borrowed from below sources. Feel free to check them out :)
* https://machinelearningmastery.com/xgboost-for-imbalanced-classification/
* https://towardsdatascience.com/a-beginners-guide-to-xgboost-87f5d4c30ed7
* https://texthero.org/docs/getting-started#preprocessing
* https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

# If you like my work,  don't forget to upvote ;)