In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Notebooks that I Adapted:

* https://www.kaggle.com/dineshkumaranbalagan/descriptive-analysis

* https://www.kaggle.com/mlconsult/score-57ish-with-additional-govt-datasets

* https://www.kaggle.com/armandmorin/show-us-data


# Introduction

## Goal

* The end goal is to do string matching of known datasets names in order to detect mentions of datasets in scientific publications.
* To build a strong NLP model that can infer from context whether or not a piece of text in a publication is referring to the usage of a dataset.

# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import time
import datetime

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import re
import json
from tqdm.autonotebook import tqdm
import string
import collections
from textblob import TextBlob

import spacy

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import utils
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils.vis_utils import plot_model

import warnings
warnings.filterwarnings('ignore')

# Data Description

         - train.csv- CSV file contains metadata of the publications
         - train-JSON file contains publications that are referenced in train.csv
         - test-CSV file contains publications for testing purpose
         - sample_submission.csv-CSV file contains the publication IDs column and the prediction column

**id** - publication id - note that there are multiple rows for some training documents, indicating multiple mentioned datasets.

**pub_title** -title of the publication (a small number of publications have the same title).

**dataset_title** -the title of the dataset that is mentioned within the publication.

**dataset_label** -a portion of the text that indicates the dataset.

**cleaned_label** -the dataset_label, as passed through the clean_text function from the Evaluation page.

**PredictionString** -To be filled with equivalent of cleaned_label of train data

# OBTAIN

### Train Set

In [None]:
#define paths (every location is derived from one root so they cannot drift apart)
DATA_DIR = '../input/coleridgeinitiative-show-us-the-data'
train_path = os.path.join(DATA_DIR, 'train')
test_path = os.path.join(DATA_DIR, 'test')

#list the competition files; kept as the last expression so the notebook actually displays it
os.listdir(DATA_DIR)

In [None]:
#read train data
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
train_df.head()

In [None]:
#create a function to get the text from the JSON file and append it to the new column in table
def read_json_pub(filename, train_path = train_path, output = 'text'):
    '''
    Load one publication JSON file and flatten its sections into a single string.

    filename: string - publication Id, i.e. the JSON file name without the '.json' extension
    train_path: string - directory that holds the publication JSON files
    output: string - 'text' returns the section bodies joined by spaces,
                     'head' returns the section titles joined by spaces,
                     anything else returns titles and bodies interleaved, joined by '. '

    A section whose 'section_title' or 'text' is missing or null contributes an empty
    string, so str.join no longer fails on None.
    '''
    json_path = os.path.join(train_path, (filename + '.json'))

    #explicit encoding so the result does not depend on the platform default
    with open(json_path, 'r', encoding = 'utf-8') as f:
        sections = json.load(f)

    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    #interleave title, body, title, body, ... in document order
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)

In [None]:
#apply the function to train data
#an Id appears once per dataset mention, so read each JSON file only once and map the text back onto every row
tqdm.pandas()
text_by_id = {pub_id: read_json_pub(pub_id) for pub_id in tqdm(train_df['Id'].unique())}
train_df['text'] = train_df['Id'].map(text_by_id)

In [None]:
#recheck
train_df.head()

### Read Submission Data

In [None]:
#read submission data
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

#apply the function to submission data
#the submission Ids are test publications, so read them from test_path instead of the train_path default
tqdm.pandas()
submission_df['text'] = submission_df['Id'].progress_apply(
    lambda pub_id: read_json_pub(pub_id, train_path = test_path))

submission_df.head()

In [None]:
#save
submission_df.to_csv('submission_df.csv')

### Read Samples

In [None]:
#let's read the first sample
#load one training publication straight from its JSON file to inspect the raw structure
#presumably a list of section dicts with 'section_title' and 'text' keys — confirm against the output

import json
with open('../input/coleridgeinitiative-show-us-the-data/train/d0fa7568-7d8e-4db9-870f-f9c6f668c17b.json') as f:
    sample = json.load(f)
    
#show only the first two entries to keep the output short
sample[:2]

Within the first section, this publication mentions that they used data from the National Education Longitudinal Study. So the task of this competition is to find string of 'dataset_title' within the 'text' body and return 'cleaned_label'.

In [None]:
#get all 'section_title'
for s in sample:
    print(s['section_title'])

# SCRUB

### Basic Text Cleaning

Before we can create a bag of words or vectorize each document, we need to clean it up and split each document into an array of individual words. Computers are very particular about strings. If we tokenized our data in its current state, we would run into the following problems:

* Counting things that aren't actually words. 
* Punctuation and capitalization would mess up our word counts. We need to remove punctuation and capitalization, so that all words will be counted correctly.

In [None]:
#define stopwords
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [None]:
#https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d

def text_cleaning(text, flg_stemm = False, flg_lemm = True, lst_stopwords = None):
    '''
    Normalise one piece of text: lower-case it, drop every character that is neither a
    word character nor whitespace, optionally remove stopwords, optionally stem and/or
    lemmatise each token, then join the tokens back together with single spaces.
    
    text: string - the text to clean (any other value is converted with str() first)
    lst_stopwords: list - stopwords to remove; None (default) keeps every token
    flg_stemm: bool - whether stemming (PorterStemmer) is to be applied
    flg_lemm: bool - whether lemmatisation (WordNetLemmatizer) is to be applied
    
    returns: string - the cleaned text
    '''
    
    #clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    #tokenize (convert from string to list); split() also collapses repeated whitespace
    lst_text = text.split()
    
    #remove Stopwords
    #filter with the caller's list: the argument used to be ignored in favour of the global stopwords_list
    if lst_stopwords is not None:
        stop_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stop_set]
                
    #stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    #lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    #back to string from list
    return " ".join(lst_text)

In [None]:
#clean pub_title text
tqdm.pandas()
train_df['pub_title'] = train_df['pub_title'].progress_apply(text_cleaning)

In [None]:
#clean dataset_title text
tqdm.pandas()
train_df['dataset_title'] = train_df['dataset_title'].progress_apply(text_cleaning)

In [None]:
#clean train text
tqdm.pandas()
train_df['text'] = train_df['text'].progress_apply(text_cleaning)

In [None]:
#review
train_df.head()

In [None]:
#save
train_df.to_csv('train_df.csv')

# EXPLORE

In [None]:
#get info
train_df.info()

In [None]:
#check null values
train_df.isnull().sum()

In [None]:
#get summary
train_df.describe()

The Train dataset has 19,661 counts but only 14,316 unique 'Id' in the dataset. This means some 'Id' are duplicates, meaning some 'Id' use multiple datasets.

The 'pub_title' has 19,661 counts but has only 14,271 unique titles. This means some 'pub_title' values are duplicates. There are fewer unique 'pub_title' values than unique 'Id' values, meaning some publication titles are shared by multiple 'Id's.

The 'dataset_title' has 19,661 counts but has only 45 unique titles. This means some 'dataset_title' are used many times by different publications.

The 'dataset_label' has 19,661 counts but has only 130 unique labels. This means some 'dataset_label' values are duplicates. There are fewer unique 'dataset_title' values (45) than unique 'dataset_label' values (130), meaning some datasets are labeled differently by different publications.

In [None]:
#report, column by column, how many rows repeat a value already seen earlier in that column
for column in ['Id', 'pub_title', 'dataset_title', 'dataset_label', 'cleaned_label']:
    print('Number of duplicates in ' + column + ':', train_df[column].duplicated().sum())

In [None]:
#check out duplicates
id_duplicates = train_df['Id'] == '170113f9-399c-489e-ab53-2faf5c64c5bc'
train_df.loc[id_duplicates][:10]

The same 'Id' and 'pub_title' of sample 14798 and 14799 are labeled differently as 'survey of earned doctorates' and 'national center for science and engineering' although they are the same dataset_title.

In [None]:
#check out duplicates
pub_title_duplicates = train_df['pub_title'] == 'science and engineering indicator 2014'
train_df.loc[pub_title_duplicates][:10]

'survey of science and engineering research' dataset_title is labeled differently as 'survey of science and engineering research' and 'national center for science and engineering'

In [None]:
#check out duplicates
dataset_title_duplicates = train_df['dataset_title'] == 'alzheimers disease neuroimaging initiative adni'
train_df.loc[dataset_title_duplicates][:10]

Here we see that samples 1450 and 12456 have the same 'text', the same 'Id' and the same 'pub_title', but are labeled differently.

## Look At Dataset Metrics

In [None]:
def get_num_words_per_sample(sample_texts):
    """Compute the median whitespace-delimited word count over a collection of texts.

    # Arguments
        sample_texts: iterable of str, one entry per sample.

    # Returns
        The result of np.median over the per-sample word counts.
    """
    word_counts = list(map(lambda sample: len(sample.split()), sample_texts))
    return np.median(word_counts)

print('dataset_title median word count:', get_num_words_per_sample(train_df['dataset_title']))
print('cleaned_label median word count:', get_num_words_per_sample(train_df['cleaned_label']))
print('text median word count:', get_num_words_per_sample(train_df['text']))

In [None]:
#calculate the number of samples/number of words per sample ratio
len(train_df['dataset_title']) / get_num_words_per_sample(train_df['dataset_title'])

## Look At Each Feature Individually

### 'Id'

In [None]:
#top 20 publication Ids by number of rows
plt.figure(figsize = (30, 20))

sns.countplot(y = train_df['Id'], 
              order = train_df['Id'].value_counts(ascending = False)[:20].index, 
              palette = 'Spectral')
plt.ylabel('Id',fontsize = 20)
plt.title('Id')

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('Id.png')
plt.show()

### 'pub_title'

In [None]:
train_df['pub_title'].unique()

In [None]:
train_df['pub_title'].value_counts().head(10).to_frame()

In [None]:
#create a frequency distribution to see which words are used the most
words = list( train_df['pub_title'].values)
#NOTE: this rebinds `stopwords`, the name imported from nltk.corpus earlier, to a plain list
stopwords = stopwords_list

#tokenise each title on whitespace and keep only the non-stopword tokens
split_words = [[token for token in str(title).split() if token not in stopwords]
               for title in words]

#flatten the per-title token lists into one list
allwords = []
for tokens in split_words:
    allwords.extend(tokens)

#get 100 most common words
mostcommon = FreqDist(allwords).most_common(100)
mostcommon

In [None]:
#plot frequency distributions
#size each word by its actual count: generate(str(mostcommon)) tokenised the repr of the list,
#in which every word occurs exactly once, so the cloud did not reflect the frequencies
wordcloud = WordCloud(width = 1600, height = 800, 
                      background_color = 'black', 
                      colormap = 'Spectral', 
                      stopwords = stopwords_list).generate_from_frequencies(dict(mostcommon))

fig = plt.figure(figsize = (20, 10), facecolor = 'white')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
#this cloud is built from pub_title words, so the title names pub_title (it used to say dataset_title)
plt.title('Top 100 Most Common Words in pub_title', fontsize = 30)
plt.tight_layout()

#save
plt.savefig('pub_title_wordcloud.png')

### 'dataset_title'

In [None]:
train_df['dataset_title'].unique()

In [None]:
train_df['dataset_title'].value_counts().head(20).to_frame()

In [None]:
#create a frequency distribution to see which words are used the most
words = list( train_df['dataset_title'].values)
#NOTE: this rebinds `stopwords`, the name imported from nltk.corpus earlier, to a plain list
stopwords = stopwords_list

#tokenise each title on whitespace and keep only the non-stopword tokens
split_words = [[token for token in str(title).split() if token not in stopwords]
               for title in words]

#flatten the per-title token lists into one list
allwords = []
for tokens in split_words:
    allwords.extend(tokens)

#get 100 most common words
mostcommon = FreqDist(allwords).most_common(100)
mostcommon

In [None]:
#plot frequency distributions
#size each word by its actual count: generate(str(mostcommon)) tokenised the repr of the list,
#in which every word occurs exactly once, so the cloud did not reflect the frequencies
wordcloud = WordCloud(width = 1600, height = 800, 
                      background_color = 'black', 
                      colormap = 'Spectral', 
                      stopwords = stopwords_list).generate_from_frequencies(dict(mostcommon))

fig = plt.figure(figsize = (20, 10), facecolor = 'white')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Top 100 Most Common Words in dataset_title', fontsize = 30)
plt.tight_layout()

#save
plt.savefig('dataset_title_wordcloud.png')

In [None]:
#number of rows per dataset_title, most frequent first
plt.figure(figsize = (30, 30))

sns.countplot(y = train_df['dataset_title'], 
              order = train_df['dataset_title'].value_counts().index, 
              palette = 'Spectral')
plt.ylabel('dataset_title',fontsize = 30)
plt.xticks(fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('dataset_title.png')
plt.show()

## BiGram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#get bigrams 
#ngram_range = (2, 2) restricts the vocabulary to two-word sequences
vectorizer = CountVectorizer(ngram_range = (2, 2))

#matrix of ngrams
ngrams = vectorizer.fit_transform(train_df['dataset_title']) 
features = (vectorizer.get_feature_names())
print('\n\nFeatures : \n', features)

#count frequency of ngrams
print('\n\nX1 : \n', ngrams.toarray())
  
#apply TFIDF
#NOTE: `vectorizer` and `ngrams` are rebound below, so after this cell they hold the TF-IDF fit, not the raw counts
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (2, 2))
ngrams = vectorizer.fit_transform(train_df['dataset_title'])
scores = (ngrams.toarray())
print('\n\nScores : \n', scores)
  
#get top ranking features
#column sums of the TF-IDF matrix are paired with `features`, the names taken from the CountVectorizer above
#NOTE(review): this assumes both vectorizers order their columns identically — confirm
sums = ngrams.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ('\n\nWords head : \n', words.head(20))

In [None]:
#sum every n-gram column of `ngrams`
#NOTE(review): `ngrams` and `vectorizer` are whatever the preceding cell left bound (its TF-IDF fit), so these
#are summed TF-IDF scores rather than raw counts despite the 'frequency' naming — confirm intent
count_values = ngrams.toarray().sum(axis = 0)

#list of ngrams, sorted by descending score
vocab = vectorizer.vocabulary_
df_bigram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse = True)
            ).rename(columns = {0: 'frequency', 1: 'bigram'})

#plot the top 60 bigrams
plt.figure(figsize = (20, 10))
sns.lineplot(x = df_bigram['bigram'][:60], y = df_bigram['frequency'][:60])
plt.xticks(rotation = 90, fontsize = 16)
plt.xlabel('Bigram',fontsize = 20)
plt.ylabel('Frequency',fontsize = 20)
plt.title('Dataset Title Bigram',fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('dataset_title_bigram.png')
plt.show()

## TriGram

In [None]:
#get trigrams 
#ngram_range = (3, 3) restricts the vocabulary to three-word sequences
vectorizer = CountVectorizer(ngram_range = (3, 3))

#matrix of ngrams
ngrams = vectorizer.fit_transform(train_df['dataset_title']) 
features = (vectorizer.get_feature_names())
print('\n\nFeatures : \n', features)

#count frequency of ngrams
print('\n\nX1 : \n', ngrams.toarray())
  
#apply TFIDF
#NOTE: `vectorizer` and `ngrams` are rebound below, so after this cell they hold the TF-IDF fit, not the raw counts
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (3,3))
ngrams = vectorizer.fit_transform(train_df['dataset_title'])
scores = (ngrams.toarray())
print('\n\nScores : \n', scores)
  
#get top ranking features
#column sums of the TF-IDF matrix are paired with `features`, the names taken from the CountVectorizer above
#NOTE(review): this assumes both vectorizers order their columns identically — confirm
sums = ngrams.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ('\n\nWords head : \n', words.head(60))

In [None]:
#sum every n-gram column of `ngrams`
#NOTE(review): `ngrams` and `vectorizer` are whatever the preceding cell left bound (its TF-IDF fit), so these
#are summed TF-IDF scores rather than raw counts despite the 'frequency' naming — confirm intent
count_values = ngrams.toarray().sum(axis = 0)

#list of ngrams, sorted by descending score
vocab = vectorizer.vocabulary_
df_trigram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse = True)
            ).rename(columns = {0: 'frequency', 1:'trigram'})

#plot the top 60 trigrams
plt.figure(figsize = (20, 10))
sns.lineplot(x = df_trigram['trigram'][:60], y = df_trigram['frequency'][:60])
plt.xticks(rotation = 90, fontsize = 16)
plt.xlabel('Trigram',fontsize = 20)
plt.ylabel('Frequency',fontsize = 20)
plt.title('Dataset Title Trigram',fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('dataset_title_trigram.png')
plt.show()

In [None]:
#find one-worded, two-worded, three-worded, four-worded, five-worded, six-worded dataset title
#compute the per-row word count once instead of re-splitting the column for every bucket
dataset_title_word_counts = train_df['dataset_title'].str.split().apply(len)
one_worded_dataset_title = train_df[dataset_title_word_counts == 1]
two_worded_dataset_title = train_df[dataset_title_word_counts == 2]
three_worded_dataset_title = train_df[dataset_title_word_counts == 3]
four_worded_dataset_title = train_df[dataset_title_word_counts == 4]
five_worded_dataset_title = train_df[dataset_title_word_counts == 5]
six_worded_dataset_title = train_df[dataset_title_word_counts == 6]

#create a bar plot
#bar height is the number of rows: DataFrame.size is rows x columns and would inflate every bar by the column count
#NOTE: titles longer than six words are not represented in this chart
fig, ax = plt.subplots(figsize = (10, 6))
ax.bar([1, 2, 3, 4, 5, 6], [len(one_worded_dataset_title),
                            len(two_worded_dataset_title),
                            len(three_worded_dataset_title),
                            len(four_worded_dataset_title),
                            len(five_worded_dataset_title),
                            len(six_worded_dataset_title)])

#label the x-axis instances
ax.set_xticks([1, 2, 3, 4, 5, 6])
ax.set_xticklabels(["one", "two", "three", 'four', 'five', 'six'])

# set the title and the xy-axis labels
plt.title("Number of Words in Dataset Title")
plt.xlabel("Number of Words")
plt.ylabel("Dataset Title")

# display the plot
plt.show()

### 'cleaned_label'

In [None]:
train_df['cleaned_label'].unique()

In [None]:
train_df['cleaned_label'].value_counts().head(20).to_frame()

### Frequency Distributions

In [None]:
#create a frequency distribution to see which words are used the most
words = list( train_df['cleaned_label'].values)
#NOTE: this rebinds `stopwords`, the name imported from nltk.corpus earlier, to a plain list
stopwords = stopwords_list

#tokenise each label on whitespace and keep only the non-stopword tokens
split_words = [[token for token in str(label).split() if token not in stopwords]
               for label in words]

#flatten the per-label token lists into one list
allwords = []
for tokens in split_words:
    allwords.extend(tokens)
    
#get 100 most common words
mostcommon = FreqDist(allwords).most_common(100)
mostcommon

In [None]:
#plot frequency distributions
#size each word by its actual count: generate(str(mostcommon)) tokenised the repr of the list,
#in which every word occurs exactly once, so the cloud did not reflect the frequencies
wordcloud = WordCloud(width = 1600, height = 800, 
                      background_color = 'black', 
                      colormap = 'Spectral', 
                      stopwords = stopwords_list).generate_from_frequencies(dict(mostcommon))

fig = plt.figure(figsize = (20, 10), facecolor = 'white')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Top 100 Most Common Words in cleaned_label', fontsize = 30)
plt.tight_layout()

#save
plt.savefig('cleaned_label_wordcloud.png')

In [None]:
#number of rows per cleaned_label, most frequent first
plt.figure(figsize = (30, 40))

sns.countplot(y = train_df['cleaned_label'], 
              order = train_df['cleaned_label'].value_counts().index, 
              palette = 'Spectral')
plt.ylabel('Cleaned Label',fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('cleaned_label.png')
plt.show()

The classes are highly imbalanced.

## BiGram

An n-gram means a sequence of n-words.

Some English words occur together more frequently than others. So, in a text document, we may need to identify such pairs of words, which can help in sentiment analysis. 

Bigram is 2 consecutive words in a sentence.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#get bigrams 
#ngram_range = (2, 2) restricts the vocabulary to two-word sequences
vectorizer = CountVectorizer(ngram_range = (2, 2))

#matrix of ngrams
ngrams = vectorizer.fit_transform(train_df['cleaned_label']) 
features = (vectorizer.get_feature_names())
print('\n\nFeatures : \n', features)

#count frequency of ngrams
print('\n\nX1 : \n', ngrams.toarray())
  
#apply TFIDF
#NOTE: `vectorizer` and `ngrams` are rebound below, so after this cell they hold the TF-IDF fit, not the raw counts
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (2, 2))
ngrams = vectorizer.fit_transform(train_df['cleaned_label'])
scores = (ngrams.toarray())
print('\n\nScores : \n', scores)
  
#get top ranking features
#column sums of the TF-IDF matrix are paired with `features`, the names taken from the CountVectorizer above
#NOTE(review): this assumes both vectorizers order their columns identically — confirm
sums = ngrams.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ('\n\nWords head : \n', words.head(20))

In [None]:
#sum every n-gram column of `ngrams`
#NOTE(review): `ngrams` and `vectorizer` are whatever the preceding cell left bound (its TF-IDF fit), so these
#are summed TF-IDF scores rather than raw counts despite the 'frequency' naming — confirm intent
count_values = ngrams.toarray().sum(axis = 0)

#list of ngrams, sorted by descending score
vocab = vectorizer.vocabulary_
df_bigram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse = True)
            ).rename(columns = {0: 'frequency', 1: 'bigram'})

#plot the top 60 bigrams
plt.figure(figsize = (20, 10))
sns.lineplot(x = df_bigram['bigram'][:60], y = df_bigram['frequency'][:60])
plt.xticks(rotation = 90, fontsize = 16)
plt.xlabel('Bigram',fontsize = 20)
plt.ylabel('Frequency',fontsize = 20)
plt.title('Cleaned Label Bigram',fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('cleaned_label bigram.png')
plt.show()

## TriGram

Trigram is 3 consecutive words in a sentence. 

In [None]:
#get trigrams 
#ngram_range = (3, 3) restricts the vocabulary to three-word sequences
vectorizer = CountVectorizer(ngram_range = (3, 3))

#matrix of ngrams
ngrams = vectorizer.fit_transform(train_df['cleaned_label']) 
features = (vectorizer.get_feature_names())
print('\n\nFeatures : \n', features)

#count frequency of ngrams
print('\n\nX1 : \n', ngrams.toarray())
  
#apply TFIDF
#NOTE: `vectorizer` and `ngrams` are rebound below, so after this cell they hold the TF-IDF fit, not the raw counts
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range = (3, 3))
ngrams = vectorizer.fit_transform(train_df['cleaned_label'])
scores = (ngrams.toarray())
print('\n\nScores : \n', scores)
  
#get top ranking features
#column sums of the TF-IDF matrix are paired with `features`, the names taken from the CountVectorizer above
#NOTE(review): this assumes both vectorizers order their columns identically — confirm
sums = ngrams.sum(axis = 0)
data1 = []
for col, term in enumerate(features):
    data1.append( (term, sums[0,col] ))
ranking = pd.DataFrame(data1, columns = ['term','rank'])
words = (ranking.sort_values('rank', ascending = False))
print ('\n\nWords head : \n', words.head(20))

In [None]:
#sum every n-gram column of `ngrams`
#NOTE(review): `ngrams` and `vectorizer` are whatever the preceding cell left bound (its TF-IDF fit), so these
#are summed TF-IDF scores rather than raw counts despite the 'frequency' naming — confirm intent
count_values = ngrams.toarray().sum(axis = 0)

#list of ngrams, sorted by descending score
vocab = vectorizer.vocabulary_
df_trigram = pd.DataFrame(sorted([(count_values[i],k) for k,i in vocab.items()], reverse = True)
            ).rename(columns = {0: 'frequency', 1:'trigram'})

#plot the top 60 trigrams
plt.figure(figsize = (20, 10))
sns.lineplot(x = df_trigram['trigram'][:60], y = df_trigram['frequency'][:60])
plt.xticks(rotation = 90, fontsize = 16)
plt.xlabel('Trigram',fontsize = 20)
plt.ylabel('Frequency',fontsize = 20)
plt.title('Cleaned Label Trigram',fontsize = 30)

#save before showing: once plt.show() has rendered the figure, a later savefig typically writes a blank image
plt.savefig('cleaned_label trigram.png')
plt.show()

In [None]:
#find one-worded, two-worded, three-worded, four-worded, five-worded cleaned label
#compute the per-row word count once instead of re-splitting the column for every bucket
cleaned_label_word_counts = train_df['cleaned_label'].str.split().apply(len)
one_worded_cleaned_label = train_df[cleaned_label_word_counts == 1]
two_worded_cleaned_label = train_df[cleaned_label_word_counts == 2]
three_worded_cleaned_label = train_df[cleaned_label_word_counts == 3]
four_worded_cleaned_label = train_df[cleaned_label_word_counts == 4]
five_worded_cleaned_label = train_df[cleaned_label_word_counts == 5]

#create a bar plot
#bar height is the number of rows: DataFrame.size is rows x columns and would inflate every bar by the column count
#NOTE: labels longer than five words are not represented in this chart
fig, ax = plt.subplots(figsize = (10, 6))
ax.bar([1, 2, 3, 4, 5], [len(one_worded_cleaned_label),
                         len(two_worded_cleaned_label),
                         len(three_worded_cleaned_label),
                         len(four_worded_cleaned_label),
                         len(five_worded_cleaned_label)])

#label the x-axis instances
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_xticklabels(["one", "two", "three", 'four', 'five'])

# set the title and the xy-axis labels
plt.title("Number of Words in Cleaned Label")
plt.xlabel("Number of Words")
plt.ylabel("Cleaned Label")

# display the plot
plt.show()

# SENTIMENT ANALYSIS

TextBlob is another excellent open-source library for performing NLP tasks with ease, including sentiment analysis. It also has a sentiment lexicon (in the form of an XML file) which it leverages to give both polarity and subjectivity scores. The polarity score is a float within the range [-1.0, 1.0]. The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [None]:
from textblob import TextBlob

#get text sentiment
#store TextBlob's polarity score for each row's 'text' in a new 'text_sentiment' column
train_df['text_sentiment'] = train_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

#review
train_df.head(5)

In [None]:
from textblob import TextBlob

#get text tag
#store TextBlob's `.tags` result for each 'dataset_title' in a new 'dataset_title_tag' column
#presumably a list of (word, part-of-speech tag) pairs — confirm against the TextBlob docs
train_df['dataset_title_tag'] = train_df['dataset_title'].apply(lambda x: TextBlob(x).tags)

#review
train_df.head(5)

In [None]:
#save
train_df.to_csv('train_df_tag.csv')