## Coleridge Starter EDA
- Identify Most popular/cited Datasets in the Training Data
- Identify Most important words in a Dataset
- Identify Top Keywords in the entire Training Corpus

### Addon: Identify a Normal Distribution in the Dataset

### Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import nltk
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from functools import partial
import re
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
import itertools
import collections

In [None]:
pwd

#### Import Training Data

In [None]:
# Load train.csv into a DataFrame; the path is relative to the notebook's working directory.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

In [None]:
# (rows, columns) of the training table.
train_df.shape

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

In [None]:
# Report how many distinct values each column of the training table holds.
print('Unique values in the training set:')
for column_name in train_df:
    distinct_count = train_df[column_name].nunique()
    print(f'{column_name} : {distinct_count}')

In [None]:
### Define paths for the train and test JSON files
# json_to_text appends '<Id>.json' to one of these directories to locate a publication.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = '../input/coleridgeinitiative-show-us-the-data/test'

In [None]:
### Function to read JSON files and extract publication Text 

def json_to_text(filename, train_files_path=train_files_path, output='text'):
    """Read one publication's JSON file and return its text as a single string.

    Parameters
    ----------
    filename : str
        Publication Id; ``.json`` is appended to build the file name.
    train_files_path : str
        Directory that contains the JSON file (defaults to the training folder).
    output : str
        ``'text'`` returns the section bodies, ``'head'`` returns the section
        titles, and any other value returns titles and bodies interleaved.

    Returns
    -------
    str
        The selected pieces joined with single spaces.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Decode explicitly as UTF-8 rather than relying on the platform's locale default.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing or null field becomes '' so that str.join never receives None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)
    # Interleave each title with its body, keeping the sections in file order.
    return ' '.join(piece for pair in zip(headings, contents) for piece in pair)

In [None]:
### Extract Publication Text for Training Data
# tqdm.pandas() must run first: it is what makes progress_apply available on pandas objects.
tqdm.pandas()
# json_to_text is called once per row, with its default arguments (training folder, section bodies only).
train_df['text'] = train_df['Id'].progress_apply(json_to_text)

In [None]:
# Preview the training table again, now including the extracted 'text' column.
train_df.head()

In [None]:
### Reading the Sample Submission Data
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

### Extract Publication Text for the sample publications
# json_to_text's directory parameter is named train_files_path; here it is pointed at the test folder.
sample_sub['text'] = sample_sub['Id'].apply(partial(json_to_text, train_files_path=test_files_path))

# Preview last: a bare expression is only rendered when it is the final statement of the cell.
sample_sub.head()

## Identify Most popular/cited Datasets in the Training Data

In [None]:
#### Set size for sns plots
sns.set(rc={'figure.figsize': (11.7, 8.27)})

## Bar chart of the 15 most frequently cited dataset labels
top_dataset_counts = train_df['dataset_label'].value_counts().iloc[:15]
top_dataset_counts.plot(
    kind='bar',
    title='Famous Datasets',
    color=sns.color_palette("husl", 8),
)

- We can look at the percentage distribution of cited Datasets

In [None]:
# Share (in %) of all training rows carried by each of the 15 most frequent dataset labels.
((train_df.dataset_label.value_counts()/train_df.dataset_label.shape[0])*100)[:15]

- Alzheimer's accounts for roughly 30% of all labels.

## Identify Top Keywords in the entire Training Corpus

#### One of the most Important steps before any keyword identification process is Text Cleaning to avoid GIGO (Garbage In Garbage Out).
- Lemmatize the text to bring each word to its base form, which removes redundant word variants from our vocabulary

In [None]:
def lemmatization(text):
    """Return the lemmas of the non-stop-word tokens of *text*, joined by spaces.

    Relies on the module-level spaCy pipeline ``nlp``.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

def clean_text(txt):
    """Lower-case *txt* and collapse each run of non-alphanumeric characters into one space.

    The input is converted with ``str()`` first; leading and trailing
    whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

# Normalise the raw publication text (lower-case, alphanumeric characters only).
train_df['text'] = train_df['text'].progress_apply(clean_text)

# spaCy v3 removed the 'en' shortcut, so try the full package name first and
# keep the shortcut only as a fallback for older installations.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser', 'ner'])
stop_words = stopwords.words('english')

# Lemmatization stays best-effort: on failure the cleaned text is kept, but the
# reason is reported instead of being silently discarded.
try:
    train_df['text'] = train_df['text'].progress_apply(lemmatization)
except Exception as exc:
    print(f'Lemmatization skipped, keeping cleaned text: {exc!r}')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

docs = train_df['text'].tolist()

# Ignore words that appear in more than 85% of the texts and cap the vocabulary at 60,000 terms.
cv = CountVectorizer(max_df=0.85, stop_words=stop_words, max_features=60000)
word_count_vector = cv.fit_transform(docs)

# Learn the IDF weights from the term-count matrix.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that predate get_feature_names_out().
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by column index, larger index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first *topn* entries of *sorted_items* to ``{feature name: score}``.

    *sorted_items* holds (feature index, score) pairs in the order the caller
    wants them kept. Each index is looked up in *feature_names* and each score
    is rounded to three decimals.
    """
    return {
        feature_names[feature_idx]: round(weight, 3)
        for feature_idx, weight in sorted_items[:topn]
    }

In [None]:
%%time

Ids = train_df.Id.tolist()

# Collect one record per (article, keyword) and build the DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration. Rows get a plain integer index.
keyword_rows = []
for Id, doc in zip(Ids, docs):
    tfidf_vector = tfidf_transformer.transform(cv.transform([doc]))

    # sort the tf-idf vector by descending score
    sorted_items = sort_coo(tfidf_vector.tocoo())

    # keep only the 10 highest-scoring terms of this article
    keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

    # Spell out keyword and weight explicitly instead of assigning the dict to
    # a column, whose meaning depends on the pandas version.
    keyword_rows.extend(
        {'keyword': keyword, 'weight': weight, 'id': Id}
        for keyword, weight in keywords.items()
    )

keyword_df = pd.DataFrame(keyword_rows, columns=['keyword', 'weight', 'id'])

### One row per article keyword, together with its tf-idf weight
keyword_df[['id','keyword','weight']].head()

In [None]:
# Sum each keyword's weight over all articles and chart the 15 largest totals.
keyword_df.groupby('keyword')['weight'].sum().sort_values(ascending=False)[:15].plot(
    kind='bar',title='Keywords with Top Weight',color = sns.color_palette("husl", 8))

- Kids and education seem to be the most common theme (student, school, children, and teachers are among the top keywords)
- "cov" and "covid" are among the top keywords, reflecting the research on the coronavirus
- The keywords "et" and "al" come from "et al.", short for the Latin "et alia" ("and others"), which is used in academic citations when referring to a source with multiple authors

In [None]:
# Row counts of the 15 most frequent dataset labels.
train_df.dataset_label.value_counts()[:15]

In [None]:
# Total weight per keyword, 15 largest.
keyword_df.groupby('keyword')['weight'].sum().sort_values(ascending=False)[:15]

In [None]:
# Number of keyword_df rows in which each keyword appears, 15 largest.
keyword_df.groupby('keyword')['keyword'].count().sort_values(ascending=False)[:15]

In [None]:
# Summed weight per (article id, keyword) pair, 20 largest.
keyword_df.groupby(['id','keyword'])['weight'].sum().sort_values(ascending=False)[:20]

### Identify Keywords for a specific Article

In [None]:
### Looking at the keywords from the first article
doc = docs[0]

# tf-idf scores of this single document, ordered from highest to lowest
tfidf_vector = tfidf_transformer.transform(cv.transform([doc]))
sorted_items = sort_coo(tfidf_vector.tocoo())

# keep the 20 highest-scoring terms
keywords = extract_topn_from_vector(feature_names, sorted_items, 20)

# cleaned label of the first training row, followed by the keywords and their scores
print(train_df.cleaned_label[0])
print("\n===Keywords===")
for term, score in keywords.items():
    print(term, score)

### What does the article length distribution look like? Pretty "normal"

In [None]:
# Histogram of article lengths; str.len() counts characters, not words.
train_df.text.str.len().plot(kind='hist')

- Some articles are extremely long, so the outliers need to be adjusted to get a sense of the distribution

In [None]:
## Function to remove outlier article lengths

def get_iqr(df):
    """Return the upper outlier fence, Q3 + 1.5 * IQR, of the text lengths as an int.

    Side effect: stores the character length of ``df['text']`` in a
    ``text_len`` column of *df*.
    """
    df['text_len'] = df['text'].str.len()
    ordered_lengths = np.sort(df['text_len'])
    lower_quartile, upper_quartile = np.percentile(ordered_lengths, [25, 75])
    spread = upper_quartile - lower_quartile
    return int(upper_quartile + (1.5 * spread))

# Work on a copy so that get_iqr's text_len column is not added to train_df.
df_eda = train_df.copy()
upper_limit = get_iqr(df_eda)

# Truncate every text at the fence, so over-long articles count as upper_limit characters.
adjusted_len = train_df['text'].str.slice(0, upper_limit)
adjusted_char_counts = adjusted_len.str.len()
sns.histplot(data=adjusted_char_counts)

### Publication length follows a roughly bell-shaped but right-skewed distribution, with a median of around 25,000 characters and exceptions of 80K+ characters as well.

In [None]:
### Save Cleaned Train file and Keywords to csv for quick reference
# Both files are written to the current working directory.
train_df.to_csv('./train_df_cleaned.csv')
keyword_df.to_csv('./keywords.csv')
# Disabled alternative: presumably reloads previously saved keywords instead of recomputing them.
#keyword_df = pd.read_csv('../input/coleridgetrainkeywords/keywords.csv')

Reference :https://www.kaggle.com/manabendrarout/tabular-data-preparation-basic-eda-and-baseline
           https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.YIQBCpAzaUk
