In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Library installation and imports

In [None]:
!pip3 install hvplot[pandas]

In [None]:
import json
import spacy

In [None]:
from functools import partial
from pathlib import Path
from tqdm import tqdm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate

In [None]:
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import hvplot.pandas 

import yellowbrick
from yellowbrick.text import FreqDistVisualizer, UMAPVisualizer

# Dataset

In [None]:
# Root directory of the competition data
path = Path("/kaggle/input/coleridgeinitiative-show-us-the-data")

# Directories holding the article files of each split
train_files_path = path.joinpath('train')
test_files_path = path.joinpath('test')

# Training table and submission template
train_df = pd.read_csv(path.joinpath('train.csv'))
sample_sub = pd.read_csv(path.joinpath('sample_submission.csv'))

In [None]:
# Preview the first three rows of the training table.
train_df.head(3)

## Retrieve introduction / abstract

In [None]:
def retrieve_first_section(filename, train_files_path=train_files_path):
    """Return the text of the first non-empty section of an article.

    The article is read from ``<train_files_path>/<filename>.json``. The file
    is assumed to contain a JSON list of section dicts, each with a ``'text'``
    entry; the first non-empty one is most probably the abstract or the
    introduction.

    Parameters
    ----------
    filename : str
        Article identifier; the ``.json`` suffix is added automatically.
    train_files_path : pathlib.Path
        Directory containing the article JSON files. Despite its name it can
        point at any split (e.g. the test directory).

    Returns
    -------
    str
        Text of the first section whose stripped text is not empty, or an
        empty string when the article has no such section.
    """
    json_path = (train_files_path / filename).with_suffix('.json')

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with json_path.open(encoding='utf-8') as f:
        sections = json.load(f)

    for section in sections:
        # A missing or null 'text' entry is treated as an empty section.
        text = section.get('text') or ''
        if text.strip():
            # Stop at the first hit instead of scanning the whole article.
            return text

    # No usable section: return an empty string rather than raising, so one
    # malformed article does not abort the whole `apply`.
    return ''

In [None]:
# Register tqdm with pandas so that `progress_apply` is available and reports progress.
tqdm.pandas()
# Read the first non-empty section of every training article into a new 'text' column.
train_df['text'] = train_df['Id'].progress_apply(retrieve_first_section)

In [None]:
tqdm.pandas()
# Same extraction for the submission articles. The keyword is named
# `train_files_path`, but here it is bound to the test directory.
sample_sub['text'] = sample_sub['Id'].progress_apply(partial(retrieve_first_section, train_files_path=test_files_path))

In [None]:
# Preview the training table again, now including the extracted 'text' column.
train_df.head(3)

In [None]:
# Drop every article whose first section is 100000 characters or longer
train_df = train_df[train_df.text.str.len() < 100000]

In [None]:
# Number of rows remaining after the length filter.
train_df.index.size

In [None]:
# Remove badly represented labels: keep only the articles whose label occurs
# more than 5 times in the current training table.
label_counts = train_df['cleaned_label'].value_counts()
frequent_labels = label_counts[label_counts > 5].index
# Copy the filtered frame so that columns added later are written to an
# independent DataFrame (avoids pandas' SettingWithCopyWarning).
train_df = train_df[train_df['cleaned_label'].isin(frequent_labels)].copy()

In [None]:
# Number of rows remaining after removing the rare labels.
train_df.index.size

## Text cleaning

In [None]:
import spacy

# Ask spaCy to use the GPU when possible; the outcome is kept in `is_using_gpu`.
is_using_gpu = spacy.prefer_gpu()
# Load the large English pipeline used for lemmatization.
# NOTE(review): `lemmatize` only reads lemma_, is_alpha, is_stop and text, so the
# parser and NER components could probably be disabled for speed — confirm.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Lowercase the extracted section texts before they are passed to spaCy.
corpus = train_df.text.str.lower()

In [None]:
def lemmatize(doc):
    """Return the lemmas of the tokens in `doc` that are worth keeping.

    A token is kept when it is alphabetic, is not a stop word and its text is
    longer than two characters. Lemmas are returned in document order.
    """
    lemmas = []
    for tok in doc:
        if not tok.is_alpha:
            continue
        if tok.is_stop:
            continue
        if len(tok.text) <= 2:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
def preprocess(docs, batch_size=200):
    """Lemmatize every text in `docs` and return one cleaned string per text.

    The texts are streamed through the spaCy pipeline `nlp` in batches of
    `batch_size`; each parsed document is reduced to its kept lemmas (see
    `lemmatize`) joined by single spaces. Progress is shown with tqdm.
    """
    parsed_docs = nlp.pipe(docs, batch_size=batch_size)
    return [' '.join(lemmatize(parsed)) for parsed in tqdm(parsed_docs)]

In [None]:
# Store the lemmatized text (one space-joined string per article) in a new column.
train_df['preprocessed'] = preprocess(corpus)

In [None]:
# Histogram of the preprocessed text lengths, measured in characters.
train_df.preprocessed.str.len().hvplot.hist()

In [None]:
# Drop every article whose preprocessed text is 20000 characters or longer.
# NOTE(review): this comment previously said "more than 2000 preprocessed words",
# but `.str.len()` counts characters and the threshold is 20000 — confirm which
# limit was intended.
train_df = train_df[train_df.preprocessed.str.len() < 20000]

# Exploratory Data Analysis

## Sample size

In [None]:
# Number of articles left after all the filters above.
train_df.index.size

In [None]:
# Summary of the training table (columns, dtypes, non-null counts).
train_df.info()

## Labels distribution

Check label balance and uniqueness.

In [None]:
# Distinct cleaned labels present in the filtered training table.
labels = train_df.cleaned_label.unique()

In [None]:
# Number of distinct labels.
labels.size

In [None]:
# Bar chart of the 50 most frequent labels.
top_label_counts = train_df.cleaned_label.value_counts().head(50)
top_label_counts.hvplot.bar(width=900, height=1000, rot=70)

## Token frequency distribution

In [None]:
# Build the document-term count matrix of the preprocessed texts.
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(train_df.preprocessed)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
# The result is kept as a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

In [None]:
# Plot the token frequency distribution from the document-term matrix `docs`,
# using `features` as the token names.
visualizer = FreqDistVisualizer(features=features, orient="v", size=(1920, 1080))
visualizer.fit(docs)
# Enlarge axis and tick labels so they stay readable on the large canvas.
visualizer.ax.set_xlabel("Tokens", fontsize=22)
visualizer.ax.set_ylabel("Frequency [#]", fontsize=22)
visualizer.ax.tick_params(axis='x', labelsize=16, rotation=60)
visualizer.ax.tick_params(axis='y', labelsize=16)
# NOTE(review): the title assumes 50 tokens are shown, but no count is passed
# to the visualizer explicitly — confirm this matches its default.
visualizer.set_title("Frequency Distribution of Top 50 tokens")
# NOTE(review): this style change is applied after the visualizer was fitted —
# confirm it still affects this figure.
yellowbrick.style.rcmod.set_style(rc={"legend.fontsize": 32})

## Wordcloud representation

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Concatenate all preprocessed articles into a single space-separated string.
text = " ".join(train_df.preprocessed)

# Word cloud of at most 100 words on a white background.
wordcloud = WordCloud(
    width=250,
    height=180,
    background_color="white",
    max_words=100,
    max_font_size=50,
).generate(text)

# Render the cloud without axes.
plt.figure(figsize=(16, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Baseline

In [None]:
# Encode the cleaned labels with a LabelEncoder.
# NOTE(review): this rebinds `labels` (previously the array of unique labels),
# and the modelling cells pass `train_df.cleaned_label` directly rather than
# these encoded values — confirm whether `labels` is still needed.
le = LabelEncoder()
labels = le.fit_transform(train_df.cleaned_label)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit

In [None]:
# 5-fold stratified split, shuffled with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Short metric name -> scikit-learn scorer name, passed to `cross_validate`.
scorings = dict(
    jaccard="jaccard_weighted",
    accuracy="accuracy",
    balanced_accuracy="balanced_accuracy",
    precision="precision_weighted",
    recall="recall_weighted",
    f1="f1_weighted",
    roc_auc="roc_auc_ovr_weighted",
)

## Logistic regression

In [None]:
# TF-IDF features over unigrams and bigrams (terms must appear in at least
# 5 documents), with sublinear term-frequency scaling and L2 normalization.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))
# Logistic regression with class weights balanced across labels.
lr = LogisticRegression(class_weight='balanced', solver='liblinear')
# Chain vectorizer and classifier so both are fitted within each CV fold.
pipe = make_pipeline(tfidf, lr)

In [None]:
# Cross-validate the pipeline on the preprocessed texts against the cleaned
# labels, computing every metric in `scorings` on the folds defined by `cv`.
res = cross_validate(pipe, train_df.preprocessed, train_df.cleaned_label, cv=cv, scoring=scorings, n_jobs=-1)

In [None]:
# Report the mean and a two-standard-deviation spread for each entry of the
# cross-validation results, leaving out the timing entries.
for scoring, scores in res.items():
    if scoring.endswith("time"):
        continue
    # Drop the leading prefix of the key (everything up to the first underscore).
    metric_name = ' '.join(scoring.split('_')[1:])
    spread = scores.std() * 2
    print(f"{metric_name}: {scores.mean():0.2f} (+/- {spread:0.2f})")

In [None]:
# Actual training: fit the pipeline on the whole filtered training table.
pipe.fit(train_df.preprocessed, y=train_df.cleaned_label)

## Save model

In [None]:
import joblib

In [None]:
# Path of the serialized baseline pipeline, relative to the working directory.
filename = "./baseline_lr_model.pkl"

In [None]:
# Persist the fitted pipeline to `filename`.
joblib.dump(pipe, filename)

## Load model

In [None]:
# Reload the pipeline from `filename`. Only load files this notebook wrote:
# joblib files are pickle-based and must not come from untrusted sources.
baseline_model = joblib.load(filename)

## Predictions

In [None]:
# Apply the same lowercasing and lemmatization to the submission texts.
sample_sub_preprocessed = preprocess(sample_sub.text.str.lower())

In [None]:
# Predict one label per submission article with the reloaded pipeline.
sample_sub['PredictionString'] = baseline_model.predict(sample_sub_preprocessed)

In [None]:
# Write the submission file without the helper 'text' column and without the index.
sample_sub.drop(columns=['text']).to_csv('submission.csv', index=False)