In [1]:
%load_ext autoreload
%autoreload 2

import lzma
import pickle
import hashlib
import pandas as pd
import numpy as np
import requests as r
import seaborn as sns
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt
from json import load, dump, loads, dumps
from joblib import hash
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

warnings.simplefilter('ignore')
mpl.style.use('ggplot')

## Source Data

IF source data is missing run Elasticsearch query to extract data and then save it in JSON format to `data` directory

In [2]:
# news_json = r.get('http://elasticsearch.storage:9200/news/scraped/_search?sort=date:desc&size=6000').json()
# with lzma.open('./data/news.json.xz', 'wt', encoding='utf-8') as fh:
#   fh.write(dumps(news_json['hits']['hits']))

In [3]:
b = loads(lzma.open('./data/news.json.xz', 'rt', encoding='utf-8').read())
df = pd.json_normalize(b)

## Common issues that we generally face during the data preparation phase:
 - Format and structure normalization
 - Detect and fix missing values
 - Duplicates removal
 - Units normalization
 - Constraints validations
 - Anomaly detection and removal
 - Study of features importance/relevance
 - Dimentional reduction, feature selection & extraction

In [4]:
df = df[['_source.body', '_source.date', '_source.subject', '_source.language', '_source.categories']]
df.columns = ['body', 'pubdate', 'subject', 'language', 'categories']

In [5]:
df.drop_duplicates(inplace=True)
df.head(1).T.style

Unnamed: 0,0
body,"Norway has urged the United Nations Security Council to grant Palestine full membership status, saying the Nordic country will vote in favor when the General Assembly votes on the issue, Anadolu Agency reports. “Norway has made it very clear that it supports Palestine’s application for full membership status in the UN, and that Norway will vote in favor the day the General Assembly holds a vote on the application,” Foreign Minister Espen Barth Eide said in a statement issued late Friday. The announcement comes after the UN General Assembly overwhelmingly approved a resolution on Friday calling for the reevaluation of Palestine’s UN membership bid and the grant of additional rights. The US, Israel, Hungary, Argentina, Micronesia, Nauru, Palau, and Papua […]"
pubdate,2024-05-11T17:02:19.000Z
subject,UN Security Council must approve Palestine's membership: Norway
language,English
categories,"Asia & Americas, Europe & Russia, International Organisations"


In [6]:
df = df.loc[(df['categories'] != 'News') & 
            (df['categories'] != 'Uncategorized') &
            (df['language'] == 'English')]

In [7]:
df['categories'] = df['categories'].str.replace(r'[^a-zA-Z_, ]+', '').replace(', ', '')
df['categories'] = df['categories'].str.replace(r'^, ', '')
df = df[~df.eq('').all(1)]

In [8]:
df.groupby(['categories']).agg({'count'}).drop_duplicates()

Unnamed: 0_level_0,body,pubdate,subject,language
Unnamed: 0_level_1,count,count,count,count
categories,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"#Culture, #Life, Books",1,1,1,1
"#Culture, featured, Featured Culture",3,3,3,3
"#Current Affairs, featured, Featured Current Affairs",2,2,2,2
"#Islam, featured, Featured Islam",4,4,4,4
"Afghanistan, Africa, Asia & Americas",6,6,6,6
"Afghanistan, Asia & Americas, Europe & Russia",13,13,13,13
"Africa, Algeria, Article",8,8,8,8
"Africa, Algeria, Asia & Americas",9,9,9,9
"Africa, Algeria, Europe & Russia",30,30,30,30
"Africa, Algeria, News",12,12,12,12


In [9]:
df['cat_id']       = df['categories'].factorize()[0]
df['lang_id']      = df['language'].factorize()[0]
df['char_count']   = df['body'].apply(len)
df['word_count']   = df['body'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)


In [10]:
df = df.dropna()

In [11]:
df.shape

(5672, 10)

In [12]:
# sns.set()
# sns.pairplot(df, height=3.5, kind="reg", palette="husl", diag_kind="auto")

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(df['body'], df['categories'], test_size=0.2, random_state=42, shuffle=True)

In [14]:
engines = [
      # ('AdaBoostClassifier', AdaBoostClassifier(n_estimators=100)), # 0.02
      ('LogisticRegression', SelfTrainingClassifier(LogisticRegression(fit_intercept=False, max_iter=1000))), # 20%
      # ('SGDClassifier', SelfTrainingClassifier(SGDClassifier(fit_intercept=False, loss='modified_huber', max_iter=2000))), # 10%
      ('RandomForestClassifier', SelfTrainingClassifier(RandomForestClassifier(min_samples_leaf=0.01))), # 16%
      ('MLPClassifier', SelfTrainingClassifier(MLPClassifier(max_iter=300))), # 21%
      # ('DecisionTreeClassifier', SelfTrainingClassifier(DecisionTreeClassifier(min_samples_leaf=0.01))), # 15%
      # ('GradientBoostingClassifier', GradientBoostingClassifier(min_samples_leaf=0.01)), # takes too long > 20 min
    ]

In [15]:
vote = Pipeline([
      ('vect', TfidfVectorizer(sublinear_tf=True, encoding='latin-1', analyzer='char', lowercase=False, ngram_range=(1, 2), stop_words='english')),
      ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)),
      ('scaler', MaxAbsScaler(copy=False)),
      # ('feature_selection', SelectKBest(score_func=f_classif, k=300)),
      ('clf', VotingClassifier(estimators=engines, voting='soft', verbose=True))])
vote.fit(xtrain, ytrain)
prediction = vote.predict(xtest)
score      = vote.score(xtest, prediction)

[Voting] ....... (1 of 3) Processing LogisticRegression, total= 5.9min
[Voting] ... (2 of 3) Processing RandomForestClassifier, total=  21.4s
[Voting] ............ (3 of 3) Processing MLPClassifier, total= 3.0min


In [16]:
print(accuracy_score(ytest, prediction))

0.3030837004405286


In [23]:
# print(classification_report(ytest, prediction))

In [17]:
s = '''

‘Guys, you’ve got to hear this,” I said. I was sitting in front of my computer one day in July 2012, with one eye on a screen of share prices and the other on a live stream of the House of Commons Treasury select committee hearings. As the Barclays share price took a graceful swan dive, I pulled my headphones out of the socket and turned up the volume so everyone could hear. My colleagues left their terminals and came around to watch BBC Parliament with me.

It didn’t take long to realise what was happening. “Bob’s getting murdered,” someone said.

Bob Diamond, the swashbuckling chief executive of Barclays, had been called before the committee to explain exactly what his bank had been playing at in regards to the Libor rate-fixing scandal. The day before his appearance, he had made things very much worse by seeming to accuse the deputy governor of the Bank of England of ordering him to fiddle an important benchmark, then walking back the accusation as soon as it was challenged. He was trying to turn on his legendary charm in front of a committee of angry MPs, and it wasn’t working. On our trading floor, in Mayfair, calls were coming in from all over the City. Investors needed to know what was happening and whether the damage was reparable.

A couple of weeks later, the damage was done. The money was gone, Diamond was out of a job and the market, as it always does, had moved on. We were left asking ourselves: How did we get it so wrong?

'''

In [18]:
vote.predict([s])[0].split(',')

['Activism', ' Environment', ' Health & Wellness']

In [19]:
with lzma.open('./data/voting_classifier.pickle.xz', 'wb') as f:
  pickle.dump(vote, f, protocol=5)

In [234]:
# dt = DecisionTreeClassifier(min_samples_leaf=1)
# dt.fit(vote.named_steps['vect'].transform(xtrain), vote.predict(xtrain))

In [None]:
def serialize_and_hash(data):
    serialized_data = pickle.dumps(data)
    data_hash = hashlib.sha256(serialized_data).hexdigest()
    return serialized_data, data_hash

def verify_integrity(serialized_data, stored_hash):
    calculated_hash = hashlib.sha256(serialized_data).hexdigest()
    return calculated_hash == stored_hash

# Example usage
data = b
serialized_data, data_hash = serialize_and_hash(data)

# When loading
# Load serialized_data and stored_hash
with lzma.open('./data/voting_classifier.pickle.xz', 'rb') as f:
    loaded_serialized_data = f.read()

# Load the stored hash
with lzma.open('./data/news.json.xz', 'rt') as f:
    stored_hash = f.read()

if verify_integrity(loaded_serialized_data, stored_hash):
    loaded_data = pickle.loads(loaded_serialized_data)
    print("Data integrity verified!")
else:
    print("Data integrity verification failed!")