In [22]:
%load_ext autoreload
%autoreload 2

import lzma
import pickle
import hashlib
import pandas as pd
import numpy as np
import requests as r
import seaborn as sns
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt
from json import load, dump, loads, dumps
from joblib import hash
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cluster import KMeans
from sklearn.preprocessing import MaxAbsScaler
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

warnings.simplefilter('ignore')
mpl.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Source Data

IF source data is missing run Elasticsearch query to extract data and then save it in JSON format to `data` directory

In [30]:
# news_json = r.get('http://elasticsearch.storage:9200/news/scraped/_search?sort=date:desc&size=6000').json()
# with lzma.open('./data/news.json.xz', 'wt', encoding='utf-8') as fh:
#   fh.write(dumps(news_json['hits']['hits']))

In [37]:
# df = pd.json_normalize(news_json['hits']['hits'])
# df = df[['_source.body', '_source.date', '_source.subject', '_source.language', '_source.categories']]
# df.columns = ['body', 'pubdate', 'subject', 'language', 'categories']
# df = df.loc[(df['categories'] != 'News') & 
#             (df['categories'] != 'Uncategorized') &
#             (df['language'] == 'English')]

In [38]:
# df.to_csv('./data/news.json.xz', mode='wt', encoding='utf-8', compression='xz', index=False)

In [39]:
# b = loads(lzma.open('./data/news.json.xz', 'rt', encoding='utf-8').read())
# df = pd.json_normalize(b)
df = pd.read_csv('./data/news.json.xz')

## Common issues that we generally face during the data preparation phase:
 - Format and structure normalization
 - Detect and fix missing values
 - Duplicates removal
 - Units normalization
 - Constraints validations
 - Anomaly detection and removal
 - Study of features importance/relevance
 - Dimentional reduction, feature selection & extraction

In [40]:
df.drop_duplicates(inplace=True)
df.head(1).T.style

Unnamed: 0,0
Unnamed: 0,0
body,"Saudi Arabia and the UN Environment Programme (UNEP) will celebrate World Environment Day on 5 June under the theme “Our Land, Our Future”. According to the Saudi Press Agency (SPA), the ceremony will be held at the King Abdulaziz International Conference Center in Riyadh with dignitaries, officials, experts, and specialists from various countries in attendance. The Ministry of Environment, Water and Agriculture (MEWA) said the event will showcase the Kingdom’s commitment to environmental protection and sustainable development, aligning with Vision 2030 goals. The focus will be on land restoration, combating desertification, and building drought resilience. Saudi Arabia, UNEP to Host World Environment Day.https://t.co/YKKGxiUsI4#SPAGOV pic.twitter.com/dlgTPWLw1I — SPAENG (@Spa_Eng) June 1, 2024 Last month, Deputy Executive Director of UNEP, Elizabeth Mrema, spoke […]"
pubdate,2024-06-02T13:30:57.000Z
subject,Saudi Arabia and UNEP to mark World Environment Day with major event in Riyadh
language,English
categories,"Middle East, News, Saudi Arabia"


In [41]:
df['categories'] = df['categories'].str.replace(r'[^a-zA-Z_, ]+', '').replace(', ', '')
df['categories'] = df['categories'].str.replace(r'^, ', '')
df = df[~df.eq('').all(1)]

In [42]:
df.groupby(['categories']).agg({'count'}).drop_duplicates()

Unnamed: 0_level_0,Unnamed: 0,body,pubdate,subject,language
Unnamed: 0_level_1,count,count,count,count,count
categories,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"#Culture, #Life, Books",1,1,1,1,1
"#Culture, featured, Featured Culture",3,3,3,3,3
"#Islam, Arabic Studies, featured",2,2,2,2,2
"#Islam, featured, Featured Islam",4,4,4,4,4
"Afghanistan, Africa, Asia & Americas",6,6,6,6,6
"Afghanistan, Asia & Americas, Europe & Russia",14,14,14,14,14
Africa,5,5,5,5,5
"Africa, Algeria, Article",8,8,8,8,8
"Africa, Algeria, Europe & Russia",30,30,30,30,30
"Africa, Algeria, International Organisations",7,7,7,7,7


In [43]:
df['cat_id']       = df['categories'].factorize()[0]
df['lang_id']      = df['language'].factorize()[0]
df['char_count']   = df['body'].apply(len)
df['word_count']   = df['body'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)


In [44]:
df = df.dropna()

In [45]:
df.shape

(4915, 11)

In [46]:
# sns.set()
# sns.pairplot(df, height=3.5, kind="reg", palette="husl", diag_kind="auto")

In [47]:
xtrain, xtest, ytrain, ytest = train_test_split(df['body'], df['categories'], test_size=0.2, random_state=42, shuffle=True)

In [48]:
estimators = [
      # ('AdaBoostClassifier', AdaBoostClassifier(n_estimators=100)), # 0.02
      # ('LogisticRegression', SelfTrainingClassifier(LogisticRegression(fit_intercept=False, max_iter=1000))), # 20%
      # ('SGDClassifier', SelfTrainingClassifier(SGDClassifier(fit_intercept=False, loss='modified_huber', max_iter=2000))), # 10%
      ('RandomForestClassifier', SelfTrainingClassifier(RandomForestClassifier(min_samples_leaf=0.01))), # 16%
      ('MLPClassifier', SelfTrainingClassifier(MLPClassifier(max_iter=300))), # 21%
      ('DecisionTreeClassifier', SelfTrainingClassifier(DecisionTreeClassifier(min_samples_leaf=0.01))), # 15%
      # ('GradientBoostingClassifier', GradientBoostingClassifier(min_samples_leaf=0.01)), # takes too long > 20 min
    ]

In [49]:
vote = Pipeline([
      ('vect', TfidfVectorizer(sublinear_tf=True, encoding='latin-1', analyzer='char', lowercase=False, ngram_range=(1, 2), stop_words='english')),
      ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)),
      ('scaler', MaxAbsScaler(copy=False)),
      ('feature_union', FeatureUnion([
        ('select_best', SelectKBest(score_func=mutual_info_classif, k='all')),
        ('kmeans', KMeans(n_clusters=5, random_state=42, verbose=1)),
      ])),
      ('clf', VotingClassifier(estimators=estimators, voting='soft', verbose=True))])
vote.fit(xtrain, ytrain)
prediction = vote.predict(xtest)
score      = vote.score(xtest, prediction)

[Voting] ... (1 of 3) Processing RandomForestClassifier, total=  23.2s
[Voting] ............ (2 of 3) Processing MLPClassifier, total= 4.0min
[Voting] ... (3 of 3) Processing DecisionTreeClassifier, total=  41.1s


In [50]:
print(f'{accuracy_score(ytest, prediction) * 100:.2f}%')

26.14%


In [51]:
# print(classification_report(ytest, prediction))

In [52]:
s = '''

‘Guys, you’ve got to hear this,” I said. I was sitting in front of my computer one day in July 2012, with one eye on a screen of share prices and the other on a live stream of the House of Commons Treasury select committee hearings. As the Barclays share price took a graceful swan dive, I pulled my headphones out of the socket and turned up the volume so everyone could hear. My colleagues left their terminals and came around to watch BBC Parliament with me.

It didn’t take long to realise what was happening. “Bob’s getting murdered,” someone said.

Bob Diamond, the swashbuckling chief executive of Barclays, had been called before the committee to explain exactly what his bank had been playing at in regards to the Libor rate-fixing scandal. The day before his appearance, he had made things very much worse by seeming to accuse the deputy governor of the Bank of England of ordering him to fiddle an important benchmark, then walking back the accusation as soon as it was challenged. He was trying to turn on his legendary charm in front of a committee of angry MPs, and it wasn’t working. On our trading floor, in Mayfair, calls were coming in from all over the City. Investors needed to know what was happening and whether the damage was reparable.

A couple of weeks later, the damage was done. The money was gone, Diamond was out of a job and the market, as it always does, had moved on. We were left asking ourselves: How did we get it so wrong?

'''

In [53]:
vote.predict([s])[0].split(',')

['Opinion']

In [20]:
with lzma.open('./data/voting_classifier.pickle.xz', 'wb') as f:
  pickle.dump(vote, f, protocol=5)

In [234]:
# dt = DecisionTreeClassifier(min_samples_leaf=1)
# dt.fit(vote.named_steps['vect'].transform(xtrain), vote.predict(xtrain))

In [None]:
def serialize_and_hash(data):
    serialized_data = pickle.dumps(data)
    data_hash = hashlib.sha256(serialized_data).hexdigest()
    return serialized_data, data_hash

def verify_integrity(serialized_data, stored_hash):
    calculated_hash = hashlib.sha256(serialized_data).hexdigest()
    return calculated_hash == stored_hash

# Example usage
data = df
serialized_data, data_hash = serialize_and_hash(data)

# When loading
# Load serialized_data and stored_hash
with lzma.open('./data/voting_classifier.pickle.xz', 'rb') as f:
    loaded_serialized_data = f.read()

# Load the stored hash
with lzma.open('./data/news.json.xz', 'rt') as f:
    stored_hash = f.read()

if verify_integrity(loaded_serialized_data, stored_hash):
    loaded_data = pickle.loads(loaded_serialized_data)
    print("Data integrity verified!")
else:
    print("Data integrity verification failed!")