#### Author: Serge Wilson MENDY

### Dependencies installation

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.4 MB/s[0m eta [36m0:00:0

### Librairies importation

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification


### Data importation

In [4]:
path = "inshort_news_data-1.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,news_headline,news_article,news_category
0,0,50-year-old problem of biology solved by Artif...,DeepMind's AI system 'AlphaFold' has been reco...,technology
1,1,Microsoft Teams to stop working on Internet Ex...,Microsoft Teams will stop working on Internet ...,technology
2,2,Hope US won't erect barriers to cooperation: C...,"China, in response to reports of US adding Chi...",technology
3,3,Global smartphone sales in Q3 falls 5.7% to 36...,The global smartphone sales in the third quart...,technology
4,4,EU hoping Biden will clarify US position on di...,The European Union (EU) is hoping that US Pres...,technology


In [5]:
df["news"] = df["news_headline"] + df["news_article"]
df.head()

Unnamed: 0.1,Unnamed: 0,news_headline,news_article,news_category,news
0,0,50-year-old problem of biology solved by Artif...,DeepMind's AI system 'AlphaFold' has been reco...,technology,50-year-old problem of biology solved by Artif...
1,1,Microsoft Teams to stop working on Internet Ex...,Microsoft Teams will stop working on Internet ...,technology,Microsoft Teams to stop working on Internet Ex...
2,2,Hope US won't erect barriers to cooperation: C...,"China, in response to reports of US adding Chi...",technology,Hope US won't erect barriers to cooperation: C...
3,3,Global smartphone sales in Q3 falls 5.7% to 36...,The global smartphone sales in the third quart...,technology,Global smartphone sales in Q3 falls 5.7% to 36...
4,4,EU hoping Biden will clarify US position on di...,The European Union (EU) is hoping that US Pres...,technology,EU hoping Biden will clarify US position on di...


### Preprocessing

In [6]:
input_texts = df["news"].tolist()
labels = df["news_category"].tolist()

num_classes = len(set(labels))

In [7]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
label_to_index = {label: index for index, label in enumerate(set(labels))}
index_to_label = {}
for z in zip(label_to_index.keys(), label_to_index.values()):
  index_to_label[z[1]] = z[0]

In [9]:
input_encodings = tokenizer(input_texts, truncation=True, padding=True, return_tensors="pt")

encoded_labels = torch.tensor([label_to_index[label] for label in labels])

In [10]:
from torch.utils.data import random_split, DataLoader


data = torch.utils.data.TensorDataset(input_encodings["input_ids"], input_encodings["attention_mask"], encoded_labels)

train_size = int(0.8 * len(data))
test_size = len(data) - train_size

train_data, test_data = random_split(data, [train_size, test_size], generator=torch.Generator().manual_seed(42))

batch_size = 32

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

### Model training

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        batch_inputs, batch_attention_mask, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids=batch_inputs, attention_mask = batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

Epoch [1/20], Loss: 0.5070596933364868
Epoch [2/20], Loss: 0.3211589455604553
Epoch [3/20], Loss: 0.06190405786037445
Epoch [4/20], Loss: 0.0578983835875988
Epoch [5/20], Loss: 0.45408913493156433
Epoch [6/20], Loss: 0.06483367830514908
Epoch [7/20], Loss: 0.05168631300330162
Epoch [8/20], Loss: 0.35180261731147766
Epoch [9/20], Loss: 0.1468428522348404
Epoch [10/20], Loss: 0.08819860965013504
Epoch [11/20], Loss: 0.030188560485839844
Epoch [12/20], Loss: 0.018308911472558975
Epoch [13/20], Loss: 0.1551586091518402
Epoch [14/20], Loss: 0.06928528845310211
Epoch [15/20], Loss: 0.005571668967604637
Epoch [16/20], Loss: 0.08200711756944656
Epoch [17/20], Loss: 0.11239516735076904
Epoch [18/20], Loss: 0.0872802883386612
Epoch [19/20], Loss: 0.1976519674062729
Epoch [20/20], Loss: 0.03152480721473694


### Model evaluation

#### Accuracies computation

In [None]:
model.eval()

train_labels = []
pred_train_labels = []

for batch in train_loader:
  batch = tuple(t.to(device) for t in batch)
  input_ids, attention_mask, labels = batch
  with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = logits.argmax(dim=1).cpu().numpy()
    pred_train_labels.extend(predictions)
    train_labels.extend(labels.cpu().numpy())

test_labels = []
pred_test_labels = []

for batch in test_loader:
  batch = tuple(t.to(device) for t in batch)
  input_ids, attention_mask, labels = batch
  with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = logits.argmax(dim=1).cpu().numpy()
    pred_test_labels.extend(predictions)
    test_labels.extend(labels.cpu().numpy())


In [None]:
from sklearn.metrics import accuracy_score

accuracy_train = accuracy_score(train_labels, pred_train_labels)
accuracy_test = accuracy_score(test_labels, pred_test_labels)

print("Accuracy sur les données d'entrainement:", accuracy_train)
print("Accuracy sur les données de test :", accuracy_test)

Accuracy sur les données d'entrainement: 0.9732675837010122
Accuracy sur les données de test : 0.9429460580912863


#### Test on new articles

In [None]:
def predict(text:str):
  text_encoding = tokenizer(text, truncation=True, return_tensors="pt")
  outputs = model(text_encoding["input_ids"].to(device), attention_mask = text_encoding['attention_mask'].to(device))
  logits = outputs.logits
  result = logits.argmax().cpu().numpy().tolist()

  return index_to_label[result]

In [None]:
articles = ["""
Kane, aged 30, has one year left on his contract with Spurs and must now make a decision about whether or not to leave.

Tottenham and Bayern have reportedly reached an agreement regarding the transfer of Harry Kane - for a deal worth more than 100 million euros  - and now it is up to the striker to decide whether he wants to make a move to the German side.

Kane, who is the second highest-scoring player in Premier League history with 213 goals, is entering the last year of his contract at Spurs and can leave as a free agent in 2024. This puts pressure on Tottenham to sell him this summer if the club is to get any money for its star player.

Kane reached the Champions League final with Tottenham in 2019. However, he has never lifted a major trophy at the club and the team's progress has stalled. An eighth-place finish in the Premier League last season meant Tottenham failed to qualify for European competition.

The three-time Premier League top scorer could be the key to revitalising Bayern, which lacked a focused goal-scoring threat through the centre last season after Robert Lewandowski left for Barcelona.

Bayern needed goal difference to beat Borussia Dortmund to the German title on the final day of a turbulent season in which its top Bundesliga scorer was Serge Gnabry with 14 goals. Two years ago, Lewandowski hit the back of the net a record 41 times.
""", """
Technology giant Nvidia says its sales have hit a record after more than doubling as demand for its artificial intelligence (AI) chips soars.

The company says revenue jumped to above $13.5bn (£10.6bn) for the three months to the end of June.

Nvidia also expects sales to soar further in the current quarter and plans to buy back $25bn of its stock.

The firm's shares rose by more than 6.5% in extended trading in New York, adding to their huge gains this year.

Nvidia also said it expects revenue of around $16bn for the three months to the end of September.

That is much higher than Wall Street expectations and would equate to a rise of around 170%, compared to the same time last year.

"A new computing era has begun," Nvidia's chief executive, Jensen Huang, said in a statement.

"Companies worldwide are transitioning from general-purpose to accelerated computing and generative AI," he added.

The strong performance was driven by Nvidia's data centre business, which includes AI chips.

Revenue for that unit came in at more $10.3bn, a rise of more than 170% from year ago, as cloud computing service providers and large consumer internet companies snapped up its next-generation processors.

This year, Nvidia's stock market value has jumped to more than $1 trillion as its shares more than tripled in value.

That made it the fifth publicly traded US company to join the so-called "Trillion dollar club", along with Apple, Microsoft, Alphabet and Amazon.

Sarah Kunst, the managing director of Cleo Capital, which invests in startups, told the BBC's Today programme that she was fascinated by "the almost mania" around Nvidia.

"They've been making chips for a very long time and it's only really been in the last couple of years that the market has sort of caught on to this," she said.

Nvidia was originally known for making the type of computer chips that process graphics, particularly for computer games.

Now its hardware underpins most AI applications, with one report finding it had cornered 95% of the market for machine learning.

ChatGPT - which generates human-like responses to user queries within seconds - was trained using 10,000 of Nvidia's graphics processing units clustered together in a supercomputer belonging to Microsoft.

AI products are expected to dramatically change how we use computers and the role they play in our lives.
"""
]

In [None]:
print("Some predictions".center(50,"*"))
for i in range(len(articles)):
    prediction = predict(articles[i])
    print(f"Text {i+1}: {prediction}")

*****************Some predictions*****************
Text 1: sports
Text 2: technology


In [None]:
predict(df["news"][2])

'technology'

In [None]:
df["news"][2]

"Hope US won't erect barriers to cooperation: China on blacklist reportChina, in response to reports of US adding Chinese chipmaker SMIC and national offshore oil producer CNOOC to a defence blacklist, said that it hoped US will not erect barriers and obstacles to cooperation. Reuters reported that President Donald Trump's administration is poised to add SMIC and CNOOC to a list of companies allegedly owned or controlled by Chinese military."

In [None]:
model.save_pretrained("/content/Drive/MyDrive/bert_model")
tokenizer.save_pretrained("/content/Drive/MyDrive/bert_model")

('/content/Drive/MyDrive/bert_model/tokenizer_config.json',
 '/content/Drive/MyDrive/bert_model/special_tokens_map.json',
 '/content/Drive/MyDrive/bert_model/vocab.txt',
 '/content/Drive/MyDrive/bert_model/added_tokens.json')

In [None]:
!git lfs install

Git LFS initialized.


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from huggingface_hub import HfFolder
import os
os.environ['HF_AUTH'] = HfFolder().get_token()

In [None]:
!git config --global user.email "mendysergewilson@gmail.com"

In [None]:
!git config --global user.name "serge-wilson"

In [None]:
!git clone https://user:$HF_AUTH@huggingface.co/serge-wilson/news_classification

Cloning into 'news_classification'...
remote: Enumerating objects: 3, done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 3[K
Unpacking objects: 100% (3/3), 425 bytes | 425.00 KiB/s, done.


In [None]:
! cp -r /content/Drive/MyDrive/bert_model/* news_classification

In [None]:
%cd /content/news_classification

/content/news_classification


In [None]:
!git add .
!git commit -m "model deployment"

[main 6ab7179] model deployment
 1 file changed, 8 insertions(+), 8 deletions(-)


In [None]:
!git push

Enumerating objects: 5, done.
Counting objects:  20% (1/5)Counting objects:  40% (2/5)Counting objects:  60% (3/5)Counting objects:  80% (4/5)Counting objects: 100% (5/5)Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects:  33% (1/3)Compressing objects:  66% (2/3)Compressing objects: 100% (3/3)Compressing objects: 100% (3/3), done.
Writing objects:  33% (1/3)Writing objects:  66% (2/3)Writing objects: 100% (3/3)Writing objects: 100% (3/3), 380 bytes | 380.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
To https://huggingface.co/serge-wilson/news_classification
   cd0b4ab..6ab7179  main -> main
