<a href="https://colab.research.google.com/github/razreshili/toy/blob/main/German_run_on_real_ipynb%22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing and running on a real dataset

In [None]:
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import pipeline
import pandas as pd

## Loading real data from Tobias

In [None]:
PATH_TO_DATA = "/content/dataset.csv"

In [None]:
# PATH_TO_DATA = "../datasets/20231109_cleaned_news_data_from_en_sortByDate_2023-09-05_to_2023-11-08.csv"

In [None]:
df = pd.read_csv(PATH_TO_DATA)

In [None]:
df.head()

Unnamed: 0,headline,date_publish,language,description,source_domain,article,url,company,rank,api,topic,country,is_opinion,_score,model_input
0,"SAP-CFO greift durch: ""Wir werden jeden Stein ...",2023-11-09 13:09:00,de,Europas größter Softwarekonzern SAP hat im dri...,finanznachrichten.de,Europas größter Softwarekonzern SAP hat im dri...,https://www.finanznachrichten.de/nachrichten-2...,Infineon,8207.0,https://api.newscatcherapi.com/v2/search,finance,DE,False,14.040912,Europe's largest software company SAP surprisi...
1,Haftung bei Unfall in zweispurigem Baustellenb...,2023-10-12 00:00:00,de,Beim Überholen im zweispurigen Baustellenberei...,haufe.de,Bild: Haufe Online Redaktion\n \n \nBeim Überh...,https://www.haufe.de/recht/weitere-rechtsgebie...,Porsche,14831.0,https://api.newscatcherapi.com/v2/search,news,DE,False,14.847401,Things often get tight when overtaking in two-...
2,"Ist das alles gewesen, Porsche Automobil?",2023-11-16 20:32:00,de,"Ist das alles gewesen, Porsche Automobil? () |...",aktiencheck.de,Für die Aktie Porsche Automobil stehen per 15....,https://www.aktiencheck.de/news/Artikel-Ist_al...,Porsche,115991.0,https://api.newscatcherapi.com/v2/search,news,DE,False,14.850629,"Was that all, Porsche Automobile? () | aktienc..."
3,Herando Erfahrungen: Die neuen Cayenne-Modelle...,2023-10-10 08:02:54,de,Kürzlich präsentierte die Luxusmarke Porsche d...,prnews24.com,Prnews24.com\n\n\nKürzlich präsentierte die Lu...,https://www.prnews24.com/403001/herando-erfahr...,Porsche,568988.0,https://api.newscatcherapi.com/v2/search,news,DE,False,14.850753,The luxury brand Porsche recently presented th...
4,Kartellamt gibt Deal zwischen Porsche und VfB ...,2023-11-03 15:47:11,de,Der Einstieg von Porsche beim Fußball-Bundesli...,augsburger-allgemeine.de,Der Einstieg von Porsche beim Fußball-Bundesli...,https://www.augsburger-allgemeine.de/sport/fus...,Porsche,7501.0,https://api.newscatcherapi.com/v2/search,news,DE,False,14.973518,Porsche's entry into the Bundesliga soccer clu...


In [None]:
NEWS_COLUMN = "model_input"

In [None]:
df[NEWS_COLUMN].head()

0    Europe's largest software company SAP surprisi...
1    Things often get tight when overtaking in two-...
2    Was that all, Porsche Automobile? () | aktienc...
3    The luxury brand Porsche recently presented th...
4    Porsche's entry into the Bundesliga soccer clu...
Name: model_input, dtype: object

In [None]:
set(df["language"])

{'de', 'en', nan}

### Checking for null values

In [None]:
# Count missing entries in the text column (vectorised, no Python-level loop over rows)
int(df[NEWS_COLUMN].isnull().sum())

0

In [None]:
# Drop rows without text. Rebinding instead of inplace=True avoids mutating the
# frame object that earlier cells already displayed.
df = df.dropna(subset=[NEWS_COLUMN])

In [None]:
len(df)

46193

## Check the length of the column used for sentiment analysis

In [None]:
def check_length(df):
  """Print summary statistics of the text lengths in ``df[NEWS_COLUMN]``.

  The length of every entry is measured with ``len`` (characters for strings).

  :param df: DataFrame containing the column named by ``NEWS_COLUMN``
  :return: single-column DataFrame (column ``NEWS_COLUMN``, fresh 0..n-1 index)
           holding the length of each entry
  """
  # Compute the lengths once and reuse them for both the statistics and the result.
  lengths = df[NEWS_COLUMN].apply(len)
  df_length_of_sentences = pd.DataFrame(list(lengths), columns=[NEWS_COLUMN])
  print(df_length_of_sentences.describe())
  return df_length_of_sentences

In [None]:
check_length(df)

        model_input
count  46193.000000
mean     354.577035
std     1006.912212
min        2.000000
25%      104.000000
50%      160.000000
75%      199.000000
max    27338.000000


Unnamed: 0,model_input
0,195
1,177
2,53
3,97
4,177
...,...
46188,2652
46189,2458
46190,976
46191,791


# Running sentiment prediction

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd


def load_model(saved_model_path, device=None):
    """
    Load a BERT sequence-classification model and its tokenizer.

    :param saved_model_path: path or model identifier passed to ``from_pretrained``
    :param device: device to place the model on; when ``None`` (default) CUDA is
                   used if available, otherwise the CPU
    :return: tuple ``(model, tokenizer)``
    """
    if device is None:
        # Do not assume a GPU runtime: fall back to CPU instead of crashing.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Load the saved model from the given location
    model = BertForSequenceClassification.from_pretrained(saved_model_path)
    model.to(device)
    # The model is only used for prediction here; make inference mode explicit.
    model.eval()
    # Load the tokenizer stored alongside the model
    tokenizer = BertTokenizer.from_pretrained(saved_model_path)
    return model, tokenizer

def split_in_chunks(text, tokenizer, chunksize=512):
    """
    Tokenize ``text`` and split it into fixed-length chunks for a BERT-style model.

    Every chunk holds at most ``chunksize - 2`` content tokens, is wrapped in the
    start/end special tokens and is right-padded to exactly ``chunksize`` ids.

    :param text: the text to tokenize
    :param tokenizer: object exposing
        ``encode_plus(text, add_special_tokens=..., return_tensors='pt')``; its
        ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` attributes are used
        when present, otherwise 101 / 102 / 0 are assumed
    :param chunksize: total length of each chunk, special tokens included (> 2)
    :return: dict with ``'input_ids'`` (int64) and ``'attention_mask'`` (int32),
        each of shape (number of chunks, chunksize)
    :raises ValueError: if ``chunksize`` leaves no room for content tokens
    """
    # Two positions per chunk are reserved for the start and end special tokens.
    content_len = chunksize - 2
    if content_len < 1:
        raise ValueError(f"chunksize must be greater than 2, got {chunksize}")

    def special_id(attr, default):
        # Prefer the tokenizer's own id; fall back to the previously hard-coded one.
        value = getattr(tokenizer, attr, None)
        return default if value is None else value

    cls_id = special_id('cls_token_id', 101)
    sep_id = special_id('sep_token_id', 102)
    pad_id = special_id('pad_token_id', 0)

    # tokenize without special tokens; they are added per chunk below
    tokens = tokenizer.encode_plus(text, add_special_tokens=False, return_tensors='pt')
    ids = tokens['input_ids'][0].long()
    mask = tokens['attention_mask'][0].long()

    if ids.numel() == 0:
        # Empty text still yields one chunk made of special tokens and padding.
        id_pieces, mask_pieces = [ids], [mask]
    else:
        id_pieces, mask_pieces = ids.split(content_len), mask.split(content_len)

    input_id_chunks, mask_chunks = [], []
    for id_piece, mask_piece in zip(id_pieces, mask_pieces):
        # padding needed for the last, "incomplete" chunk (0 for full chunks)
        pad_len = content_len - id_piece.shape[0]
        input_id_chunks.append(torch.cat([
            ids.new_tensor([cls_id]),
            id_piece,
            ids.new_tensor([sep_id]),
            ids.new_full((pad_len,), pad_id),
        ]))
        # special tokens are attended to, padding is not
        mask_chunks.append(torch.cat([
            mask.new_ones(1),
            mask_piece,
            mask.new_ones(1),
            mask.new_zeros(pad_len),
        ]))

    # put in the right format for the BERT model
    input_dict = {
        'input_ids': torch.stack(input_id_chunks).long(),
        'attention_mask': torch.stack(mask_chunks).int()
    }
    return input_dict

def apply_the_model(input_dict, model):
    """
    Run the model on all chunks of one text and return a single sentiment label.

    The class probabilities of the individual chunks are averaged and the class
    with the highest mean probability is returned.

    :param input_dict: dict of tensors (e.g. 'input_ids', 'attention_mask') with
        one row per chunk, passed to the model as keyword arguments
    :param model: callable model returning an object with a ``logits`` attribute;
        it is moved to CUDA when available, otherwise to the CPU
    :return: "positive", "negative" or "neutral"
    """
    # Do not assume a GPU runtime: fall back to CPU instead of crashing.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    input_dict = {key: value.to(device) for key, value in input_dict.items()}
    with torch.no_grad():
        outputs = model(**input_dict)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # average over the chunk dimension, then pick the most likely class
    mean = probs.mean(dim=0)
    result = torch.argmax(mean).item()
    # NOTE(review): presumably mirrors the id2label order of the loaded checkpoint
    # (ProsusAI/finbert) - verify against model.config.id2label when swapping models.
    map_the_label = {0: "positive", 1: "negative", 2: "neutral"}
    return map_the_label[result]

def analyze_sentiment_batch(texts, model, tokenizer, chunksize=512):
    """
    Predict a sentiment label for every text in ``texts``.

    Each text is chunked with ``split_in_chunks`` and classified on its own with
    ``apply_the_model``.

    :param texts: iterable of texts to classify
    :param model: model handed to ``apply_the_model``
    :param tokenizer: tokenizer handed to ``split_in_chunks``
    :param chunksize: chunk length handed to ``split_in_chunks``
    :return: list of dicts, one per text, with the text under ``NEWS_COLUMN`` and
        the predicted label under 'sentiment'
    """
    def classify(single_text):
        # chunk first, then reduce the chunk predictions to one label
        chunks = split_in_chunks(single_text, tokenizer, chunksize)
        return apply_the_model(chunks, model)

    return [
        {NEWS_COLUMN: entry, 'sentiment': classify(entry)}
        for entry in texts
    ]

In [None]:
# Load the FinBERT model and its tokenizer by model identifier
model, tokenizer = load_model("ProsusAI/finbert")


# Number of rows handled per iteration of the prediction loop, i.e. how often
# results are written to disk. Texts are still passed to the model one at a time
# (see analyze_sentiment_batch), so this is not a model batch size.
batch_size = 100


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
for i in range(0, len(df), batch_size):
    # Extract a batch of texts
    batch_texts = df[NEWS_COLUMN].iloc[i:i + batch_size].tolist()

    # Run sentiment analysis on the batch
    batch_predictions = analyze_sentiment_batch(batch_texts, model, tokenizer, chunksize=512)

    # Save the results incrementally (replace 'results.csv' with your desired file path).
    # The first batch truncates the file and writes the header; later batches only
    # append rows, so the CSV has a single header line and no stale rows from an
    # earlier run. The per-batch index (always 0..batch_size-1) is not written.
    results_df = pd.DataFrame(batch_predictions)
    is_first_batch = i == 0
    results_df.to_csv(
        'results.csv',
        mode='w' if is_first_batch else 'a',
        header=is_first_batch,
        index=False,
    )

    # Print progress
    print(f"Processed {i + len(batch_texts)} rows out of {len(df)}")

Processed 100 rows out of 46193
Processed 200 rows out of 46193
Processed 300 rows out of 46193
Processed 400 rows out of 46193
Processed 500 rows out of 46193
Processed 600 rows out of 46193
Processed 700 rows out of 46193
Processed 800 rows out of 46193
Processed 900 rows out of 46193
Processed 1000 rows out of 46193
Processed 1100 rows out of 46193
Processed 1200 rows out of 46193
Processed 1300 rows out of 46193
Processed 1400 rows out of 46193
Processed 1500 rows out of 46193
Processed 1600 rows out of 46193
Processed 1700 rows out of 46193
Processed 1800 rows out of 46193
Processed 1900 rows out of 46193
Processed 2000 rows out of 46193
Processed 2100 rows out of 46193
Processed 2200 rows out of 46193
Processed 2300 rows out of 46193
Processed 2400 rows out of 46193
Processed 2500 rows out of 46193
Processed 2600 rows out of 46193
Processed 2700 rows out of 46193
Processed 2800 rows out of 46193
Processed 2900 rows out of 46193
Processed 3000 rows out of 46193
Processed 3100 rows

In [None]:
# Read back the predictions written by the loop above. DataFrames have no `.read`
# method; the CSV reader lives on the pandas module, and the loop writes 'results.csv'.
res = pd.read_csv("results.csv")