In [1]:
import pandas as pd

In [2]:
# Mount Google Drive so files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 46.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 69.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [4]:
# Hugging Face Hub identifier; used below to load the tokenizer.
model_name="yiyanghkust/finbert-tone"

In [5]:
from transformers import AutoModelForSequenceClassification
# Load the sequence-classification model from a directory on the mounted Drive.
# NOTE(review): presumably a fine-tuned checkpoint of `model_name` — confirm.
new_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/yiyanghkust finbert-tone/model")

In [6]:
from transformers import AutoTokenizer
# The tokenizer is loaded by Hub name, while the model weights come from Drive.
# NOTE(review): assumes the saved model uses this tokenizer's vocabulary — confirm.
new_tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [7]:
import nltk
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus; preprocessing() reads it via stopwords.words().
nltk.download("stopwords")
def preprocessing(text):
    """Normalize a sentence for classification.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, Porter-stem the remaining
    words, and join them back with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, stemmed sentence as a single space-separated string.
    """
    ps = PorterStemmer()
    # Build the stopword set once per call; constructing it inside the
    # comprehension would reload and rebuild it for every single token.
    english_stopwords = set(stopwords.words("english"))

    sentence = re.sub('[^a-zA-Z]', ' ', text)  # strip punctuation, digits, etc.
    words = sentence.lower().split()           # lowercase, then tokenize on whitespace
    # Drop stopwords (words carrying little meaning) and stem what remains.
    stemmed = [ps.stem(word) for word in words if word not in english_stopwords]
    return " ".join(stemmed)                   # rebuild the sentence from the kept words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Quick sanity check of the preprocessing step on a sample sentence.
sentence = preprocessing("He went to airport.")
print(sentence)

went airport


In [9]:
import torch
import numpy as np

def get_prediction(text):
    """Classify a single text with the loaded model and print the result.

    The text is run through preprocessing(), tokenized (padded/truncated to
    128 tokens), and passed to new_model. A sigmoid is applied to the logits
    and the index of the highest score is taken as the predicted class.

    Args:
        text: The raw input string.

    Returns:
        1 if the predicted class index is 1 ("True"), otherwise 0 ("Fake").

    Note:
        The printed "probability" is always the score of class index 1, in
        both branches, so a low value accompanies a "Fake" prediction.
    """
    text = preprocessing(text)
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

    # Inference only: disable autograd so no computation graph is built or
    # kept alive for each call.
    with torch.no_grad():
        outputs = new_model(**encoding)

    logits = outputs.logits

    # NOTE(review): sigmoid scores are independent per class and do not sum
    # to 1; the argmax is the same as it would be under softmax.
    probs = torch.sigmoid(logits.squeeze().cpu()).numpy()
    label = np.argmax(probs, axis=-1)  # index of the highest-scoring class

    if label == 1:
        print(f"sentiment: True, probability: {probs[1]}")
        return 1
    else:
        print(f"sentiment: Fake, probability: {probs[1]}")
        return 0

In [10]:
# Smoke-test the prediction helper on a single sentence.
get_prediction("He goes to airport.")

sentiment: True, probability: 0.9936510920524597


1

In [11]:
# Load the evaluation set from Drive. Later cells read the text from column 0,
# the label from column 1, and the label column by name as "Truth".
# NOTE(review): column 0 is presumably named "Haber" (news text) — confirm.
submission = pd.read_csv("/content/drive/MyDrive/submission.csv")

In [12]:
# Run the classifier over every row, collecting predictions in y_pred and
# counting how many match the ground-truth label.
# NOTE: `accuracy` holds the number of correct predictions, not a ratio.
accuracy = 0
y_pred = []
for i in range(submission.shape[0]):
    row = submission.iloc[i]
    # Use explicit positional access: integer keys on a label-indexed Series
    # (row[0], row[1]) are deprecated in pandas.
    text = row.iloc[0]
    truth_value = row.iloc[1]
    print(f"{i}-)")
    result = get_prediction(text)
    y_pred.append(result)
    if result == int(truth_value):
        accuracy += 1
accuracy

0-)
sentiment: True, probability: 0.9937022924423218
1-)
sentiment: Fake, probability: 0.09930296987295151
2-)
sentiment: Fake, probability: 0.08127196133136749
3-)
sentiment: True, probability: 0.9935838580131531
4-)
sentiment: Fake, probability: 0.08820915967226028
5-)
sentiment: True, probability: 0.9936891794204712
6-)
sentiment: True, probability: 0.9937020540237427
7-)
sentiment: Fake, probability: 0.08128499984741211
8-)
sentiment: True, probability: 0.9936916828155518
9-)
sentiment: True, probability: 0.9936816692352295
10-)
sentiment: True, probability: 0.9936502575874329
11-)
sentiment: True, probability: 0.9936901330947876
12-)
sentiment: True, probability: 0.9936849474906921
13-)
sentiment: True, probability: 0.9936406016349792
14-)
sentiment: True, probability: 0.9937107563018799
15-)
sentiment: True, probability: 0.9936342239379883
16-)
sentiment: Fake, probability: 0.08127650618553162
17-)
sentiment: True, probability: 0.9936783909797668
18-)
sentiment: Fake, probability

978

In [13]:
from sklearn.metrics import confusion_matrix

In [14]:
# Ground-truth labels as a plain list, in the same row order as y_pred.
y_true=submission["Truth"].tolist()

In [15]:
# scikit-learn layout: rows are true labels, columns are predicted labels.
conf_matrix=confusion_matrix(y_true, y_pred)
conf_matrix

array([[480,  12],
       [ 10, 498]])

In [16]:
# Pull the four cells out of the 2x2 confusion matrix, one matrix row at a
# time (row = true label, column = predicted label), then report each count.
TN, FP = conf_matrix[0][0], conf_matrix[0][1]
FN, TP = conf_matrix[1][0], conf_matrix[1][1]
for caption, count in (
    ('True Positives:', TP),
    ('True Negatives:', TN),
    ('False Positives:', FP),
    ('False Negatives:', FN),
):
    print(caption, count)

True Positives: 498
True Negatives: 480
False Positives: 12
False Negatives: 10


In [17]:
# Derive the standard evaluation metrics from the confusion-matrix counts
# (TP, TN, FP, FN) computed in the previous cell.
conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))

conf_error_rate = 1- conf_accuracy

conf_sensitivity = (TP / float(TP + FN)) # recall / true-positive rate

conf_specificity = (TN / float(TN + FP)) # true-negative rate

# Precision is the share of predicted positives that are correct:
# TP / (TP + FP). (TN / (TN + FP) is specificity, not precision.)
conf_precision = (TP / float(TP + FP))

# F1 is the harmonic mean of precision and recall.
conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))
print('-'*50)
print(f'Accuracy: {round(conf_accuracy,2)}') 
print(f'Error_rate: {round(conf_error_rate,2)}') 
print(f'Sensitivity: {round(conf_sensitivity,2)}') 
print(f'Specificity: {round(conf_specificity,2)}') 
print(f'Precision: {round(conf_precision,2)}')
print(f'f_1 Score: {round(conf_f1,2)}')

--------------------------------------------------
Accuracy: 0.98
Error_rate: 0.02
Sensitivity: 0.98
Specificity: 0.98
Precision: 0.98
f_1 Score: 0.98
