In [25]:
#!pip install pandas
#!pip install numpy
#!pip install joblib
#!pip install transformers
#!pip install scikit-learn
#!pip install keras
#!pip install tensorflow
#!pip install Keras-Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
real_news = pd.read_csv('merged_data_csv.csv')
real_news = real_news[real_news['label'] == 'real']
real_news = real_news.sample(n=20, random_state=42).reset_index(drop=True)
real_news['type'] = 7
display(real_news)

Unnamed: 0,article,label,type
0,was born and raised in the west coast city of ...,real,7
1,The chairman of the business select committee ...,real,7
2,The government has set itself on a collision c...,real,7
3,Golf is expected to retain its status despite ...,real,7
4,In the days since an election that will substa...,real,7
5,In all the hype around Black figuring out whet...,real,7
6,Police are investigating suspected drug after ...,real,7
7,Artistic star spokeswoman the artistic directo...,real,7
8,The president of and this peace laureate has p...,real,7
9,and are understood to have escaped Football As...,real,7


In [4]:
fake_news = pd.read_csv('step1_labeled_fake_data.csv')
fake_news = fake_news.head(20).reset_index(drop=True)
fake_news['type'] = fake_news['type'].map({'bs': 0, 'bias': 1, 'conspiracy': 2, 'hate': 3, 'satire': 4, 'junksci': 5, 'fake': 0})
display(fake_news)

Unnamed: 0,article,label,type
0,In todays of reports on a fake Proof of Life J...,Fake,2
1,On the people will have the opportunity to do ...,Fake,2
2,J This is the continuation of the testimony I ...,Fake,0
3,The great thing about modern technology and fo...,Fake,5
4,on Longtime If All of s s She Could Lose State...,Fake,1
5,More than out in groups of or descended upon t...,Fake,3
6,Mandatory vaccinations are about to open up a ...,Fake,2
7,VOTG continues across today as the electorate ...,Fake,1
8,Tin pot rule and corruption in the United Stat...,Fake,0
9,version Font Size The West and are entering th...,Fake,0


In [5]:
# Combining fake news dataset and real dataset
combined_df = pd.concat([real_news, fake_news], ignore_index=True)
display(combined_df)

Unnamed: 0,article,label,type
0,was born and raised in the west coast city of ...,real,7
1,The chairman of the business select committee ...,real,7
2,The government has set itself on a collision c...,real,7
3,Golf is expected to retain its status despite ...,real,7
4,In the days since an election that will substa...,real,7
5,In all the hype around Black figuring out whet...,real,7
6,Police are investigating suspected drug after ...,real,7
7,Artistic star spokeswoman the artistic directo...,real,7
8,The president of and this peace laureate has p...,real,7
9,and are understood to have escaped Football As...,real,7


In [6]:
# Shuffling data rows
shuffled_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
display(shuffled_df)

Unnamed: 0,article,label,type
0,Among the philanthropies with dynastic names a...,real,7
1,If I ever imagined meeting actor and comedian ...,real,7
2,While battles with its fake news is dealing wi...,real,7
3,Mandatory vaccinations are about to open up a ...,Fake,2
4,In the days since an election that will substa...,real,7
5,Two nights after J Trump won the presidential ...,real,7
6,So heres the ultimate If some rich people can ...,Fake,1
7,VOTG continues across today as the electorate ...,Fake,1
8,on The rubble of the smashed town of Click to ...,Fake,0
9,Police are investigating suspected drug after ...,real,7


In [7]:
shuffled_df['label'] = shuffled_df['label'].map({'Fake': 'fake', 'real': 'real'})
display(shuffled_df)

Unnamed: 0,article,label,type
0,Among the philanthropies with dynastic names a...,real,7
1,If I ever imagined meeting actor and comedian ...,real,7
2,While battles with its fake news is dealing wi...,real,7
3,Mandatory vaccinations are about to open up a ...,fake,2
4,In the days since an election that will substa...,real,7
5,Two nights after J Trump won the presidential ...,real,7
6,So heres the ultimate If some rich people can ...,fake,1
7,VOTG continues across today as the electorate ...,fake,1
8,on The rubble of the smashed town of Click to ...,fake,0
9,Police are investigating suspected drug after ...,real,7


In [8]:
np.set_printoptions(threshold=np.inf)

In [9]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import zipfile
from torch.utils.data import Dataset, DataLoader

# Extract the ZIP file containing the model, tokenizer, and LabelEncoder
with zipfile.ZipFile("model_and_training_process2.zip", "r") as zip_ref:
    zip_ref.extractall("model_and_training_process")

# Load the trained model
model = AutoModelForSequenceClassification.from_pretrained("model_and_training_process/model", num_labels=2)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("model_and_training_process/model")

# Load the LabelEncoder
with open("./model_and_training_process/label_encoder.pkl", "rb") as le_file:
    label_encoder = pickle.load(le_file)

def classify_text(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    # Get predicted label
    predicted_label_id = torch.argmax(outputs.logits).item()
    # Check if predicted label index is within bounds
    if predicted_label_id < len(label_encoder.classes_):
        predicted_label = label_encoder.classes_[predicted_label_id]
    else:
        # Handle out of bounds index 
        predicted_label = "Unknown"
    return predicted_label


# Apply text classification function to the "article" column and store predictions in a new column
shuffled_df["predicted_label"] = shuffled_df["article"].apply(classify_text)


shuffled_df

  from .autonotebook import tqdm as notebook_tqdm
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,article,label,type,predicted_label
0,Among the philanthropies with dynastic names a...,real,7,real
1,If I ever imagined meeting actor and comedian ...,real,7,real
2,While battles with its fake news is dealing wi...,real,7,real
3,Mandatory vaccinations are about to open up a ...,fake,2,fake
4,In the days since an election that will substa...,real,7,real
5,Two nights after J Trump won the presidential ...,real,7,real
6,So heres the ultimate If some rich people can ...,fake,1,fake
7,VOTG continues across today as the electorate ...,fake,1,fake
8,on The rubble of the smashed town of Click to ...,fake,0,fake
9,Police are investigating suspected drug after ...,real,7,real


In [10]:
import joblib
import keras
from keras.preprocessing.sequence import pad_sequences

tokenizer2 = joblib.load("tokenizer2.sav")
model2 = joblib.load("model2.sav")

combined_sample_tokenised = tokenizer2.texts_to_sequences(shuffled_df["article"].tolist())
combined_sample_train_data = pad_sequences(combined_sample_tokenised, maxlen=199)

combined_predicted = model2.predict(combined_sample_train_data, batch_size=1024, verbose=1)

df_combined_pred = pd.DataFrame(combined_predicted)
df_combined_pred = df_combined_pred.where(df_combined_pred!=0).rank(1, ascending=False, method='dense').eq(1).astype(int)

df_combined_pred.columns = ["bias", "bs", "conspiracy", "fake", "hate", "junksci", "satire", "state"]

print(np.array(df_combined_pred).tolist())

df_combined_pred = pd.from_dummies(df_combined_pred)
df_combined_pred.columns = ["predicted_type"]

shuffled_df = pd.concat([shuffled_df, df_combined_pred], axis=1)

shuffled_df


[[0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0

Unnamed: 0,article,label,type,predicted_label,predicted_type
0,Among the philanthropies with dynastic names a...,real,7,real,junksci
1,If I ever imagined meeting actor and comedian ...,real,7,real,bs
2,While battles with its fake news is dealing wi...,real,7,real,hate
3,Mandatory vaccinations are about to open up a ...,fake,2,fake,junksci
4,In the days since an election that will substa...,real,7,real,satire
5,Two nights after J Trump won the presidential ...,real,7,real,hate
6,So heres the ultimate If some rich people can ...,fake,1,fake,bias
7,VOTG continues across today as the electorate ...,fake,1,fake,bias
8,on The rubble of the smashed town of Click to ...,fake,0,fake,junksci
9,Police are investigating suspected drug after ...,real,7,real,satire


In [12]:
shuffled_df.type = shuffled_df.type.replace({0:"bias", 1:"bs",2: "conspiracy", 3:"fake", 4:"hate",5:"junksci", 6:"satire",7: "state"})

In [13]:
shuffled_df

Unnamed: 0,article,label,type,predicted_label,predicted_type
0,Among the philanthropies with dynastic names a...,real,state,real,junksci
1,If I ever imagined meeting actor and comedian ...,real,state,real,bs
2,While battles with its fake news is dealing wi...,real,state,real,hate
3,Mandatory vaccinations are about to open up a ...,fake,conspiracy,fake,junksci
4,In the days since an election that will substa...,real,state,real,satire
5,Two nights after J Trump won the presidential ...,real,state,real,hate
6,So heres the ultimate If some rich people can ...,fake,bs,fake,bias
7,VOTG continues across today as the electorate ...,fake,bs,fake,bias
8,on The rubble of the smashed town of Click to ...,fake,bias,fake,junksci
9,Police are investigating suspected drug after ...,real,state,real,satire


In [14]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [15]:
accuracy_score(shuffled_df.label, shuffled_df.predicted_label)

1.0

In [18]:
print(classification_report(shuffled_df.label, shuffled_df.predicted_label))

              precision    recall  f1-score   support

        fake       1.00      1.00      1.00        20
        real       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [16]:
accuracy_score(shuffled_df.type, shuffled_df.predicted_type)

0.15

In [19]:
print(classification_report(shuffled_df.type, shuffled_df.predicted_type))

              precision    recall  f1-score   support

        bias       0.21      0.43      0.29         7
          bs       0.20      0.25      0.22         8
  conspiracy       0.00      0.00      0.00         3
        fake       0.00      0.00      0.00         1
        hate       0.00      0.00      0.00         0
     junksci       0.25      1.00      0.40         1
      satire       0.00      0.00      0.00         0
       state       0.00      0.00      0.00        20

    accuracy                           0.15        40
   macro avg       0.08      0.21      0.11        40
weighted avg       0.08      0.15      0.10        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
