In [4]:
import pandas as pd

# Define the preprocess_text function
def preprocess_text(text):
    # Example: Lowercase the text
    text = text.lower()
    # Add other preprocessing steps as needed
    return text

train_df = pd.read_csv("train.tsv", sep="\t", header=None)
test_df = pd.read_csv("test.tsv", sep="\t", header=None)
val_df = pd.read_csv("valid.tsv", sep="\t", header=None)

# Assign column names
train_df.columns = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"]
test_df.columns = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"]
val_df.columns = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"]

# Apply the preprocessing to the 'statement' column
train_df['statement'] = train_df['statement'].apply(preprocess_text)
test_df['statement'] = test_df['statement'].apply(preprocess_text)
val_df['statement'] = val_df['statement'].apply(preprocess_text)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train = vectorizer.fit_transform(train_df['statement'])
X_test = vectorizer.transform(test_df['statement'])
X_val = vectorizer.transform(val_df['statement'])

## LOGISTIC REGRESSION

In [6]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, train_df['label'])

In [7]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(test_df['label'], y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.24704025256511444


In [8]:
!pip install gensim



In [9]:
import gensim.downloader as api
glove_model = api.load("glove-wiki-gigaword-100") # Choose an appropriate dimension (e.g., 100, 300)



In [10]:
import numpy as np
def get_statement_embedding(statement, model):
    words = statement.split()
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

train_embeddings = train_df['statement'].apply(lambda x: get_statement_embedding(x, glove_model))
test_embeddings = test_df['statement'].apply(lambda x: get_statement_embedding(x, glove_model))
val_embeddings = val_df['statement'].apply(lambda x: get_statement_embedding(x, glove_model))

# Convert to NumPy arrays
X_train_embeddings = np.vstack(train_embeddings)
X_test_embeddings = np.vstack(test_embeddings)
X_val_embeddings = np.vstack(val_embeddings)

In [11]:
!pip install nltk textblob



In [12]:
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

sia = SentimentIntensityAnalyzer()

def get_sentiment_scores(statement):
    vader_scores = sia.polarity_scores(statement)
    textblob_score = TextBlob(statement).sentiment.polarity
    return vader_scores['compound'], textblob_score # Return compound score from VADER and polarity from TextBlob

train_df[['vader_sentiment', 'textblob_sentiment']] = train_df['statement'].apply(get_sentiment_scores).apply(pd.Series)
test_df[['vader_sentiment', 'textblob_sentiment']] = test_df['statement'].apply(get_sentiment_scores).apply(pd.Series)
val_df[['vader_sentiment', 'textblob_sentiment']] = val_df['statement'].apply(get_sentiment_scores).apply(pd.Series)

In [14]:
import re

def get_linguistic_features(statement):
    num_pronouns = len(re.findall(r'\b(i|me|my|mine|you|your|yours|he|him|his|she|her|hers|it|its|we|us|our|ours|they|them|their|theirs)\b', statement, re.IGNORECASE))
    num_punctuation = len(re.findall(r'[.,!?;:]', statement))
    # Add more features as needed (e.g., specific keywords)
    return num_pronouns, num_punctuation

train_df[['num_pronouns', 'num_punctuation']] = train_df['statement'].apply(get_linguistic_features).apply(pd.Series)
test_df[['num_pronouns', 'num_punctuation']] = test_df['statement'].apply(get_linguistic_features).apply(pd.Series)
val_df[['num_pronouns', 'num_punctuation']] = val_df['statement'].apply(get_linguistic_features).apply(pd.Series)

In [15]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer # Importing TfidfVectorizer

# Create TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000) # You might need to adjust max_features
X_train_tfidf = vectorizer.fit_transform(train_df['statement'])
X_test_tfidf = vectorizer.transform(test_df['statement'])
X_val_tfidf = vectorizer.transform(val_df['statement'])

# Assuming X_train_tfidf is your TF-IDF feature matrix
X_train_combined = hstack([X_train_tfidf, X_train_embeddings, train_df[['vader_sentiment', 'textblob_sentiment', 'num_pronouns', 'num_punctuation']].values])
X_test_combined = hstack([X_test_tfidf, X_test_embeddings, test_df[['vader_sentiment', 'textblob_sentiment', 'num_pronouns', 'num_punctuation']].values])
X_val_combined = hstack([X_val_tfidf, X_val_embeddings, val_df[['vader_sentiment', 'textblob_sentiment', 'num_pronouns', 'num_punctuation']].values])

# Standardize numerical features for better model performance
scaler = StandardScaler()
X_train_combined = scaler.fit_transform(X_train_combined.toarray())
X_test_combined = scaler.transform(X_test_combined.toarray())
X_val_combined = scaler.transform(X_val_combined.toarray())

In [16]:
from sklearn.linear_model import LogisticRegression  # Or any other model you want to try
model = LogisticRegression()
model.fit(X_train_combined, train_df['label'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_combined)
accuracy = accuracy_score(test_df['label'], y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(test_df['label'], y_pred))

Accuracy: 0.21468034727703236
              precision    recall  f1-score   support

 barely-true       0.25      0.24      0.24       212
       false       0.26      0.27      0.26       249
   half-true       0.20      0.18      0.19       265
 mostly-true       0.21      0.22      0.22       241
  pants-fire       0.15      0.17      0.16        92
        true       0.19      0.18      0.18       208

    accuracy                           0.21      1267
   macro avg       0.21      0.21      0.21      1267
weighted avg       0.22      0.21      0.21      1267



In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Create a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)  # Increase max_iter if needed

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga']  # Solver algorithms for different penalties
}

# Define the parameter distributions for RandomizedSearchCV (alternative to GridSearchCV)
param_dist = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100],  # or use a distribution like scipy.stats.loguniform(0.01, 100)
    'solver': ['liblinear', 'saga']
}

In [20]:
!pip install transformers



In [21]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'  # Choose an appropriate BERT model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6) # Assuming 6 labels: barely-true, false, half-true, mostly-true, pants-fire, true

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
subset_size = int(len(train_df) * 0.1)  # Use 10% of the training data
train_df_subset = train_df.sample(n=subset_size, random_state=42)  # Randomly select samples

In [26]:
# Tokenize the statements from the subset
train_encodings_subset = tokenizer(train_df_subset['statement'].tolist(), truncation=True, padding=True)

# Create PyTorch dataset using the subset
train_dataset_subset = LiarDataset(train_encodings_subset, train_df_subset['label'].tolist())

In [27]:
# Define the LiarDataset class
class LiarDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Then, you can create your dataset:
train_dataset_subset = LiarDataset(train_encodings_subset, train_df_subset['label'].tolist())

In [28]:
# Tokenize the statements from the subset
train_encodings_subset = tokenizer(train_df_subset['statement'].tolist(), truncation=True, padding=True)

# Create PyTorch dataset using the subset
train_dataset_subset = LiarDataset(train_encodings_subset, train_df_subset['label'].tolist())

In [29]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Tokenize the validation data
val_encodings = tokenizer(val_df['statement'].tolist(), truncation=True, padding=True)
val_dataset = LiarDataset(val_encodings, val_df['label'].tolist())

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    # ... other training arguments ...
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_subset,  # Use the subset for training
    eval_dataset=val_dataset
)

In [30]:
# Bert model on Historical dataset (Liar dataset)
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Assuming your dataframes are named train_df, test_df, and val_df

# 1. Label Encoding
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])

train_df['label_encoded'] = label_encoder.transform(train_df['label'])
test_df['label_encoded'] = label_encoder.transform(test_df['label'])
val_df['label_encoded'] = label_encoder.transform(val_df['label'])

# 2. Create the LiarDataset class
class LiarDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# 3. Tokenize the data and Create Datasets
train_encodings = tokenizer(train_df['statement'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_df['statement'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['statement'].tolist(), truncation=True, padding=True)

train_dataset = LiarDataset(train_encodings, train_df['label_encoded'].tolist())
test_dataset = LiarDataset(test_encodings, test_df['label_encoded'].tolist())
val_dataset = LiarDataset(val_encodings, val_df['label_encoded'].tolist())

# 4. Subset for Faster Training (Optional)
subset_size = int(len(train_df) * 0.1)
train_df_subset = train_df.sample(n=subset_size, random_state=42)
train_encodings_subset = tokenizer(train_df_subset['statement'].tolist(), truncation=True, padding=True)
train_dataset_subset = LiarDataset(train_encodings_subset, train_df_subset['label_encoded'].tolist())

# 5. Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# 6. Create Trainer and Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_subset,
    eval_dataset=val_dataset
)
trainer.train()

# 7. Save the Model
trainer.save_model("./fine_tuned_bert")

# 8. Evaluate the Model
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
accuracy = accuracy_score(test_df['label_encoded'].tolist(), predicted_labels)
print(f"Accuracy: {accuracy}")
print(classification_report(test_df['label_encoded'].tolist(), predicted_labels))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mstrawberrykim[0m ([33mstrawberrykim-amdocs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,1.8357,1.780299
2,1.7248,1.764552
3,1.6754,1.789451


Accuracy: 0.20126282557221783
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       212
           1       0.67      0.02      0.03       249
           2       0.26      0.05      0.09       265
           3       0.20      0.96      0.32       241
           4       0.00      0.00      0.00        92
           5       0.26      0.03      0.05       208

    accuracy                           0.20      1267
   macro avg       0.23      0.18      0.08      1267
weighted avg       0.27      0.20      0.09      1267



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# API INTEGRATION

In [31]:
!pip install Flask



/content/fine_tuned_bert
path of the fine tuned bert model


/content/wandb   wandb

In [32]:
tokenizer.save_pretrained("./fine_tuned_bert")

('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json')

In [33]:
model_path = "./fine_tuned_bert"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [34]:
# Save the fine-tuned model
trainer.save_model("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert") # Save the tokenizer!

('./fine_tuned_bert/tokenizer_config.json',
 './fine_tuned_bert/special_tokens_map.json',
 './fine_tuned_bert/vocab.txt',
 './fine_tuned_bert/added_tokens.json')

In [35]:
# Load your fine-tuned BERT model and tokenizer
model_path = "./fine_tuned_bert"  # Same path for both
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [36]:
%%writefile app.py
from flask import Flask, request, jsonify
from transformers import BertTokenizer, BertForSequenceClassification
import torch

app = Flask(__name__)

# Load your fine-tuned BERT model and tokenizer
model_path = "./fine_tuned_bert"  # Path to your saved model
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    text = data['text']

    # Preprocess the text
    inputs = tokenizer(text, return_tensors="pt")

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits).item()

    # Get label for predicted class (if using LabelEncoder)
    # label = label_encoder.inverse_transform([predicted_class])[0]

    # Return the prediction
    return jsonify({'predicted_class': predicted_class}) #, 'label': label})

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)

Writing app.py


In [37]:
!python app.py

In [39]:
!curl -X POST -H "Content-Type: application/json" -d '{"text": "'"$statement_to_check"'"'}' http://0.0.0.0:5000/predict

/bin/bash: -c: line 1: unexpected EOF while looking for matching `''
/bin/bash: -c: line 2: syntax error: unexpected end of file


In [40]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [41]:
from pyngrok import ngrok

# Authenticate with your ngrok authtoken (optional but recommended)
# You can get your authtoken from your ngrok account dashboard
ngrok.set_auth_token("YOUR_AUTHTOKEN")



In [45]:
from pyngrok import ngrok

# **Replace "YOUR_AUTHTOKEN" with your actual ngrok authtoken**
# You can get your authtoken from your ngrok account dashboard: https://dashboard.ngrok.com/get-started/your-authtoken
ngrok.set_auth_token("PASTE_YOUR_AUTHTOKEN_HERE")

public_url = ngrok.connect(5000)  # Connect to the port your Flask app is running on (5000 in this case)
print(public_url)

In [44]:
!curl -X POST -H "Content-Type: application/json" -d '{"text": "your sample text"}' <public_url>/predict

/bin/bash: line 1: public_url: No such file or directory
