In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load data from CSV file
file_path = '/content/drive/MyDrive/data_with_predictions.csv'
df = pd.read_csv(file_path)

# Remove rows where any column has NaN values
df = df.dropna()

# Define features and target for Gaussian Naive Bayes
X =df[['Open ','High ','Low','Shares Traded ', 'Turnover']]
y = df['Prize Action']

# Scale the features for Gaussian Naive Bayes
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets for Gaussian Naive Bayes
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes classifier
clf_nb = BernoulliNB()

# Fit the Gaussian Naive Bayes model
clf_nb.fit(X_train_nb, y_train_nb)
y_pred_nb=clf_nb.predict(X_test_nb)
# Predict probabilities on the test set
y_pred_prob_nb = clf_nb.predict_proba(X_test_nb)[:, 1]  # Probability of the positive class
accuracy = accuracy_score(y_test_nb, y_pred_nb)
precision = precision_score(y_test_nb, y_pred_nb)
recall = recall_score(y_test_nb, y_pred_nb)
f1= f1_score(y_test_nb, y_pred_nb)
# conf_matrix_rf = confusion_matrix(y_test_rf, y_pred_rf)

# Print evaluation metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
# print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 54.76%
Precision: 54.12%
Recall: 51.17%
F1 Score: 52.61%


In [81]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Define the directory path in Google Drive
model_directory = '/content/drive/MyDrive/bert_model'

# Load the BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained(model_directory)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define maximum token limit for BERT
max_length = model.config.max_position_embeddings

# Load data from CSV file
csv_file_path = '/content/drive/MyDrive/dataset_small.csv'
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')


# Define a function to make predictions
def predict(text):
  try:
      inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Check for presence of OOV words
      # if tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) != text.split():  # Assuming tokenization splits on whitespace
      #   print(f"WARNING: Text contains OOV words. Using special tokens.")
      # print(len(inputs))
      outputs = model(**inputs)
      logits = outputs.logits
      predictions = torch.argmax(logits, dim=1).cpu().numpy()
      if predictions[0]==0:
        return predictions[0],'Positive'
      else:
        return predictions[0],'Negative'
  except Exception as e:
        # Print the error value for debugging
        print(f"Error during prediction for text: {text}\nError message: {e}")
        return None  # Or any desired default value


In [83]:
print(len(X_test_nb))

5464


In [84]:
X_lr = pd.DataFrame({
    'GNB_Prob': y_pred_nb,
    'Sentiment': X_test_nb[:, -1]
})

# Split data into training and testing sets for Logistic Regression
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_test_nb, test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
clf_lr = LogisticRegression()

# Fit the Logistic Regression model
clf_lr.fit(X_train_lr, y_train_lr)

# Predict on the test set
y_pred_lr = clf_lr.predict(X_test_lr)

# Evaluate the Logistic Regression model
accuracy_lr = accuracy_score(y_test_lr, y_pred_lr)
precision_lr = precision_score(y_test_lr, y_pred_lr)
recall_lr = recall_score(y_test_lr, y_pred_lr)
f1_lr = f1_score(y_test_lr, y_pred_lr)
conf_matrix_lr = confusion_matrix(y_test_lr, y_pred_lr)

# Print evaluation metrics
print("Logistic Regression Classifier Metrics:")
print(f"Accuracy: {accuracy_lr * 100:.2f}%")
print(f"Precision: {precision_lr * 100:.2f}%")
print(f"Recall: {recall_lr * 100:.2f}%")
print(f"F1 Score: {f1_lr * 100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix_lr}")

Logistic Regression Classifier Metrics:
Accuracy: 55.44%
Precision: 53.27%
Recall: 51.73%
F1 Score: 52.49%
Confusion Matrix:
[[337 236]
 [251 269]]


In [61]:
from collections import Counter

In [87]:
#Adani
text=[
    "View: The Adani short sale puts investor trust in India in doubt",
    "India’s Adani Group loses $48bn in stocks over fraud claims",
    "Adani claims US investment firm’s fraud allegations are an ‘attack on India",
    "Adani group loses $48 billion since January 25"
]
data=np.array([[3300,3342.25,2712,8729,2.630224557]])
data = scaler.fit_transform(data)
sentiment=[]
for i in text:
  x,y=predict(i)
  print(i,":",y)
  sentiment.append(x)
frequency_counter = Counter(sentiment)

max_frequency = max(frequency_counter.values())

final_sentiment = [number for number, frequency in frequency_counter.items() if frequency == max_frequency]

y_pred_nb = clf_nb.predict(data)
y_pred_nb = np.append(y_pred_nb, final_sentiment)
fd=np.array([y_pred_nb])

y_pred_lr = clf_lr.predict(fd)
print("Price_action: ",y_pred_lr[0])

View: The Adani short sale puts investor trust in India in doubt : Negative
India’s Adani Group loses $48bn in stocks over fraud claims : Negative
Adani claims US investment firm’s fraud allegations are an ‘attack on India : Negative
Adani group loses $48 billion since January 25 : Negative
Price_action:  0




In [98]:
#PaytM
text=[
    "RBI move on Paytm Payments Bank may be precursor to licence cancellation",
    "Paytm may get NPCI nod for third-party application license but no relief for banking arm beyond March 15"
]
data=np.array([[525,528,446.65,615676,2.998]])
data = scaler.fit_transform(data)
sentiment=[]
for i in text:
  x,y=predict(i)
  print(i,":",y)
  sentiment.append(x)
frequency_counter = Counter(sentiment)

max_frequency = max(frequency_counter.values())

final_sentiment = [number for number, frequency in frequency_counter.items() if frequency == max_frequency]

y_pred_nb = clf_nb.predict(data)
y_pred_nb = np.append(y_pred_nb, final_sentiment)
fd=np.array([y_pred_nb])

y_pred_lr = clf_lr.predict(fd)
print("Price_action: ",y_pred_lr[0])

RBI move on Paytm Payments Bank may be precursor to licence cancellation : Negative
Paytm may get NPCI nod for third-party application license but no relief for banking arm beyond March 15 : Negative
Price_action:  0


