In [1]:
import pandas as pd
dataset = pd.read_csv("../10_cleaned_data/processed_text.csv")

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english', lowercase=False, ngram_range = (1,1), tokenizer = token.tokenize)
text_counts = cv.fit_transform(dataset['text'])

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, dataset['label'], test_size = 0.25, random_state = 5)

In [4]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)


In [5]:
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print('accuracy score with bag of words multinomial: ' +str('{:04.2f}'.format(accuracy_score*100))+'%')

accuracy score with bag of words multinomial: 86.84%


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
text_count_2 = tfidf.fit_transform(dataset['text'])
x_train, x_test, y_train, y_test = train_test_split(text_count_2, dataset['label'], test_size = 0.25, random_state=5)
MNB.fit(x_train, y_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(x_test), y_test)
print('accuracy score with tf-idf multinomial: ' +str('{:4.2f}'.format(accuracy_score_mnb*100)) + '%')

accuracy score with tf-idf multinomial: 87.45%


In [7]:
from sklearn.naive_bayes import BernoulliNB
#with bag of words encoding
BNB = BernoulliNB()
BNB.fit(X_train, Y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(X_test), Y_test)
print('BNB accuracy with bag of words bernoulli: ' + str('{:4.2f}'.format(accuracy_score_bnb*100)+'%'))
#with tf-idf
BNB = BernoulliNB()
BNB.fit(x_train, y_train)
accuracy_score_bnb = metrics.accuracy_score(BNB.predict(x_test), y_test)
print('BNB accuracy with tf idf bernoulli:  ' + str('{:4.2f}'.format(accuracy_score_bnb*100)+'%'))

BNB accuracy with bag of words bernoulli: 86.22%
BNB accuracy with tf idf bernoulli:  84.88%


In [None]:
def generate_synthetic_data(naive_bayes_model, vocabulary):

    total_samples_required = len(data)
    prior_word_prob = np.exp(naive_bayes_model.feature_log_prob_)

    # Generating Positive Samples - class 1
    pos_sentences = []
    for n in range((total_samples_required // 2)):
        word_list = random.choices(
            vocabulary, prior_word_prob[1], k=random.randint(5, 15)
        )
        pos_sentences.append(" ".join(word_list))
    pos_df = pd.DataFrame({"Title": pos_sentences, "sentiment": 1})

    # Generating Negative Samples - class 0
    neg_sentences = []
    for n in range((total_samples_required // 2)):
        word_list = random.choices(
            vocabulary, prior_word_prob[0], k=random.randint(5, 15)
        )
        neg_sentences.append(" ".join(word_list))
    neg_df = pd.DataFrame({"Title": neg_sentences, "sentiment": 0})
    synthetic_data = pd.concat([pos_df, neg_df])
    synthetic_data.to_csv(
        f'synthetic_data_{dt.now().strftime("%m_%d_%Y_%H_%M_%S")}.csv'
    )

    return synthetic_data

disregard what comes after we probably don't need this now

In [8]:
print(x_train)

  (0, 22044)	0.3148968935422105
  (0, 9693)	0.2825194971254728
  (0, 860)	0.27641665042723856
  (0, 4483)	0.3119469280248466
  (0, 9613)	0.3453605162027167
  (0, 14104)	0.14627218784002882
  (0, 8020)	0.27329191804184777
  (0, 11364)	0.16568354667452698
  (0, 25815)	0.1665806981698989
  (0, 16014)	0.20231825735740416
  (0, 12276)	0.105972495765561
  (0, 9327)	0.17089109784576476
  (0, 21607)	0.18303861108371627
  (0, 25097)	0.11366699560340493
  (0, 2291)	0.2757238537850669
  (0, 4762)	0.10816829035911245
  (0, 17727)	0.09716000188202065
  (0, 10729)	0.19319498884977998
  (0, 3383)	0.11269745729344553
  (0, 4595)	0.11853987444351503
  (0, 23837)	0.09761777115774446
  (0, 12818)	0.0987327159688658
  (0, 16324)	0.10039095157488408
  (0, 12763)	0.2415803911364989
  (1, 664)	0.42900714497315645
  :	:
  (21246, 8068)	0.3505007888400098
  (21246, 5710)	0.3026437514629066
  (21246, 11191)	0.26195474788861206
  (21246, 9039)	0.24708345165092135
  (21246, 26101)	0.278715232774355
  (21246, 1157

In [9]:
print(y_train)

27779    1
7213     0
2186     0
20875    1
24255    1
        ..
3046     0
26301    1
20463    1
18638    1
2915     0
Name: label, Length: 21247, dtype: int64


In [10]:
# first number 0, second number 1
MNB.predict_proba(x_train)

array([[0.34134283, 0.65865717],
       [0.86395327, 0.13604673],
       [0.76390166, 0.23609834],
       ...,
       [0.21459147, 0.78540853],
       [0.52041587, 0.47958413],
       [0.94496501, 0.05503499]])

In [11]:
import numpy as np

z = MNB.predict_proba(x_train)
labels = np.argmax(z, axis=1)
classes = MNB.classes_
labels = [classes[i] for i in labels]



In [18]:
import numpy as np

# only need x_train because it derives y value using argmax on z
def combine(x_train):
   # empty dictionary
    combo = []
    
    z = MNB.predict_proba(x_train)
    keys = np.argmax(z, axis=1)
    values = np.amax(z, axis=1)
    
    # fill list with keys and values
    for i in range(len(keys)):
        combo.append([keys[i], values[i]])

    print(combo)
    
    return combo

In [None]:
def synthetic_data(x_train):
    # empty dictionary
    synthetic = []

    # get combined list of labels and percentages
    combo = combine(x_train)

    for range(len(combo)):
        

    

In [None]:
# call to get synthetic data
synthetic = synthetic_data(x_train)

In [52]:
import numpy as np

# create a dictionary with keys as labels from y vector (0,1) and values are multinomial tf-idf predicted probabilities

def create_synthetic_data(X, y, model, num_samples):
    predicted_prob = {}
    for i in range(2):
        predicted_prob[i] = model.predict_proba(X[y==i])[:,1]
    # create synthetic data
    synthetic_data = []
    synthetic_labels = []
    for i in range(2):
        for j in range(num_samples):
            # select a random label from the original data
            random_label = np.random.choice(y.unique())
            # select a random index from the original data
            random_index = np.random.choice(len(predicted_prob[random_label]))
            # select a random probability from the predicted probabilities of the selected label
            random_prob = np.random.choice(predicted_prob[random_label])
            # if the random probability is greater than the predicted probability of the original data, then flip the label
            if random_prob > predicted_prob[i][random_index]:
                synthetic_labels.append(1-i)
            else:
                synthetic_labels.append(i)
            # append the original data to the synthetic data
            synthetic_data.append(X[y==i].iloc[random_index])
    return synthetic_data, synthetic_labels

In [53]:
# test the synthetic data with the original model
xs_train, xs_test, ys_train, ys_test = train_test_split(******, dataset['label'], test_size = 0.25, random_state=5)
MNB.fit(xs_train, ys_train)
accuracy_score_mnb = metrics.accuracy_score(MNB.predict(xs_test), ys_test)
print('accuracy score with synthetic data and tf-idf multinomial: ' +str('{:4.2f}'.format(accuracy_score_mnb*100)) + '%')

SyntaxError: invalid syntax (1268594570.py, line 2)