#RR with 20newsgroup

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import os

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight vocabulary terms per class by relative risk and partition them.

    For each class k and each vocabulary term t two smoothed ratios are computed
    from document frequencies (a document counts once per whitespace token):
        P(t|k)     = (docs of k containing t + 1) / (docs of k + vocabulary size)
        P(t|not k) = (other docs containing t + 1) / (other docs + vocabulary size)
    A term is kept for class k only when |P(t|k) - P(t|not k)| > threshold.

    Parameters:
    - L: List of (document, label) tuples.
    - classes: Iterable of class labels.
    - threshold: Minimum absolute difference between the two ratios.

    Returns:
    - term_weights_class: term_weights_class[k][term] is the larger ratio divided
      by the smaller one (relative risk, always > 1 for kept terms).
    - partitioned_terms: partitioned_terms[k]['greater'] holds terms with
      P(t|k) > P(t|not k); partitioned_terms[k]['less_equal'] holds the others.
    - feature_names: Vocabulary produced by CountVectorizer on the documents.
    """
    texts, doc_labels = zip(*L)

    # The vectorizer is only needed for its vocabulary.
    # NOTE(review): the vocabulary is tokenised by CountVectorizer while the
    # document frequencies below use str.split(); confirm this is intended.
    vocabulary = CountVectorizer().fit(texts).get_feature_names_out()

    in_class_df = {k: Counter() for k in classes}
    out_class_df = {k: Counter() for k in classes}
    for text, doc_label in L:
        for token in set(text.split()):
            for k in classes:
                target = in_class_df if doc_label == k else out_class_df
                target[k][token] += 1

    n_docs = len(texts)
    n_vocab = len(vocabulary)

    term_weights_class = defaultdict(dict)
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}

    for k in classes:
        n_in = doc_labels.count(k)
        n_out = n_docs - n_in

        for term in vocabulary:
            p_in = (in_class_df[k][term] + 1) / (n_in + n_vocab)
            p_out = (out_class_df[k][term] + 1) / (n_out + n_vocab)

            if not abs(p_in - p_out) > threshold:
                continue

            if p_in > p_out:
                side, ratio = 'greater', p_in / p_out
            else:
                side, ratio = 'less_equal', p_out / p_in
            partitioned_terms[k][side].add(term)
            term_weights_class[k][term] = ratio

    return term_weights_class, partitioned_terms, vocabulary

def create_document_vectors(L, significant_terms):
    """
    Build one binary presence vector per document.

    Parameters:
    - L: List of (document, label) tuples; the label is ignored.
    - significant_terms: Ordered sequence of terms; position i of every vector
      tells whether significant_terms[i] is a whitespace token of the document.

    Returns:
    - List of lists of 0/1, one inner list per document, in the order of L.
    """
    document_vectors = []
    for text, _label in L:
        present = set(text.split())
        document_vectors.append([int(term in present) for term in significant_terms])
    return document_vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Score every document against every class, separately for the two term sets.

    For class k the 'greater' score is the sum of the weights of the terms of
    partitioned_terms[k]['greater'] present in the document, divided by the
    number of significant terms present in the document; the 'less_equal' score
    is computed the same way from partitioned_terms[k]['less_equal'].
    Both scores are 0.0 when the document contains no significant term.

    Every class listed in partitioned_terms receives an entry, including classes
    that have no weighted term at all, so that callers indexing
    scores[doc_index][k]['greater'] for each class never hit a missing key.

    Parameters:
    - document_vectors: List of binary vectors aligned with significant_terms.
    - term_weights_class: term_weights_class[k][term] is the weight of term for k.
    - significant_terms: Ordered list of terms defining the vector columns.
    - partitioned_terms: partitioned_terms[k] has the sets 'greater' and 'less_equal'.

    Returns:
    - scores: scores[doc_index][k] is {'greater': float, 'less_equal': float}.
    - term_index: Mapping term -> column index in the document vectors.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    term_index = {term: i for i, term in enumerate(significant_terms)}

    def weighted_presence(vector, terms, weights):
        # Sum of the weights of the given terms that occur in the document.
        return sum(vector[term_index[term]] * weights[term] for term in terms if term in term_index)

    for doc_index, vector in enumerate(document_vectors):
        # Number of significant terms present; identical for every class.
        denominator = sum(vector)

        for k, term_sets in partitioned_terms.items():
            # A class without any significant term has no weight entry.
            weights = term_weights_class.get(k, {})
            numerator_greater = weighted_presence(vector, term_sets['greater'], weights)
            numerator_less_equal = weighted_presence(vector, term_sets['less_equal'], weights)

            if denominator > 0:
                scores[doc_index][k]['greater'] = numerator_greater / denominator
                scores[doc_index][k]['less_equal'] = numerator_less_equal / denominator
            else:
                scores[doc_index][k]['greater'] = 0.0
                scores[doc_index][k]['less_equal'] = 0.0

    return scores, term_index

def learn_discriminant_params(scores, labels, classes):
    """
    Fit one binary logistic regression per class on the two document scores.

    For class k every document contributes the feature row
    [scores[doc][k]['greater'], scores[doc][k]['less_equal']] and the target
    1 if its label is k, else 0.

    Parameters:
    - scores: Mapping doc_index -> class -> {'greater', 'less_equal'} scores.
    - labels: Sequence of labels indexed by doc_index.
    - classes: Iterable of class labels.

    Returns:
    - alpha_k: alpha_k[k] is the fitted coefficient pair for class k.
    - alpha_0: alpha_0[k] is the fitted intercept for class k.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # Build feature row and target together, document by document.
        rows = [
            ([doc_scores[k]['greater'], doc_scores[k]['less_equal']],
             1 if labels[doc_index] == k else 0)
            for doc_index, doc_scores in scores.items()
        ]
        features = np.array([row for row, _ in rows])
        targets = np.array([target for _, target in rows])

        model = LogisticRegression(fit_intercept=True)
        model.fit(features, targets)
        alpha_k[k] = model.coef_[0]
        alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Predict the class of one document from its binary term vector.

    For every class k the two scores are computed as in calculate_scores and
    combined into the linear discriminant
        f_k(x) = alpha_k[k][0] * score_greater + alpha_k[k][1] * score_less_equal + alpha_0[k]
    The class with the largest f_k(x) is returned.

    The second coefficient is added, not subtracted: alpha_k[k] is the
    coefficient vector of a logistic regression fitted on the feature pair
    [greater, less_equal], so it already carries the learned sign.

    Parameters:
    - doc_vector: Binary vector aligned with term_index.
    - term_weights_class: term_weights_class[k][term] is the weight of term for k.
    - significant_terms: Unused; kept for interface compatibility.
    - partitioned_terms: partitioned_terms[k] has the sets 'greater' and 'less_equal'.
    - alpha_k: Per-class coefficient pair.
    - alpha_0: Per-class intercept.
    - term_index: Mapping term -> position in doc_vector.
    - classes: Iterable of candidate class labels.

    Returns:
    - The label with the highest discriminant value (first one on ties).
    """
    # Number of significant terms present; identical for every class.
    denominator = sum(doc_vector)
    doc_scores = {}

    for k in classes:
        score_k = 0.0
        score_not_k = 0.0

        if denominator > 0:
            weights = term_weights_class.get(k, {})
            numerator_greater = sum(
                doc_vector[term_index[term]] * weights[term]
                for term in partitioned_terms[k]['greater'] if term in term_index
            )
            numerator_less_equal = sum(
                doc_vector[term_index[term]] * weights[term]
                for term in partitioned_terms[k]['less_equal'] if term in term_index
            )
            score_k = numerator_greater / denominator
            score_not_k = numerator_less_equal / denominator

        doc_scores[k] = alpha_k[k][0] * score_k + alpha_k[k][1] * score_not_k + alpha_0[k]

    return max(doc_scores, key=doc_scores.get)

def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Return the fraction of documents in L whose predicted label is correct.

    Each document is turned into a binary vector over significant_terms
    (whitespace tokenisation) and classified with classify_document.

    Parameters:
    - L: List of (document, true_label) tuples.
    - term_weights_class, classes, alpha_k, alpha_0, significant_terms,
      partitioned_terms, term_index: Passed through to classify_document.

    Returns:
    - Accuracy in [0, 1]; 0.0 when L is empty (the original division by
      len(L) raised ZeroDivisionError in that case).
    """
    total = len(L)
    if total == 0:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        predicted_label = classify_document(
            doc_vector, term_weights_class, significant_terms, partitioned_terms,
            alpha_k, alpha_0, term_index, classes,
        )
        if predicted_label == true_label:
            correct += 1

    return correct / total

def plot_feature_space(scores, labels, alpha_k, alpha_0, classes):
    """
    Scatter the documents in the two-score space and draw each class boundary.

    For every class k each document is placed at
    x = its 'less_equal' score (Score^C/k) and y = its 'greater' score (Score^k),
    matching the axis labels. Members of k use marker 'o', all other documents
    marker 'x', both in the colour assigned to k.

    The dashed line is the zero level of the logistic-regression discriminant
        alpha_k[k][0] * greater + alpha_k[k][1] * less_equal + alpha_0[k] = 0
    solved for the y-axis quantity. It is skipped when alpha_k[k][0] is zero,
    because the boundary is then vertical and cannot be written as y(x).

    Colours come from the 20-entry 'tab20' colormap and wrap around, so any
    number of classes can be plotted (the former fixed list of three colours
    raised IndexError from the fourth class on).

    Parameters:
    - scores: Mapping doc_index -> class -> {'greater', 'less_equal'} scores.
    - labels: Sequence of labels in the same order as the entries of scores.
    - alpha_k: Per-class coefficient pair (greater, less_equal).
    - alpha_0: Per-class intercept.
    - classes: Iterable of class labels to draw.
    """
    plt.figure(figsize=(10, 6))

    palette = plt.get_cmap('tab20')
    member_marker, other_marker = 'o', 'x'

    for idx, k in enumerate(classes):
        color = palette(idx % palette.N)

        # Column 0 -> x-axis (less_equal), column 1 -> y-axis (greater).
        points = np.array([[doc_scores[k]['less_equal'], doc_scores[k]['greater']] for doc_scores in scores.values()])
        if points.size == 0:
            continue
        is_member = np.array([label == k for label in labels], dtype=bool)

        plt.scatter(points[is_member, 0], points[is_member, 1], label=f'Class {k}', color=color, marker=member_marker)
        plt.scatter(points[~is_member, 0], points[~is_member, 1], label=f'Not Class {k}', color=color, marker=other_marker)

        # Plot the discriminating line
        coef_greater, coef_less_equal = alpha_k[k][0], alpha_k[k][1]
        if coef_greater != 0:
            x_vals = np.linspace(points[:, 0].min(), points[:, 0].max(), 100)
            y_vals = -(coef_less_equal * x_vals + alpha_0[k]) / coef_greater
            plt.plot(x_vals, y_vals, label=f'Discriminant for Class {k}', linestyle='--', color=color)

    plt.xlabel('Score^C/k(x)')
    plt.ylabel('Score^k(x)')
    plt.legend()
    plt.title('Feature Space with Decision Boundaries')
    plt.show()

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predict a label for every document of U.

    Parameters:
    - U: List of (document, anything) tuples; the second element is ignored.
    - term_weights_class, classes, alpha_k, alpha_0, significant_terms,
      partitioned_terms, term_index: Passed through to classify_document.

    Returns:
    - List of predicted labels, in the order of U.
    """
    def predict(text):
        # Binary presence vector over the significant terms, then classify.
        present = set(text.split())
        doc_vector = [1 if term in present else 0 for term in significant_terms]
        return classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)

    return [predict(doc) for doc, _ in U]
def load_20newsgroups_data(data_dir):
    """
    Load documents and labels from a directory with one sub-folder per class.

    Folders and files are visited in sorted order and the class list is
    returned sorted, so the result is the same on every run (the class list
    was previously built from a set, whose order is not stable across runs).
    Entries inside a class folder that are not regular files are skipped
    instead of crashing open().

    Parameters:
    - data_dir: Path of the root directory; each sub-directory name is a label.

    Returns:
    - L: List of (document_text, class_label) tuples.
    - classes: Sorted list of class labels (the sub-directory names).
    """
    L = []
    class_folders = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue
            # Undecodable bytes are dropped; the platform default encoding is used.
            with open(file_path, 'r', errors='ignore') as file:
                L.append((file.read(), class_label))

    return L, class_folders

# Driver: train on the 20 Newsgroups training folder, report accuracy on those
# same documents, plot the score space, then predict labels for the test folder.

# Train data: one sub-directory per class.
data_dir = "20news-bydate/20news-bydate-train"
L, classes = load_20newsgroups_data(data_dir)

# Per-class term weights, the 'greater'/'less_equal' term sets and the vocabulary.
term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Union of the significant terms of all classes; the order of the resulting
# list defines the columns of every document vector built below.
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Binary presence vectors of the training documents.
document_vectors = create_document_vectors(L, significant_terms)

# Two scores per document and class, plus the term -> column mapping.
scores, term_index = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)

# One logistic regression per class on the two scores.
labels = [label for _, label in L]
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, classes)

# Accuracy is measured on L, i.e. on the documents used for training.
accuracy = evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

plot_feature_space(scores, labels, alpha_k, alpha_0, classes)

# Test data. NOTE: this rebinds `classes` to the folders of the test directory;
# the labels loaded with U are ignored by test_classifier.
data_dir = "20news-bydate/20news-bydate-test"
U, classes = load_20newsgroups_data(data_dir)

predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print("Predicted labels for unlabeled documents:", predicted_labels)

# # Print term weights for each class
# for k in classes:
#     print(f"Class {k} term weights:")
#     for term, weight in term_weights_class[k].items():
#         print(f"  {term}: {weight}")

# # Print partitioned terms for each class
# for k in classes:
#     print(f"Class {k} partitioned terms:")
#     print("  Terms with P_t_given_k > P_t_given_not_k:")
#     print(f"    {partitioned_terms[k]['greater']}")
#     print("  Terms with P_t_given_k <= P_t_given_not_k:")
#     print(f"    {partitioned_terms[k]['less_equal']}")

# # Print scores for each document for each class
# print("Scores:")
# for doc_index, doc_scores in scores.items():
#     print(f"Document {doc_index} scores:")
#     for k, k_scores in doc_scores.items():
#         print(f"  Class {k} scores:")
#         print(f"    Greater set score: {k_scores['greater']}")
#         print(f"    Less_equal set score: {k_scores['less_equal']}")


# **START FROM HERE**

#Log of RR with conditional probability

In [None]:
#Log of RR with conditional prob, Z^k, Z^(C/k), significant terms ONLY, Score^k(x), Score^C/k(x)
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight vocabulary terms per class by the log of the relative risk and
    partition them into Z^k ('greater') and Z^(C/k) ('less_equal').

    For each class k and vocabulary term t two smoothed ratios are computed
    from document frequencies (a document counts once per whitespace token):
        P_t_given_k     = (docs of k containing t + 1) / (docs of k + vocabulary size)
        P_t_given_not_k = (other docs containing t + 1) / (other docs + vocabulary size)
    A term is kept for class k only when |P_t_given_k - P_t_given_not_k| > threshold;
    every kept term is printed together with both ratios.

    Parameters:
    - L: List of tuples (document, label) representing the labeled documents.
    - classes: List of possible class labels.
    - threshold: Minimum absolute difference between the two ratios.

    Returns:
    - term_weights_class: term_weights_class[k][term] is the natural log of the
      larger ratio divided by the smaller one.
    - partitioned_terms: partitioned_terms[k]['greater'] holds the terms with
      P_t_given_k > P_t_given_not_k, partitioned_terms[k]['less_equal'] the others.
    - feature_names: Vocabulary produced by CountVectorizer on the documents.
    """
    texts, doc_labels = zip(*L)

    # Only the vocabulary of the vectorizer is used.
    vocabulary = CountVectorizer().fit(texts).get_feature_names_out()

    term_weights_class = defaultdict(dict)
    # 'greater' is the label for Z^k and 'less_equal' the label for Z^(C/k).
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}

    # Document frequency of each token inside and outside every class.
    df_inside = {k: Counter() for k in classes}
    df_outside = {k: Counter() for k in classes}
    for text, doc_label in L:
        for token in set(text.split()):
            for k in classes:
                counters = df_inside if doc_label == k else df_outside
                counters[k][token] += 1

    n_docs = len(texts)
    n_vocab = len(vocabulary)

    for k in classes:
        n_inside = doc_labels.count(k)
        n_outside = n_docs - n_inside

        for term in vocabulary:
            # Add-one smoothing with the vocabulary size in the denominator.
            p_k = (df_inside[k][term] + 1) / (n_inside + n_vocab)
            p_not_k = (df_outside[k][term] + 1) / (n_outside + n_vocab)

            if not abs(p_k - p_not_k) > threshold:
                continue

            print(term," in class ",k," with prob of term in doc of the same class is ",p_k," and prob of term in doc which is not in k is ",p_not_k)
            if p_k > p_not_k:
                partitioned_terms[k]['greater'].add(term)
                term_weights_class[k][term] = np.log(p_k / p_not_k)
            else:
                partitioned_terms[k]['less_equal'].add(term)
                term_weights_class[k][term] = np.log(p_not_k / p_k)

    return term_weights_class, partitioned_terms, vocabulary

def create_document_vectors(L, significant_terms):
    """
    Create binary document vectors marking which significant terms occur.

    Parameters:
    - L: List of tuples (document, label); only the document text is used.
    - significant_terms: Ordered sequence of terms defining the vector columns.

    Returns:
    - document_vectors: One list of 0/1 per document, in the order of L; entry i
      is 1 when significant_terms[i] is a whitespace token of the document.
    """
    def to_vector(text):
        tokens = set(text.split())
        return [1 if term in tokens else 0 for term in significant_terms]

    return [to_vector(doc) for doc, _ in L]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Calculate, for each document and class, one score per term set.

    For class k, score^k(x) ('greater') is the sum of the weights of the terms
    of partitioned_terms[k]['greater'] present in the document divided by the
    number of significant terms present in the document; score^C/k(x)
    ('less_equal') is the same over partitioned_terms[k]['less_equal'].
    Both are 0.0 when the document contains no significant term.

    Every class listed in partitioned_terms receives an entry, including
    classes without any weighted term (they were silently left out before,
    because the loop ran over term_weights_class only).

    Parameters:
    - document_vectors: List of binary vectors aligned with significant_terms.
    - term_weights_class: term_weights_class[k][term] is the weight of term for k.
    - significant_terms: Ordered list of terms defining the vector columns.
    - partitioned_terms: partitioned_terms[k] has the sets 'greater' and 'less_equal'.

    Returns:
    - scores: scores[doc_index][k] is {'greater': float, 'less_equal': float}.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    term_index = {term: i for i, term in enumerate(significant_terms)}

    def weighted_presence(vector, terms, weights):
        # Sum of the weights of the given terms that occur in the document.
        return sum(vector[term_index[term]] * weights[term] for term in terms if term in term_index)

    for doc_index, vector in enumerate(document_vectors):
        # Number of significant terms present; identical for every class.
        denominator = sum(vector)

        for k, term_sets in partitioned_terms.items():
            # A class without any significant term has no weight entry.
            weights = term_weights_class.get(k, {})
            numerator_greater = weighted_presence(vector, term_sets['greater'], weights)
            numerator_less_equal = weighted_presence(vector, term_sets['less_equal'], weights)

            if denominator > 0:
                scores[doc_index][k]['greater'] = numerator_greater / denominator  # score^k(x)
                scores[doc_index][k]['less_equal'] = numerator_less_equal / denominator  # score^C/k(x)
            else:
                scores[doc_index][k]['greater'] = 0.0
                scores[doc_index][k]['less_equal'] = 0.0

    return scores

# Example usage on a six-document toy corpus with three classes.
L = [
    ("term1 term2 term3 term5", 1),
    ("term2 term3 term4", 2),
    ("term2 term4", 1),
    ("term2 term4 term1", 2),
    ("term2 term2 term2 term1", 2),
    ("term1 term2 term4 term5", 3),
    # Add more labeled documents
]
print(L[0])
classes = [1, 2, 3]  # Define the possible class labels

# Per-class term weights, the Z^k / Z^(C/k) partitions and the vocabulary.
term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Collect all significant terms, whether they belong to Z^k or Z^(C/k).
# The order of the resulting list defines the document-vector columns.
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Create document vectors
document_vectors = create_document_vectors(L, significant_terms)

# Calculate scores for each document for each class
scores = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)

# Print term weights for each class
for k in classes:
    print(f"Class {k} term weights:")
    for term, weight in term_weights_class[k].items():
        print(f"  {term}: {weight}")

# Print partitioned terms for each class
for k in classes:
    print(f"Class {k} partitioned terms:")
    print("  Terms with P_t_given_k > P_t_given_not_k:")
    print(f"    {partitioned_terms[k]['greater']}") # 'greater' is Z^k
    print("  Terms with P_t_given_k <= P_t_given_not_k:")
    print(f"    {partitioned_terms[k]['less_equal']}") # 'less_equal' is Z^(C/k)


# Print scores for each document for each class
print("Scores:")
for doc_index, doc_scores in scores.items():
    print(f"Document {doc_index} scores:")
    for k, k_scores in doc_scores.items():
        print(f"  Class {k} scores:")
        print(f"    Greater set score: {k_scores['greater']}")
        print(f"    Less_equal set score: {k_scores['less_equal']}")

# The more categories there are, the more distinct colour markings are needed on the graph (in the paper, red and black mark the two classes, spam and non-spam).
# 'greater' (Score^k) goes on the y-axis and 'less_equal' (Score^C/k) on the x-axis.
# Mark each document on the x-y plane according to its scores for each category.


('term1 term2 term3 term5', 1)
term1  in class  1  with prob of term in doc of the same class is  0.2857142857142857  and prob of term in doc which is not in k is  0.4444444444444444
term2  in class  1  with prob of term in doc of the same class is  0.42857142857142855  and prob of term in doc which is not in k is  0.5555555555555556
term4  in class  1  with prob of term in doc of the same class is  0.2857142857142857  and prob of term in doc which is not in k is  0.4444444444444444
term5  in class  2  with prob of term in doc of the same class is  0.125  and prob of term in doc which is not in k is  0.375
term2  in class  3  with prob of term in doc of the same class is  0.3333333333333333  and prob of term in doc which is not in k is  0.6
term3  in class  3  with prob of term in doc of the same class is  0.16666666666666666  and prob of term in doc which is not in k is  0.3
term5  in class  3  with prob of term in doc of the same class is  0.3333333333333333  and prob of term in doc which is not in k is  0.2

#RR with conditional probability

In [None]:
#RR with conditional prob, Z^k, Z^(C/k), significant terms ONLY, Score^k(x), Score^C/k(x)
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight vocabulary terms per class by the relative risk (no logarithm) and
    partition them into Z^k ('greater') and Z^(C/k) ('less_equal').

    For each class k and vocabulary term t two smoothed ratios are computed
    from document frequencies (a document counts once per whitespace token):
        P_t_given_k     = (docs of k containing t + 1) / (docs of k + vocabulary size)
        P_t_given_not_k = (other docs containing t + 1) / (other docs + vocabulary size)
    A term is kept for class k only when |P_t_given_k - P_t_given_not_k| > threshold;
    every kept term is printed together with both ratios.

    Both partitions now store the plain ratio. The 'less_equal' branch used to
    store np.log(ratio) while the 'greater' branch stored the ratio itself,
    which put the two scores of a document on different scales.

    Parameters:
    - L: List of tuples (document, label) representing the labeled documents.
    - classes: List of possible class labels.
    - threshold: Minimum absolute difference between the two ratios.

    Returns:
    - term_weights_class: term_weights_class[k][term] is the larger ratio
      divided by the smaller one (always > 1 for kept terms).
    - partitioned_terms: partitioned_terms[k]['greater'] holds the terms with
      P_t_given_k > P_t_given_not_k, partitioned_terms[k]['less_equal'] the others.
    - feature_names: Vocabulary produced by CountVectorizer on the documents.
    """
    # Separate documents and labels
    labeled_docs, labels = zip(*L)

    # Only the vocabulary of the vectorizer is needed.
    feature_names = CountVectorizer().fit(labeled_docs).get_feature_names_out()

    term_weights_class = defaultdict(dict)
    # 'greater' is the label for Z^k and 'less_equal' the label for Z^(C/k).
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}

    # Count document frequencies for each term inside and outside each class
    doc_freq_class = {k: Counter() for k in classes}
    doc_freq_other = {k: Counter() for k in classes}

    for doc, label in L:
        for term in set(doc.split()):
            for k in classes:
                counters = doc_freq_class if label == k else doc_freq_other
                counters[k][term] += 1

    total_docs = len(labeled_docs)
    vocab_size = len(feature_names)

    for k in classes:
        num_class_docs = sum(1 for label in labels if label == k)
        num_other_docs = total_docs - num_class_docs
        for term in feature_names:
            # Add-one smoothing with the vocabulary size in the denominator.
            P_t_given_k = (doc_freq_class[k][term] + 1) / (num_class_docs + vocab_size)
            P_t_given_not_k = (doc_freq_other[k][term] + 1) / (num_other_docs + vocab_size)

            if not abs(P_t_given_k - P_t_given_not_k) > threshold:
                continue

            print(term," in class ",k," with prob of term in doc of the same class is ",P_t_given_k," and prob of term in doc which is not in k is ",P_t_given_not_k)
            if P_t_given_k > P_t_given_not_k:
                partitioned_terms[k]['greater'].add(term)
                term_weights_class[k][term] = P_t_given_k / P_t_given_not_k
            else:
                partitioned_terms[k]['less_equal'].add(term)
                # Plain ratio, on the same scale as the 'greater' branch.
                term_weights_class[k][term] = P_t_given_not_k / P_t_given_k

    return term_weights_class, partitioned_terms, feature_names

def create_document_vectors(L, significant_terms):
    """
    Turn every document into a 0/1 vector over the significant terms.

    Parameters:
    - L: List of tuples (document, label); the label is not used.
    - significant_terms: Ordered sequence of terms; it fixes the vector columns.

    Returns:
    - document_vectors: List with one binary list per document, in the order
      of L. A column is 1 when its term is a whitespace token of the document.
    """
    token_sets = (set(doc.split()) for doc, _ in L)
    return [
        [int(term in tokens) for term in significant_terms]
        for tokens in token_sets
    ]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Calculate, for each document and class, one score per term set.

    For class k, score^k(x) ('greater') is the sum of the weights of the terms
    of partitioned_terms[k]['greater'] present in the document divided by the
    number of significant terms present in the document; score^C/k(x)
    ('less_equal') is the same over partitioned_terms[k]['less_equal'].
    Both are 0.0 when the document contains no significant term.

    Every class listed in partitioned_terms receives an entry, including
    classes without any weighted term (they were silently left out before,
    because the loop ran over term_weights_class only).

    Parameters:
    - document_vectors: List of binary vectors aligned with significant_terms.
    - term_weights_class: term_weights_class[k][term] is the weight of term for k.
    - significant_terms: Ordered list of terms defining the vector columns.
    - partitioned_terms: partitioned_terms[k] has the sets 'greater' and 'less_equal'.

    Returns:
    - scores: scores[doc_index][k] is {'greater': float, 'less_equal': float}.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    term_index = {term: i for i, term in enumerate(significant_terms)}

    def weighted_presence(vector, terms, weights):
        # Sum of the weights of the given terms that occur in the document.
        return sum(vector[term_index[term]] * weights[term] for term in terms if term in term_index)

    for doc_index, vector in enumerate(document_vectors):
        # Number of significant terms present; identical for every class.
        denominator = sum(vector)

        for k, term_sets in partitioned_terms.items():
            # A class without any significant term has no weight entry.
            weights = term_weights_class.get(k, {})
            numerator_greater = weighted_presence(vector, term_sets['greater'], weights)
            numerator_less_equal = weighted_presence(vector, term_sets['less_equal'], weights)

            if denominator > 0:
                scores[doc_index][k]['greater'] = numerator_greater / denominator  # score^k(x)
                scores[doc_index][k]['less_equal'] = numerator_less_equal / denominator  # score^C/k(x)
            else:
                scores[doc_index][k]['greater'] = 0.0
                scores[doc_index][k]['less_equal'] = 0.0

    return scores
def load_20newsgroups_data(data_dir):
    """
    Load documents and labels from the 20newsgroups dataset directory.

    Folders and files are visited in sorted order and the class list is
    returned sorted, so the result is the same on every run (the class list
    was previously built from a set, whose order is not stable across runs).
    Entries inside a class folder that are not regular files are skipped
    instead of crashing open().

    Parameters:
    - data_dir: Path to the root directory; each sub-directory name is a label.

    Returns:
    - L: List of tuples (document, label) where document is the text and label is the class.
    - classes: Sorted list of class labels (the sub-directory names).
    """
    L = []
    class_folders = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue
            # Undecodable bytes are dropped; the platform default encoding is used.
            with open(file_path, 'r', errors='ignore') as file:
                L.append((file.read(), class_label))

    return L, class_folders
# Example usage
# NOTE(review): L and classes are not assigned anywhere in this cell, and
# load_20newsgroups_data defined above is never called here; the statements
# below presumably run on whatever an earlier cell left in L and classes.
# Confirm that this is the intended input.



term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Collect all significant terms, whether they belong to Z^k or Z^(C/k).
# The order of the resulting list defines the document-vector columns.
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Create document vectors
document_vectors = create_document_vectors(L, significant_terms)

# Calculate scores for each document for each class
scores = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)

# Print term weights for each class
for k in classes:
    print(f"Class {k} term weights:")
    for term, weight in term_weights_class[k].items():
        print(f"  {term}: {weight}")

# Print partitioned terms for each class
for k in classes:
    print(f"Class {k} partitioned terms:")
    print("  Terms with P_t_given_k > P_t_given_not_k:")
    print(f"    {partitioned_terms[k]['greater']}") # 'greater' is Z^k
    print("  Terms with P_t_given_k <= P_t_given_not_k:")
    print(f"    {partitioned_terms[k]['less_equal']}") # 'less_equal' is Z^(C/k)


# Print scores for each document for each class
print("Scores:")
for doc_index, doc_scores in scores.items():
    print(f"Document {doc_index} scores:")
    for k, k_scores in doc_scores.items():
        print(f"  Class {k} scores:")
        print(f"    Greater set score: {k_scores['greater']}")
        print(f"    Less_equal set score: {k_scores['less_equal']}")

# The more categories there are, the more distinct colour markings are needed on the graph (in the paper, red and black mark the two classes, spam and non-spam).
# 'greater' (Score^k) goes on the y-axis and 'less_equal' (Score^C/k) on the x-axis.
# Mark each document on the x-y plane according to its scores for each category.


2
term1  in class  1  with prob of term in doc of the same class is  0.2857142857142857  and prob of term in doc which is not in k is  0.4444444444444444
term2  in class  1  with prob of term in doc of the same class is  0.42857142857142855  and prob of term in doc which is not in k is  0.5555555555555556
term4  in class  1  with prob of term in doc of the same class is  0.2857142857142857  and prob of term in doc which is not in k is  0.4444444444444444
3
term5  in class  2  with prob of term in doc of the same class is  0.125  and prob of term in doc which is not in k is  0.375
1
term2  in class  3  with prob of term in doc of the same class is  0.3333333333333333  and prob of term in doc which is not in k is  0.6
term3  in class  3  with prob of term in doc of the same class is  0.16666666666666666  and prob of term in doc which is not in k is  0.3
term5  in class  3  with prob of term in doc of the same class is  0.3333333333333333  and prob of term in doc which is not in k is  0.2

#Log of OR with conditional prob

In [None]:
#Log of OR with conditional prob, Z^k, Z^(C/k), significant terms ONLY, Score^k(x), Score^C/k(x)
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight vocabulary terms per class by the log odds ratio and partition them
    into Z^k ('greater') and Z^(C/k) ('less_equal').

    For each class k and vocabulary term t two smoothed ratios are computed
    from document frequencies (a document counts once per whitespace token):
        P_t_given_k     = (docs of k containing t + 1) / (docs of k + vocabulary size)
        P_t_given_not_k = (other docs containing t + 1) / (other docs + vocabulary size)
    A term is kept for class k only when |P_t_given_k - P_t_given_not_k| > threshold.
    Its weight is log(odds of the larger ratio / odds of the smaller ratio),
    where odds(p) = p / (1 - p); when the dividing odds are exactly 0 the log
    of the other odds alone is stored.

    Parameters:
    - L: List of tuples (document, label) representing the labeled documents.
    - classes: List of possible class labels.
    - threshold: Minimum absolute difference between the two ratios.

    Returns:
    - term_weights_class: term_weights_class[k][term] is the log odds ratio.
    - partitioned_terms: partitioned_terms[k]['greater'] holds the terms with
      P_t_given_k > P_t_given_not_k, partitioned_terms[k]['less_equal'] the others.
    - feature_names: Vocabulary produced by CountVectorizer on the documents.
    """
    texts, doc_labels = zip(*L)

    # Only the vocabulary of the vectorizer is used.
    vocabulary = CountVectorizer().fit(texts).get_feature_names_out()

    term_weights_class = defaultdict(dict)
    # 'greater' is the label for Z^k and 'less_equal' the label for Z^(C/k).
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}

    # Document frequency of each token inside and outside every class.
    df_inside = {k: Counter() for k in classes}
    df_outside = {k: Counter() for k in classes}
    for text, doc_label in L:
        for token in set(text.split()):
            for k in classes:
                counters = df_inside if doc_label == k else df_outside
                counters[k][token] += 1

    n_docs = len(texts)
    n_vocab = len(vocabulary)

    for k in classes:
        n_inside = doc_labels.count(k)
        n_outside = n_docs - n_inside

        for term in vocabulary:
            # Add-one smoothing with the vocabulary size in the denominator.
            p_k = (df_inside[k][term] + 1) / (n_inside + n_vocab)
            p_not_k = (df_outside[k][term] + 1) / (n_outside + n_vocab)

            if not abs(p_k - p_not_k) > threshold:
                continue

            odds_k = p_k / (1 - p_k)
            odds_not_k = p_not_k / (1 - p_not_k)

            if p_k > p_not_k:
                partitioned_terms[k]['greater'].add(term)
                weight = np.log(odds_k) if odds_not_k == 0 else np.log(odds_k / odds_not_k)
            else:
                partitioned_terms[k]['less_equal'].add(term)
                weight = np.log(odds_not_k) if odds_k == 0 else np.log(odds_not_k / odds_k)
            term_weights_class[k][term] = weight

    return term_weights_class, partitioned_terms, vocabulary

def create_document_vectors(L, significant_terms):
    """
    Encode every labeled document as a 0/1 presence vector over the significant terms.

    Parameters:
    - L: Iterable of (document, label) pairs; the labels are ignored here.
    - significant_terms: Terms to look for; their iteration order fixes the vector layout.

    Returns:
    - document_vectors: One list of ints per document, where position j is 1 when the
      j-th significant term is among the document's whitespace-separated tokens.
    """
    def encode(text):
        # Set membership keeps each per-term lookup constant time.
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [encode(text) for text, _ in L]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Score every document against every class, separately for the class's 'greater' terms
    and its 'less_equal' terms.

    For one document and one class, each score is the sum of the weights of the terms of
    that set which are present in the document, divided by the number of significant terms
    present in the document (0.0 when none are present).

    Parameters:
    - document_vectors: List of binary vectors, one per document, laid out like significant_terms.
    - term_weights_class: Mapping where term_weights_class[k][term] is the weight of the term for class k.
    - significant_terms: Ordered terms defining the positions of the document vectors.
    - partitioned_terms: Mapping where partitioned_terms[k] holds the sets 'greater' and 'less_equal'.

    Returns:
    - scores: Nested mapping where scores[doc_index][k] is a dict with keys 'greater' and 'less_equal'.
    """
    position = {term: idx for idx, term in enumerate(significant_terms)}

    def weighted_hits(vector, weights, terms):
        # Terms without a slot in the document vector contribute nothing.
        return sum(vector[position[t]] * weights[t] for t in terms if t in position)

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            groups = partitioned_terms[k]
            greater_sum = weighted_hits(vector, weights, groups['greater'])
            less_equal_sum = weighted_hits(vector, weights, groups['less_equal'])
            present = sum(vector)

            entry = scores[doc_index][k]
            if present > 0:
                entry['greater'] = greater_sum / present  # score^k(x)
                entry['less_equal'] = less_equal_sum / present  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores

# Toy corpus of (document, label) pairs used to exercise the pipeline
L = [
    ("term1 term2 term3 term5", 1),
    ("term2 term3 term4", 2),
    ("term2 term4", 1),
    ("term2 term4 term1", 2),
    ("term2 term2 term2 term1", 2),
    ("term1 term2 term4 term5", 3),
    # Add more labeled documents
]

classes = [1, 2, 3]  # Possible class labels

term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Union of the significant terms of every class, from both Z^k and Z^(C/k)
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'], partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Binary presence vectors over the significant terms
document_vectors = create_document_vectors(L, significant_terms)

# Per-document, per-class scores
scores = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)

# Report the term weights of each class
for k in classes:
    weights_for_k = term_weights_class[k]
    print(f"Class {k} term weights:")
    for term in weights_for_k:
        print(f"  {term}: {weights_for_k[term]}")

# Report the two term sets of each class ('greater' is Z^k, 'less_equal' is Z^(C/k))
for k in classes:
    print(f"Class {k} partitioned terms:")
    for heading, bucket in (
        ("  Terms with P_t_given_k > P_t_given_not_k:", 'greater'),
        ("  Terms with P_t_given_k <= P_t_given_not_k:", 'less_equal'),
    ):
        print(heading)
        print(f"    {partitioned_terms[k][bucket]}")


# Report the scores of each document for each class
print("Scores:")
for doc_index in scores:
    print(f"Document {doc_index} scores:")
    for k, k_scores in scores[doc_index].items():
        print(f"  Class {k} scores:")
        print(f"    Greater set score: {k_scores['greater']}")
        print(f"    Less_equal set score: {k_scores['less_equal']}")

# Plotting idea: one marker colour per category (the paper uses red and black for its
# two classes, spam and non-spam), so more categories means more colours.
# 'greater' goes on the y-axis and 'less_equal' on the x-axis.
# Each document is placed on the x-y plane according to its scores for each category.


#KL Divergence with conditional probability

In [None]:
#KL-D with conditional prob, Z^k, Z^(C/k), significant terms ONLY, Score^k(x), Score^C/k(x)
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight each discriminative vocabulary term by a Bernoulli KL divergence and split the
    terms of every class into those favouring the class and those favouring its complement.

    For class k and term t the smoothed estimates are
        P(t|k)     = (df_k(t) + 1) / (N_k + V)
        P(t|not k) = (df_not_k(t) + 1) / (N_not_k + V)
    where df counts documents containing t, N counts documents and V is the vocabulary size.
    A term is kept for class k only when |P(t|k) - P(t|not k)| > threshold, and its weight is
        P(t|k) * log(P(t|k) / P(t|not k)) + (1 - P(t|k)) * log((1 - P(t|k)) / (1 - P(t|not k))).
    The function name is kept for compatibility with the other cells; no odds ratio is
    computed in this variant.

    Parameters:
    - L: List of tuples (document, label) representing the labeled documents.
    - classes: List of possible class labels.
    - threshold: Minimum |P(t|k) - P(t|not k)| for a term to be included for class k.

    Returns:
    - term_weights_class: Mapping where term_weights_class[k][term] is the KL weight of the term for class k.
    - partitioned_terms: Mapping where partitioned_terms[k]['greater'] holds the kept terms with
                         P(t|k) > P(t|not k) (Z^k) and partitioned_terms[k]['less_equal'] the
                         remaining kept terms (Z^(C/k)).
    - feature_names: The vocabulary produced by the fitted CountVectorizer.

    Raises:
    - ValueError: If L is empty (the documents and labels cannot be unpacked).
    """
    # Separate documents and labels
    labeled_docs, labels = zip(*L)

    # Only the fitted vocabulary is needed, so the count matrix is never built.
    vectorizer = CountVectorizer()
    vectorizer.fit(labeled_docs)
    feature_names = vectorizer.get_feature_names_out()
    # Count with the vectorizer's own analyzer: a plain doc.split() keeps case and
    # punctuation, so its tokens would not line up with the vocabulary entries.
    analyzer = vectorizer.build_analyzer()

    term_weights_class = defaultdict(dict)
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}  # 'greater' is Z^k, 'less_equal' is Z^(C/k)

    # Document frequencies overall and per class; the complement of a class is the difference.
    doc_freq_all = Counter()
    doc_freq_class = {k: Counter() for k in classes}
    for doc, label in L:
        terms = set(analyzer(doc))
        doc_freq_all.update(terms)
        if label in doc_freq_class:
            doc_freq_class[label].update(terms)

    class_sizes = Counter(labels)
    total_docs = len(labeled_docs)
    vocab_size = len(feature_names)

    for k in classes:
        num_class_docs = class_sizes[k]
        num_other_docs = total_docs - num_class_docs
        in_class = doc_freq_class[k]

        for term in feature_names:
            # Apply add-one Laplace smoothing
            df_k = in_class[term]
            P_t_given_k = (df_k + 1) / (num_class_docs + vocab_size)
            P_t_given_not_k = (doc_freq_all[term] - df_k + 1) / (num_other_docs + vocab_size)

            if abs(P_t_given_k - P_t_given_not_k) > threshold:
                bucket = 'greater' if P_t_given_k > P_t_given_not_k else 'less_equal'
                partitioned_terms[k][bucket].add(term)
                term_weights_class[k][term] = (
                    P_t_given_k * np.log(P_t_given_k / P_t_given_not_k)
                    + (1 - P_t_given_k) * np.log((1 - P_t_given_k) / (1 - P_t_given_not_k))
                )

    return term_weights_class, partitioned_terms, feature_names

def create_document_vectors(L, significant_terms):
    """
    Build one binary presence vector per labeled document.

    Parameters:
    - L: Iterable of (document, label) pairs; only the document text is used.
    - significant_terms: Terms to test for; their iteration order defines the vector positions.

    Returns:
    - document_vectors: List of lists of 0/1 ints, one inner list per document, marking which
      significant terms occur among the document's whitespace-separated tokens.
    """
    token_sets = (set(text.split()) for text, _label in L)
    return [
        [1 if term in tokens else 0 for term in significant_terms]
        for tokens in token_sets
    ]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Compute, for every document and class, one score over the class's 'greater' terms and
    one over its 'less_equal' terms.

    Each score is the summed weight of the terms of that set present in the document,
    divided by how many significant terms the document contains; both scores are 0.0 when
    the document contains none.

    Parameters:
    - document_vectors: List of binary vectors, one per document, laid out like significant_terms.
    - term_weights_class: Mapping where term_weights_class[k][term] is the weight of the term for class k.
    - significant_terms: Ordered terms defining the positions of the document vectors.
    - partitioned_terms: Mapping where partitioned_terms[k] holds the sets 'greater' and 'less_equal'.

    Returns:
    - scores: Nested mapping where scores[doc_index][k] is a dict with keys 'greater' and 'less_equal'.
    """
    slot_of = {term: slot for slot, term in enumerate(significant_terms)}

    def subtotal(vector, weights, terms):
        # Only terms that own a slot in the vector can contribute.
        return sum(vector[slot_of[t]] * weights[t] for t in terms if t in slot_of)

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            term_sets = partitioned_terms[k]
            top_greater = subtotal(vector, weights, term_sets['greater'])
            top_less_equal = subtotal(vector, weights, term_sets['less_equal'])
            hits = sum(vector)

            has_hits = hits > 0
            scores[doc_index][k]['greater'] = top_greater / hits if has_hits else 0.0  # score^k(x)
            scores[doc_index][k]['less_equal'] = top_less_equal / hits if has_hits else 0.0  # score^C/k(x)

    return scores

# Toy corpus of (document, label) pairs used to exercise the pipeline
L = [
    ("term1 term2 term3 term5", 1),
    ("term2 term3 term4", 2),
    ("term2 term4", 1),
    ("term2 term4 term1", 2),
    ("term2 term2 term2 term1", 2),
    ("term1 term2 term4 term5", 3),
    # Add more labeled documents
]

classes = [1, 2, 3]  # Possible class labels

term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Union of the significant terms of every class, from both Z^k and Z^(C/k)
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'], partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Binary presence vectors over the significant terms
document_vectors = create_document_vectors(L, significant_terms)

# Per-document, per-class scores
scores = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)

# Report the term weights of each class
for k in classes:
    weights_for_k = term_weights_class[k]
    print(f"Class {k} term weights:")
    for term in weights_for_k:
        print(f"  {term}: {weights_for_k[term]}")

# Report the two term sets of each class ('greater' is Z^k, 'less_equal' is Z^(C/k))
for k in classes:
    print(f"Class {k} partitioned terms:")
    for heading, bucket in (
        ("  Terms with P_t_given_k > P_t_given_not_k:", 'greater'),
        ("  Terms with P_t_given_k <= P_t_given_not_k:", 'less_equal'),
    ):
        print(heading)
        print(f"    {partitioned_terms[k][bucket]}")


# Report the scores of each document for each class
print("Scores:")
for doc_index in scores:
    print(f"Document {doc_index} scores:")
    for k, k_scores in scores[doc_index].items():
        print(f"  Class {k} scores:")
        print(f"    Greater set score: {k_scores['greater']}")
        print(f"    Less_equal set score: {k_scores['less_equal']}")

# Plotting idea: one marker colour per category (the paper uses red and black for its
# two classes, spam and non-spam), so more categories means more colours.
# 'greater' goes on the y-axis and 'less_equal' on the x-axis.
# Each document is placed on the x-y plane according to its scores for each category.


In [None]:
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def compute_odds_ratio_weights(L, classes, threshold=0.1):
    """
    Weight each discriminative vocabulary term by an odds ratio and split the terms of every
    class into those favouring the class and those favouring its complement.

    For class k and term t the smoothed estimates are
        P(t|k)     = (df_k(t) + 1) / (N_k + V)
        P(t|not k) = (df_not_k(t) + 1) / (N_not_k + V)
    where df counts documents containing t, N counts documents and V is the vocabulary size.
    A term is kept for class k only when |P(t|k) - P(t|not k)| > threshold. With
    odds(p) = p / (1 - p), its weight is odds(P(t|k)) / odds(P(t|not k)) when
    P(t|k) > P(t|not k) and the reciprocal otherwise, i.e. the larger odds is always the
    numerator.

    Parameters:
    - L: List of tuples (document, label) representing the labeled documents.
    - classes: List of possible class labels.
    - threshold: Minimum |P(t|k) - P(t|not k)| for a term to be included for class k.

    Returns:
    - term_weights_class: Mapping where term_weights_class[k][term] is the odds ratio of the term for class k.
    - partitioned_terms: Mapping where partitioned_terms[k]['greater'] holds the kept terms with
                         P(t|k) > P(t|not k) (Z^k) and partitioned_terms[k]['less_equal'] the
                         remaining kept terms (Z^(C/k)).
    - feature_names: The vocabulary produced by the fitted CountVectorizer.

    Raises:
    - ValueError: If L is empty (the documents and labels cannot be unpacked).
    - ZeroDivisionError: If a smoothed probability equals 1, which needs a one-term
      vocabulary whose term occurs in every document of one side.
    """
    # Separate documents and labels
    labeled_docs, labels = zip(*L)

    # Only the fitted vocabulary is needed, so the count matrix is never built.
    vectorizer = CountVectorizer()
    vectorizer.fit(labeled_docs)
    feature_names = vectorizer.get_feature_names_out()
    # Count with the vectorizer's own analyzer: a plain doc.split() keeps case and
    # punctuation, so its tokens would not line up with the vocabulary entries.
    analyzer = vectorizer.build_analyzer()

    term_weights_class = defaultdict(dict)
    partitioned_terms = {k: {'greater': set(), 'less_equal': set()} for k in classes}  # 'greater' is Z^k, 'less_equal' is Z^(C/k)

    # Document frequencies overall and per class; the complement of a class is the difference.
    doc_freq_all = Counter()
    doc_freq_class = {k: Counter() for k in classes}
    for doc, label in L:
        terms = set(analyzer(doc))
        doc_freq_all.update(terms)
        if label in doc_freq_class:
            doc_freq_class[label].update(terms)

    class_sizes = Counter(labels)
    total_docs = len(labeled_docs)
    vocab_size = len(feature_names)

    for k in classes:
        num_class_docs = class_sizes[k]
        num_other_docs = total_docs - num_class_docs
        in_class = doc_freq_class[k]

        for term in feature_names:
            # Apply add-one Laplace smoothing
            df_k = in_class[term]
            P_t_given_k = (df_k + 1) / (num_class_docs + vocab_size)
            P_t_given_not_k = (doc_freq_all[term] - df_k + 1) / (num_other_docs + vocab_size)

            if abs(P_t_given_k - P_t_given_not_k) > threshold:
                # The +1 in both numerators keeps both odds strictly positive, so no
                # zero-odds special case is needed before dividing.
                odds_k = P_t_given_k / (1 - P_t_given_k)
                odds_not_k = P_t_given_not_k / (1 - P_t_given_not_k)

                if P_t_given_k > P_t_given_not_k:
                    partitioned_terms[k]['greater'].add(term)
                    term_weights_class[k][term] = odds_k / odds_not_k
                else:
                    partitioned_terms[k]['less_equal'].add(term)
                    term_weights_class[k][term] = odds_not_k / odds_k

    return term_weights_class, partitioned_terms, feature_names

def create_document_vectors(L, significant_terms):
    """
    Turn each labeled document into a binary vector over the significant terms.

    Parameters:
    - L: Iterable of (document, label) pairs; the labels play no role here.
    - significant_terms: Terms to check; their iteration order is the vector layout.

    Returns:
    - document_vectors: List with one list of 0/1 ints per document; entry j is 1 exactly
      when the j-th significant term is one of the document's whitespace-separated tokens.
    """
    def presence(text):
        seen = set(text.split())
        return [int(term in seen) for term in significant_terms]

    document_vectors = []
    for text, _ in L:
        document_vectors.append(presence(text))
    return document_vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Score every document against every class, separately for the class's 'greater' terms
    and its 'less_equal' terms.

    For one document and one class, each score is the sum of the weights of the terms of
    that set which are present in the document, divided by the number of significant terms
    present in the document (0.0 when none are present).

    Parameters:
    - document_vectors: List of binary vectors, one per document, laid out like significant_terms.
    - term_weights_class: Mapping where term_weights_class[k][term] is the odds ratio of the term for class k.
    - significant_terms: Ordered terms defining the positions of the document vectors.
    - partitioned_terms: Mapping where partitioned_terms[k] holds the sets 'greater' and 'less_equal'.

    Returns:
    - scores: Nested mapping where scores[doc_index][k] is a dict with keys 'greater' and 'less_equal'.
    """
    position = {term: idx for idx, term in enumerate(significant_terms)}

    def weighted_hits(vector, weights, terms):
        # Terms without a slot in the document vector contribute nothing.
        return sum(vector[position[t]] * weights[t] for t in terms if t in position)

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            groups = partitioned_terms[k]
            greater_sum = weighted_hits(vector, weights, groups['greater'])
            less_equal_sum = weighted_hits(vector, weights, groups['less_equal'])
            present = sum(vector)

            entry = scores[doc_index][k]
            if present > 0:
                entry['greater'] = greater_sum / present  # score^k(x)
                entry['less_equal'] = less_equal_sum / present  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores

# Toy corpus of (document, label) pairs used to exercise the pipeline
L = [
    ("term1 term2 term3 term5", 1),
    ("term2 term3 term4", 2),
    ("term2 term4", 1),
    ("term2 term4 term1", 2),
    ("term2 term2 term2 term1", 2),
    ("term1 term2 term4 term5", 3),
    # Add more labeled documents
]

classes = [1, 2, 3]  # Possible class labels

term_weights_class, partitioned_terms, vocab = compute_odds_ratio_weights(L, classes, threshold=0.1)

# Union of the significant terms of every class, from both Z^k and Z^(C/k)
significant_terms = set()
for k in classes:
    significant_terms.update(partitioned_terms[k]['greater'], partitioned_terms[k]['less_equal'])
significant_terms = list(significant_terms)

# Binary presence vectors over the significant terms
document_vectors = create_document_vectors(L, significant_terms)

# Per-document, per-class scores
scores = calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms)


#20NewsGroup Training

In [None]:
#RR until train dataset with 60.714% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text, stop_words=None):
    """
    Lowercase a text and drop its stopwords.

    Args:
        text: The text string to process.
        stop_words: Optional collection of lowercase words to drop. When omitted, the
            NLTK English stopword list is loaded on every call; callers processing many
            documents can load it once and pass it in.

    Returns:
        The lowercased whitespace-separated tokens of `text` that are not stopwords,
        joined by single spaces.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))  # Load English stopwords
    # Tokens are compared after lowercasing, so capitalised stopwords are dropped too.
    kept = (word for word in text.lower().split() if word not in stop_words)
    return " ".join(kept)

def load_20newsgroups_data(data_dir):
    """
    Loads the 20 Newsgroups dataset with stopword removal.

    Every sub-directory of `data_dir` is treated as one class, and every entry inside it
    as one document of that class. Directories and files are visited in sorted order so
    repeated runs produce the same document order and class list.

    Args:
        data_dir: The directory containing the 20 Newsgroups dataset.

    Returns:
        A list of tuples containing preprocessed documents and their categories, and a
        sorted list of unique categories.
    """
    L = []
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # Undecodable bytes are skipped rather than aborting the whole load.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text, not the raw file content.
            L.append((stopword_removal(document_text), class_label))

    return L, list(class_folders)

def create_document_vectors(documents, significant_terms):
    """
    Encode each (document, label) pair as a 0/1 presence vector over significant_terms.

    Entry j of a vector is 1 when the j-th item yielded by iterating significant_terms is
    one of the document's whitespace-separated tokens; the labels are ignored.
    """
    def encode(text):
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [encode(text) for text, _ in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Score every document against every key of term_weights_class and return the scores
    together with the term-to-position index that was used.

    For key k, the 'greater' and 'less_equal' scores are the summed weights of the terms of
    partitioned_terms[k]['greater'] / ['less_equal'] present in the document, divided by
    the number of significant terms present in the document. A missing weight counts as 0,
    a key or set absent from partitioned_terms contributes 0, and a document without any
    significant term scores 0.0.

    Returns:
    - scores: Nested mapping where scores[doc_index][k] has keys 'greater' and 'less_equal'.
    - term_index: Dict mapping each significant term to its position in the vectors.
    """
    position = {term: idx for idx, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    def partial_sum(vector, weights, k, bucket):
        # Keys without the requested term set keep the integer default of 0.
        if k not in partitioned_terms or bucket not in partitioned_terms[k]:
            return 0
        return sum(
            vector[position[t]] * weights.get(t, 0)
            for t in partitioned_terms[k][bucket]
            if t in position
        )

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            top_greater = partial_sum(vector, weights, k, 'greater')  # terms in Z^k
            top_less_equal = partial_sum(vector, weights, k, 'less_equal')  # terms in Z^(C/k)
            present = sum(vector)

            slot = scores[doc_index][k]
            if present > 0:
                slot['greater'] = top_greater / present  # score^k(x)
                slot['less_equal'] = top_less_equal / present  # score^C/k(x)
            else:
                slot['greater'] = 0.0
                slot['less_equal'] = 0.0

    return scores, position

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Predict the class of one document from its binary vector.

    For every class k the two scores are computed as in calculate_scores (summed weights of
    the present 'greater' / 'less_equal' terms divided by the number of present significant
    terms, 0.0 when none is present) and combined into
        f_k(x) = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k].
    Missing weights, classes or term sets contribute 0 instead of raising, and the weight
    mapping is never modified, even when it is a defaultdict.

    Parameters:
    - doc_vector: Binary vector of the document, laid out like significant_terms.
    - term_weights_class: Mapping where term_weights_class[k][term] is the weight of the term for class k.
    - significant_terms: Ordered terms defining the positions of doc_vector.
    - partitioned_terms: Mapping where partitioned_terms[k] holds the sets 'greater' and 'less_equal'.
    - alpha_k: Mapping from class to the pair of score coefficients.
    - alpha_0: Mapping from class to the intercept.
    - term_index: Ignored; the index is rebuilt from significant_terms so it always matches
      the layout of doc_vector.
    - classes: Candidate class labels.

    Returns:
    - (predicted_label, percent): the class with the largest f_k(x), and the constant 170.

    Raises:
    - ValueError: If classes is empty.
    """
    # NOTE(review): 170 has no derivation anywhere in this file. It is returned only
    # because existing callers unpack two values; do not use it to scale any metric.
    percent = 170
    term_index = {term: i for i, term in enumerate(significant_terms)}
    denominator = sum(doc_vector)  # number of significant terms present; same for every class

    def weighted_hits(weights, terms):
        return sum(doc_vector[term_index[term]] * weights.get(term, 0) for term in terms if term in term_index)

    doc_scores = {}
    for k in classes:
        weights = term_weights_class.get(k, {})
        term_sets = partitioned_terms.get(k, {})
        numerator_greater = weighted_hits(weights, term_sets.get('greater', ()))
        numerator_less_equal = weighted_hits(weights, term_sets.get('less_equal', ()))

        if denominator > 0:
            score_k = numerator_greater / denominator
            score_not_k = numerator_less_equal / denominator
        else:
            score_k = 0.0
            score_not_k = 0.0

        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fit one binary logistic regression per class on the two document scores.

    For class k, every document whose score entry contains k contributes the feature row
    [greater, less_equal] (a missing score defaults to 0.0) and the target 1 when its label
    equals k, else 0. When no document has scores for k, or all targets are identical, a
    warning is printed and the coefficients and intercept default to zero.

    Parameters:
    - scores: Mapping where scores[doc_index][k] is a dict with 'greater' and 'less_equal'.
    - labels: Sequence of true labels, indexed by doc_index.
    - classes: Class labels to fit.

    Returns:
    - alpha_k: Mapping from class to the array of the two fitted coefficients.
    - alpha_0: Mapping from class to the fitted intercept.
    """
    alpha_k = {}
    alpha_0 = {}

    for k in classes:
        # Keep only the documents that actually carry scores for this class.
        samples = [(doc_index, doc_scores[k]) for doc_index, doc_scores in scores.items() if k in doc_scores]
        X = np.array([[entry.get('greater', 0.0), entry.get('less_equal', 0.0)] for _, entry in samples])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            alpha_k[k], alpha_0[k] = np.array([0.0, 0.0]), 0.0
        elif len(np.unique(y)) < 2:
            # Logistic regression needs both targets; fall back to neutral parameters.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            alpha_k[k], alpha_0[k] = np.array([0.0, 0.0]), 0.0
        else:
            model = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = model.coef_[0]
            alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measure the classification accuracy on labeled documents.

    Each document is encoded as a binary vector over significant_terms, classified with
    classify_document, and compared with its true label.

    Parameters:
    - L: List of (document, true_label) pairs.
    - term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms,
      term_index: Passed through unchanged to classify_document.

    Returns:
    - accuracy: Percentage (0 to 100) of documents whose predicted label equals the true
      label; 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); only the label is a prediction.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain hit rate: no offset on the hit count and no extra scaling factor.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predict a label for every document of an unlabeled (or held-out) collection.

    Parameters:
    - U: List of (document, label) pairs; the second element is ignored.
    - term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms,
      term_index: Passed through unchanged to classify_document.

    Returns:
    - predicted_labels: List with one predicted class label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); keep the label only.
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-train"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
# NOTE(review): with a threshold of 0 almost every vocabulary term is significant, so the
# dense document vectors below hold one entry per vocabulary term per document. Raise the
# threshold if that does not fit in memory.
threshold = 0
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
total_documents = sum(category_counts[c] for c in categories)
# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # Counts outside a category are the term's total minus the category's own count.
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - category_dict[category] + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj, keyed as term_weights[category][term]: calculate_scores and
# classify_document both look weights up by class first and term second.
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = a_j / b_j
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = b_j / a_j

# Collect every significant term, from Z^k and Z^(C/k) of all categories
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Freeze the order: the list positions define the layout of the document vectors.
significant_terms = list(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


# Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for category, weights in term_weights.items():
#     for term, weight in weights.items():
#         print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 60.714285714285715


In [None]:
#log of RR until train dataset with 62.5% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text, stop_words=None):
    """
    Lowercase a text and drop its stopwords.

    Args:
        text: The text string to process.
        stop_words: Optional collection of lowercase words to drop. When omitted, the
            NLTK English stopword list is loaded on every call; callers processing many
            documents can load it once and pass it in.

    Returns:
        The lowercased whitespace-separated tokens of `text` that are not stopwords,
        joined by single spaces.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))  # Load English stopwords
    # Tokens are compared after lowercasing, so capitalised stopwords are dropped too.
    kept = (word for word in text.lower().split() if word not in stop_words)
    return " ".join(kept)

def load_20newsgroups_data(data_dir):
    """
    Loads the 20 Newsgroups dataset with stopword removal.

    Every sub-directory of `data_dir` is treated as one class, and every entry inside it
    as one document of that class. Directories and files are visited in sorted order so
    repeated runs produce the same document order and class list.

    Args:
        data_dir: The directory containing the 20 Newsgroups dataset.

    Returns:
        A list of tuples containing preprocessed documents and their categories, and a
        sorted list of unique categories.
    """
    L = []
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # Undecodable bytes are skipped rather than aborting the whole load.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text, not the raw file content.
            L.append((stopword_removal(document_text), class_label))

    return L, list(class_folders)

def create_document_vectors(documents, significant_terms):
    """
    Build one binary presence vector per (document, label) pair.

    Position j of a vector is 1 when the j-th item yielded by iterating significant_terms
    occurs among the document's whitespace-separated tokens, else 0; labels are ignored.
    """
    token_sets = (set(text.split()) for text, _label in documents)
    return [
        [1 if term in tokens else 0 for term in significant_terms]
        for tokens in token_sets
    ]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Compute, per document and per class, the two averaged weight scores.

    For every document vector and every class k found in term_weights_class,
    the weights of the class's 'greater' terms and of its 'less_equal' terms
    are summed (each multiplied by the vector entry at the term's position)
    and divided by the sum of the vector. When that sum is not positive, both
    scores are 0.0.

    Args:
        document_vectors: Sequence of vectors indexed by the positions
            assigned from significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms; enumeration order gives each
            term its vector position.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        A tuple (scores, term_index): scores[doc_index][k] holds the
        'greater' and 'less_equal' scores, term_index maps term -> position.
    """
    position_of = dict((term, position) for position, term in enumerate(significant_terms))

    def _weighted_sum(vector, weights, k, side):
        # Classes (or sides) absent from partitioned_terms contribute nothing.
        if k not in partitioned_terms or side not in partitioned_terms[k]:
            return 0
        return sum(
            vector[position_of[term]] * weights.get(term, 0)
            for term in partitioned_terms[k][side]
            if term in position_of
        )

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k
            vector_sum = sum(vector)

            if vector_sum > 0:
                entry = {
                    'greater': greater_total / vector_sum,  # score^k(x)
                    'less_equal': less_equal_total / vector_sum,  # score^C/k(x)
                }
            else:
                entry = {'greater': 0.0, 'less_equal': 0.0}
            scores[doc_index][k].update(entry)

    return scores, position_of

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Pick the class with the highest discriminant value for one document.

    For every class k the weights of its 'greater' and 'less_equal' terms are
    summed (scaled by the document's vector entries) and divided by the sum
    of the vector; the class value is
    alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k].

    Args:
        doc_vector: Vector indexed by the positions assigned from significant_terms.
        term_weights_class: Indexed as term_weights_class[k][term].
        significant_terms: Iterable of terms; enumeration order gives positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Not used; the index is rebuilt from significant_terms.
        classes: Iterable of candidate classes.

    Returns:
        A tuple (predicted_label, percent) where percent is the constant 175.
    """
    # Rebuilt locally; the term_index argument is not consulted.
    positions = dict((term, position) for position, term in enumerate(significant_terms))

    def _side_total(k, side):
        return sum(
            doc_vector[positions[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k][side]
            if term in positions
        )

    percent = 175
    discriminants = {}

    for k in classes:
        greater_total = _side_total(k, 'greater')
        less_equal_total = _side_total(k, 'less_equal')
        vector_sum = sum(doc_vector)

        if vector_sum > 0:
            score_k = greater_total / vector_sum
            score_not_k = less_equal_total / vector_sum
        else:
            score_k = score_not_k = 0.0

        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    # max() keeps the first class on ties, following the order of `classes`.
    best_label = max(discriminants, key=discriminants.get)
    return best_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fit one binary logistic regression per class on the two score features.

    For class k, every document whose score entry contains k contributes the
    feature row [greater, less_equal] (missing keys default to 0.0) and the
    target 1 if labels[doc_index] == k, else 0.

    Args:
        scores: Mapping doc_index -> {class: {'greater': ..., 'less_equal': ...}}.
        labels: Sequence of labels indexed by doc_index.
        classes: Iterable of classes to fit.

    Returns:
        A tuple (alpha_k, alpha_0): alpha_k[k] is the fitted coefficient
        vector and alpha_0[k] the intercept. Classes with no samples, or whose
        targets contain a single value, get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        samples = [
            (doc_index, doc_scores[k])
            for doc_index, doc_scores in scores.items()
            if k in doc_scores
        ]
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            fitted = None
        elif len(np.unique(y)) < 2:
            # The regression needs both a positive and a negative example.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            fitted = None
        else:
            fitted = LogisticRegression(fit_intercept=True).fit(X, y)

        if fitted is None:
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            alpha_k[k] = fitted.coef_[0]
            alpha_0[k] = fitted.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measure classification accuracy on labeled documents.

    Each document is turned into a binary presence vector over
    significant_terms, classified with classify_document, and compared with
    its true label.

    Args:
        L: Sized collection of (text, true_label) pairs.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        The accuracy as a percentage in [0, 100], i.e. 100 * correct / len(L).
        Returns 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain share of correct predictions; no offset or scaling constant is applied.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predict a label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); keep only the label.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-train"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # A term is kept for a category only when |a_j - b_j| exceeds this

# Preprocess documents: tokenize and count term frequencies per category
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]

# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): the numerators are term occurrence counts while the
# denominators are document counts plus the vocabulary size - confirm whether
# document frequencies were intended.
term_probabilities = defaultdict(dict)

for term, category_dict in term_category_counts.items():
    # Occurrences over all categories; the "other categories" count is this
    # total minus the category's own count.
    term_total = sum(category_dict.values())
    for category in categories:
        own_count = category_dict.get(category, 0)
        a_j = (own_count + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - own_count + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log relative risk), stored as term_weights[category][term]
# because calculate_scores and classify_document index the weights by class first.
term_weights = {category: {} for category in categories}

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log(a_j / b_j)
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log(b_j / a_j)

# Union of every term kept for any category. It must stay a flat collection of
# terms (not the per-category dict) because it defines the vector positions;
# sorting gives a stable layout.
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = sorted(significant_terms)

# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 62.5


In [None]:
# OR (odds ratio) term weighting, run on the training dataset only; reported accuracy: 61.074%
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; tokens that appear in
    NLTK's English stopword list are dropped and the remaining tokens are
    joined back together with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Keep only the tokens that are not stopwords.
    kept_words = [word for word in words if word not in stop_words]
    return " ".join(kept_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads the 20 Newsgroups dataset with stopword removal.

    Every sub-directory of data_dir is treated as one category and every
    file inside it as one document of that category. Each document is passed
    through stopword_removal before it is stored.

    Args:
        data_dir: The directory containing the 20 Newsgroups dataset.

    Returns:
        A list of (preprocessed document, category) tuples, and a sorted list
        of the unique categories.
    """
    L = []
    # Sorted so the returned category order is reproducible across runs.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' skips bytes that cannot be decoded instead of raising.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text rather than the raw file contents.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Build one binary presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            positions of the vector entries.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def _encode(text):
        # Set membership keeps the per-term lookup cheap.
        present = set(text.split())
        return [int(term in present) for term in significant_terms]

    return [_encode(text) for text, _ in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Compute, per document and per class, the two averaged weight scores.

    For every document vector and every class k found in term_weights_class,
    the weights of the class's 'greater' terms and of its 'less_equal' terms
    are summed (each multiplied by the vector entry at the term's position)
    and divided by the sum of the vector. When that sum is not positive, both
    scores are 0.0.

    Args:
        document_vectors: Sequence of vectors indexed by the positions
            assigned from significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms; enumeration order gives each
            term its vector position.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        A tuple (scores, term_index): scores[doc_index][k] holds the
        'greater' and 'less_equal' scores, term_index maps term -> position.
    """
    position_of = dict((term, position) for position, term in enumerate(significant_terms))

    def _weighted_sum(vector, weights, k, side):
        # Classes (or sides) absent from partitioned_terms contribute nothing.
        if k not in partitioned_terms or side not in partitioned_terms[k]:
            return 0
        return sum(
            vector[position_of[term]] * weights.get(term, 0)
            for term in partitioned_terms[k][side]
            if term in position_of
        )

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k
            vector_sum = sum(vector)

            if vector_sum > 0:
                entry = {
                    'greater': greater_total / vector_sum,  # score^k(x)
                    'less_equal': less_equal_total / vector_sum,  # score^C/k(x)
                }
            else:
                entry = {'greater': 0.0, 'less_equal': 0.0}
            scores[doc_index][k].update(entry)

    return scores, position_of

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Pick the class with the highest discriminant value for one document.

    For every class k the weights of its 'greater' and 'less_equal' terms are
    summed (scaled by the document's vector entries) and divided by the sum
    of the vector; the class value is
    alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k].

    Args:
        doc_vector: Vector indexed by the positions assigned from significant_terms.
        term_weights_class: Indexed as term_weights_class[k][term].
        significant_terms: Iterable of terms; enumeration order gives positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Not used; the index is rebuilt from significant_terms.
        classes: Iterable of candidate classes.

    Returns:
        A tuple (predicted_label, percent) where percent is the constant 171.
    """
    # Rebuilt locally; the term_index argument is not consulted.
    positions = dict((term, position) for position, term in enumerate(significant_terms))

    def _side_total(k, side):
        return sum(
            doc_vector[positions[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k][side]
            if term in positions
        )

    percent = 171
    discriminants = {}

    for k in classes:
        greater_total = _side_total(k, 'greater')
        less_equal_total = _side_total(k, 'less_equal')
        vector_sum = sum(doc_vector)

        if vector_sum > 0:
            score_k = greater_total / vector_sum
            score_not_k = less_equal_total / vector_sum
        else:
            score_k = score_not_k = 0.0

        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    # max() keeps the first class on ties, following the order of `classes`.
    best_label = max(discriminants, key=discriminants.get)
    return best_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fit one binary logistic regression per class on the two score features.

    For class k, every document whose score entry contains k contributes the
    feature row [greater, less_equal] (missing keys default to 0.0) and the
    target 1 if labels[doc_index] == k, else 0.

    Args:
        scores: Mapping doc_index -> {class: {'greater': ..., 'less_equal': ...}}.
        labels: Sequence of labels indexed by doc_index.
        classes: Iterable of classes to fit.

    Returns:
        A tuple (alpha_k, alpha_0): alpha_k[k] is the fitted coefficient
        vector and alpha_0[k] the intercept. Classes with no samples, or whose
        targets contain a single value, get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        samples = [
            (doc_index, doc_scores[k])
            for doc_index, doc_scores in scores.items()
            if k in doc_scores
        ]
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            fitted = None
        elif len(np.unique(y)) < 2:
            # The regression needs both a positive and a negative example.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            fitted = None
        else:
            fitted = LogisticRegression(fit_intercept=True).fit(X, y)

        if fitted is None:
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            alpha_k[k] = fitted.coef_[0]
            alpha_0[k] = fitted.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measure classification accuracy on labeled documents.

    Each document is turned into a binary presence vector over
    significant_terms, classified with classify_document, and compared with
    its true label.

    Args:
        L: Sized collection of (text, true_label) pairs.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        The accuracy as a percentage in [0, 100], i.e. 100 * correct / len(L).
        Returns 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain share of correct predictions; no offset or scaling constant is applied.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predict a label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); keep only the label.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-train"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # A term is kept for a category only when |a_j - b_j| exceeds this

# Preprocess documents: tokenize and count term frequencies per category
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]

# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): the numerators are term occurrence counts while the
# denominators are document counts plus the vocabulary size, so a_j and b_j
# are not guaranteed to stay below 1; the odds p / (1 - p) used below are only
# meaningful for p < 1 - confirm whether document frequencies were intended.
term_probabilities = defaultdict(dict)

for term, category_dict in term_category_counts.items():
    # Occurrences over all categories; the "other categories" count is this
    # total minus the category's own count.
    term_total = sum(category_dict.values())
    for category in categories:
        own_count = category_dict.get(category, 0)
        a_j = (own_count + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - own_count + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (odds ratio), stored as term_weights[category][term]
# because calculate_scores and classify_document index the weights by class first.
term_weights = {category: {} for category in categories}

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # The odds need explicit parentheses: `p / 1-p` parses as (p / 1) - p, i.e. 0.
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = (a_j / (1 - a_j)) / (b_j / (1 - b_j))
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = (b_j / (1 - b_j)) / (a_j / (1 - a_j))

# Union of every term kept for any category. It must stay a flat collection of
# terms (not the per-category dict) because it defines the vector positions;
# sorting gives a stable layout.
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = sorted(significant_terms)

# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 61.07142857142857


In [None]:
# Log of OR (odds ratio) term weighting, run on the training dataset only; reported accuracy: 42.86%
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; tokens that appear in
    NLTK's English stopword list are dropped and the remaining tokens are
    joined back together with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Keep only the tokens that are not stopwords.
    kept_words = [word for word in words if word not in stop_words]
    return " ".join(kept_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads the 20 Newsgroups dataset with stopword removal.

    Every sub-directory of data_dir is treated as one category and every
    file inside it as one document of that category. Each document is passed
    through stopword_removal before it is stored.

    Args:
        data_dir: The directory containing the 20 Newsgroups dataset.

    Returns:
        A list of (preprocessed document, category) tuples, and a sorted list
        of the unique categories.
    """
    L = []
    # Sorted so the returned category order is reproducible across runs.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' skips bytes that cannot be decoded instead of raising.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text rather than the raw file contents.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Build one binary presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            positions of the vector entries.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def _encode(text):
        # Set membership keeps the per-term lookup cheap.
        present = set(text.split())
        return [int(term in present) for term in significant_terms]

    return [_encode(text) for text, _ in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Compute, per document and per class, the two averaged weight scores.

    For every document vector and every class k found in term_weights_class,
    the weights of the class's 'greater' terms and of its 'less_equal' terms
    are summed (each multiplied by the vector entry at the term's position)
    and divided by the sum of the vector. When that sum is not positive, both
    scores are 0.0.

    Args:
        document_vectors: Sequence of vectors indexed by the positions
            assigned from significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms; enumeration order gives each
            term its vector position.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        A tuple (scores, term_index): scores[doc_index][k] holds the
        'greater' and 'less_equal' scores, term_index maps term -> position.
    """
    position_of = dict((term, position) for position, term in enumerate(significant_terms))

    def _weighted_sum(vector, weights, k, side):
        # Classes (or sides) absent from partitioned_terms contribute nothing.
        if k not in partitioned_terms or side not in partitioned_terms[k]:
            return 0
        return sum(
            vector[position_of[term]] * weights.get(term, 0)
            for term in partitioned_terms[k][side]
            if term in position_of
        )

    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k
            vector_sum = sum(vector)

            if vector_sum > 0:
                entry = {
                    'greater': greater_total / vector_sum,  # score^k(x)
                    'less_equal': less_equal_total / vector_sum,  # score^C/k(x)
                }
            else:
                entry = {'greater': 0.0, 'less_equal': 0.0}
            scores[doc_index][k].update(entry)

    return scores, position_of

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Pick the class with the highest discriminant value for one document.

    For every class k the weights of its 'greater' and 'less_equal' terms are
    summed (scaled by the document's vector entries) and divided by the sum
    of the vector; the class value is
    alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k].

    Args:
        doc_vector: Vector indexed by the positions assigned from significant_terms.
        term_weights_class: Indexed as term_weights_class[k][term].
        significant_terms: Iterable of terms; enumeration order gives positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Not used; the index is rebuilt from significant_terms.
        classes: Iterable of candidate classes.

    Returns:
        A tuple (predicted_label, percent) where percent is the constant 120.
    """
    # Rebuilt locally; the term_index argument is not consulted.
    positions = dict((term, position) for position, term in enumerate(significant_terms))

    def _side_total(k, side):
        return sum(
            doc_vector[positions[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k][side]
            if term in positions
        )

    percent = 120
    discriminants = {}

    for k in classes:
        greater_total = _side_total(k, 'greater')
        less_equal_total = _side_total(k, 'less_equal')
        vector_sum = sum(doc_vector)

        if vector_sum > 0:
            score_k = greater_total / vector_sum
            score_not_k = less_equal_total / vector_sum
        else:
            score_k = score_not_k = 0.0

        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    # max() keeps the first class on ties, following the order of `classes`.
    best_label = max(discriminants, key=discriminants.get)
    return best_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fit one binary logistic regression per class on the two score features.

    For class k, every document whose score entry contains k contributes the
    feature row [greater, less_equal] (missing keys default to 0.0) and the
    target 1 if labels[doc_index] == k, else 0.

    Args:
        scores: Mapping doc_index -> {class: {'greater': ..., 'less_equal': ...}}.
        labels: Sequence of labels indexed by doc_index.
        classes: Iterable of classes to fit.

    Returns:
        A tuple (alpha_k, alpha_0): alpha_k[k] is the fitted coefficient
        vector and alpha_0[k] the intercept. Classes with no samples, or whose
        targets contain a single value, get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        samples = [
            (doc_index, doc_scores[k])
            for doc_index, doc_scores in scores.items()
            if k in doc_scores
        ]
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            fitted = None
        elif len(np.unique(y)) < 2:
            # The regression needs both a positive and a negative example.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            fitted = None
        else:
            fitted = LogisticRegression(fit_intercept=True).fit(X, y)

        if fitted is None:
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            alpha_k[k] = fitted.coef_[0]
            alpha_0[k] = fitted.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measure classification accuracy on labeled documents.

    Each document is turned into a binary presence vector over
    significant_terms, classified with classify_document, and compared with
    its true label.

    Args:
        L: Sized collection of (text, true_label) pairs.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        The accuracy as a percentage in [0, 100], i.e. 100 * correct / len(L).
        Returns 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain share of correct predictions; no offset or scaling constant is applied.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predict a label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Passed through to classify_document.
        classes: Candidate classes.
        alpha_k: Per-class coefficients.
        alpha_0: Per-class intercepts.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Per-class term partition.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); keep only the label.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-train"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # A term is kept for a category only when |a_j - b_j| exceeds this

# Preprocess documents: tokenize and count term frequencies per category
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]

# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): the numerators are term occurrence counts while the
# denominators are document counts plus the vocabulary size, so a_j and b_j
# are not guaranteed to stay below 1; the odds p / (1 - p) used below are only
# meaningful for p < 1 - confirm whether document frequencies were intended.
term_probabilities = defaultdict(dict)

for term, category_dict in term_category_counts.items():
    # Occurrences over all categories; the "other categories" count is this
    # total minus the category's own count.
    term_total = sum(category_dict.values())
    for category in categories:
        own_count = category_dict.get(category, 0)
        a_j = (own_count + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - own_count + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log odds ratio), stored as term_weights[category][term]
# because calculate_scores and classify_document index the weights by class first.
term_weights = {category: {} for category in categories}

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # The log is taken of the whole odds ratio, and the odds need explicit
            # parentheses: `p / 1-p` parses as (p / 1) - p, i.e. 0, giving log(0).
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log((a_j / (1 - a_j)) / (b_j / (1 - b_j)))
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log((b_j / (1 - b_j)) / (a_j / (1 - a_j)))

# Union of every term kept for any category. It must stay a flat collection of
# terms (not the per-category dict) because it defines the vector positions;
# sorting gives a stable layout.
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = sorted(significant_terms)

# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  term_weights[term][category] = np.log(b_j / 1-b_j)/(a_j/(1-a_j))
  term_weights[term][category] = np.log(a_j / 1-a_j)/(b_j/(1-b_j))
  vector[term_index[term]] * weights.get(term, 0)
  vector[term_index[term]] * weights.get(term, 0)




  numerator_greater = sum(doc_vector[term_index[term]] * term_weights_class[k][term] for term in partitioned_terms[k]['greater'] if term in term_index)
  numerator_less_equal = sum(doc_vector[term_index[term]] * term_weights_class[k][term] for term in partitioned_terms[k]['less_equal'] if term in term_index)


Accuracy: 42.85714285714286


In [None]:
# KLD term weighting, run on the training dataset only; reported accuracy: 64.286%
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; tokens that appear in
    NLTK's English stopword list are dropped and the remaining tokens are
    joined back together with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Keep only the tokens that are not stopwords.
    kept_words = [word for word in words if word not in stop_words]
    return " ".join(kept_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads the 20 Newsgroups dataset with stopword removal.

    Every sub-directory of data_dir is treated as one category and every
    file inside it as one document of that category. Each document is passed
    through stopword_removal before it is stored.

    Args:
        data_dir: The directory containing the 20 Newsgroups dataset.

    Returns:
        A list of (preprocessed document, category) tuples, and a sorted list
        of the unique categories.
    """
    L = []
    # Sorted so the returned category order is reproducible across runs.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' skips bytes that cannot be decoded instead of raising.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text rather than the raw file contents.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Builds one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            position of each term in every vector.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())  # Unique whitespace tokens of this document
        vectors.append([int(term in present) for term in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, per document and class, the two normalized weighted term sums.

    Args:
        document_vectors: List of term-presence vectors, positions following
            the iteration order of significant_terms.
        term_weights_class: Mapping class -> {term: weight}; missing terms
            count as weight 0.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where scores[doc_index][k] holds the keys
        'greater' and 'less_equal' (each sum divided by the sum of the
        vector, or 0.0 when that sum is not positive) and term_index maps
        each term to its vector position.
    """
    term_index = {name: position for position, name in enumerate(significant_terms)}

    def _weighted_sum(vector, weights, k, part):
        # A class or partition that is absent contributes a zero numerator.
        if k not in partitioned_terms or part not in partitioned_terms[k]:
            return 0
        total = 0
        for term in partitioned_terms[k][part]:
            if term in term_index:
                total += vector[term_index[term]] * weights.get(term, 0)
        return total

    scores = defaultdict(lambda: defaultdict(dict))
    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k

            active_terms = sum(vector)
            entry = scores[doc_index][k]
            if active_terms > 0:
                entry['greater'] = greater_total / active_terms  # score^k(x)
                entry['less_equal'] = less_equal_total / active_terms  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the largest discriminant value for one document.

    Args:
        doc_vector: Term-presence vector, positions following significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from significant_terms.
        classes: Iterable of candidate class labels.

    Returns:
        (predicted_label, percent) where percent is a hard-coded constant
        that is returned unchanged to keep the return shape callers unpack.
    """
    # The incoming term_index is discarded and rebuilt on every call.
    term_index = {name: position for position, name in enumerate(significant_terms)}
    percent = 180
    discriminants = {}

    for k in classes:
        greater_total = 0
        for term in partitioned_terms[k]['greater']:
            if term in term_index:
                greater_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        less_equal_total = 0
        for term in partitioned_terms[k]['less_equal']:
            if term in term_index:
                less_equal_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        active_terms = sum(doc_vector)
        if active_terms > 0:
            score_k, score_not_k = greater_total / active_terms, less_equal_total / active_terms
        else:
            score_k, score_not_k = 0.0, 0.0

        # f_k(x): first coefficient times score_k, minus second times score_not_k, plus intercept
        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(discriminants, key=discriminants.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one binary logistic regression per class on the two score features.

    Args:
        scores: Mapping doc_index -> {class: {'greater': float, 'less_equal': float}}.
        labels: Sequence of true labels, indexed by doc_index.
        classes: Iterable of class labels to fit.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Both are zero for a class that has no samples or whose
        targets contain fewer than two distinct values.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # Keep only the documents that carry a score entry for this class.
        rows = [(doc_index, per_class[k]) for doc_index, per_class in scores.items() if k in per_class]
        X = np.array([[entry.get('greater', 0.0), entry.get('less_equal', 0.0)] for _, entry in rows])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in rows])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            alpha_k[k] = np.array([0.0, 0.0])  # Default parameters
            alpha_0[k] = 0.0
        elif len(np.unique(y)) < 2:
            # A single target value cannot be fitted by logistic regression.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            model = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = model.coef_[0]
            alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Computes the classification accuracy, in percent, over labeled documents.

    Args:
        L: Sequence of (text, true_label) pairs.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        100 * correct / len(L) as a float in [0, 100]; 0.0 when L is empty.
    """
    if not L:
        return 0.0  # Nothing to evaluate; avoids dividing by zero

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label is
        # relevant for accuracy.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # The previous formula added 100 to the hit count and multiplied by a
    # hard-coded factor, which is not an accuracy.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a class label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair; unpack it so
        # the result holds labels rather than tuples.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-train"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # Minimum |a_j - b_j| for a term to be kept
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts plus vocabulary size - confirm this is the intended estimate.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # The "all other categories" totals are obtained by subtraction instead
    # of re-summing over every category for each (term, category) pair.
    term_total = sum(category_dict.values())
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (Kullback-Leibler style weight between a_j and b_j)
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Index by the loop's own category; the former code used a
            # variable `k` that is not bound by this loop.
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)
            else:
                partitioned_terms[category]['less_equal'].add(term)
            # NOTE(review): the logarithms require a_j < 1 and b_j < 1.
            term_weights[category][term] = (a_j * np.log(a_j / b_j)) + ((1 - a_j) * np.log((1 - a_j) / (1 - b_j)))

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Keep the collected terms (sorted for a stable vector layout). They used to
# be overwritten with partitioned_terms, which made the class names the only
# vector dimensions.
# NOTE(review): with threshold = 0 nearly the whole vocabulary is kept, so the
# dense vectors below can be very large; raise threshold if memory is tight.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 64.28571428571429


# MOVIE REVIEW DATASET

In [None]:
#RR until train dataset with 79.3% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token found in
    NLTK's English stopword list is dropped.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text with stopwords removed, tokens joined by single
        spaces.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied, so every token
    # was returned; filter the tokens against it here.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a corpus laid out as one sub-directory per class, with stopword removal.

    Args:
        data_dir: The directory containing one folder per class; every regular
            file inside a class folder is read as one document.

    Returns:
        A list of (preprocessed document text, class label) tuples, and a
        sorted list of the unique class labels.
    """
    L = []
    classes = set()
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order and the class order reproducible between runs.
    class_folders = sorted(f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f)))

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        classes.add(class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue  # Nested directories are not documents
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text: the raw text used to be appended,
            # which silently discarded the result of stopword_removal.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, sorted(classes)

def create_document_vectors(documents, significant_terms):
    """
    Builds one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            position of each term in every vector.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())  # Unique whitespace tokens of this document
        vectors.append([int(term in present) for term in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, per document and class, the two normalized weighted term sums.

    Args:
        document_vectors: List of term-presence vectors, positions following
            the iteration order of significant_terms.
        term_weights_class: Mapping class -> {term: weight}; missing terms
            count as weight 0.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where scores[doc_index][k] holds the keys
        'greater' and 'less_equal' (each sum divided by the sum of the
        vector, or 0.0 when that sum is not positive) and term_index maps
        each term to its vector position.
    """
    term_index = {name: position for position, name in enumerate(significant_terms)}

    def _weighted_sum(vector, weights, k, part):
        # A class or partition that is absent contributes a zero numerator.
        if k not in partitioned_terms or part not in partitioned_terms[k]:
            return 0
        total = 0
        for term in partitioned_terms[k][part]:
            if term in term_index:
                total += vector[term_index[term]] * weights.get(term, 0)
        return total

    scores = defaultdict(lambda: defaultdict(dict))
    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k

            active_terms = sum(vector)
            entry = scores[doc_index][k]
            if active_terms > 0:
                entry['greater'] = greater_total / active_terms  # score^k(x)
                entry['less_equal'] = less_equal_total / active_terms  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the largest discriminant value for one document.

    Args:
        doc_vector: Term-presence vector, positions following significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from significant_terms.
        classes: Iterable of candidate class labels.

    Returns:
        (predicted_label, percent) where percent is a hard-coded constant
        that is returned unchanged to keep the return shape callers unpack.
    """
    # The incoming term_index is discarded and rebuilt on every call.
    term_index = {name: position for position, name in enumerate(significant_terms)}
    percent = 222
    discriminants = {}

    for k in classes:
        greater_total = 0
        for term in partitioned_terms[k]['greater']:
            if term in term_index:
                greater_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        less_equal_total = 0
        for term in partitioned_terms[k]['less_equal']:
            if term in term_index:
                less_equal_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        active_terms = sum(doc_vector)
        if active_terms > 0:
            score_k, score_not_k = greater_total / active_terms, less_equal_total / active_terms
        else:
            score_k, score_not_k = 0.0, 0.0

        # f_k(x): first coefficient times score_k, minus second times score_not_k, plus intercept
        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(discriminants, key=discriminants.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one binary logistic regression per class on the two score features.

    Args:
        scores: Mapping doc_index -> {class: {'greater': float, 'less_equal': float}}.
        labels: Sequence of true labels, indexed by doc_index.
        classes: Iterable of class labels to fit.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Both are zero for a class that has no samples or whose
        targets contain fewer than two distinct values.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # Keep only the documents that carry a score entry for this class.
        rows = [(doc_index, per_class[k]) for doc_index, per_class in scores.items() if k in per_class]
        X = np.array([[entry.get('greater', 0.0), entry.get('less_equal', 0.0)] for _, entry in rows])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in rows])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            alpha_k[k] = np.array([0.0, 0.0])  # Default parameters
            alpha_0[k] = 0.0
        elif len(np.unique(y)) < 2:
            # A single target value cannot be fitted by logistic regression.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            model = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = model.coef_[0]
            alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Computes the classification accuracy, in percent, over labeled documents.

    Args:
        L: Sequence of (text, true_label) pairs.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        100 * correct / len(L) as a float in [0, 100]; 0.0 when L is empty.
    """
    if not L:
        return 0.0  # Nothing to evaluate; avoids dividing by zero

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label is
        # relevant for accuracy.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # The previous formula added 100 to the hit count and multiplied by a
    # hard-coded factor, which is not an accuracy.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a class label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair; unpack it so
        # the result holds labels rather than tuples.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/review_polarity/txt_sentoken"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # Minimum |a_j - b_j| for a term to be kept
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts plus vocabulary size - confirm this is the intended estimate.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # The "all other categories" totals are obtained by subtraction instead
    # of re-summing over every category for each (term, category) pair.
    term_total = sum(category_dict.values())
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (relative risk: ratio of the larger to the smaller probability)
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Weights are stored as term_weights[category][term], the layout
            # calculate_scores and classify_document read; they used to be
            # stored with the two keys swapped.
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = a_j / b_j
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = b_j / a_j

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Keep the collected terms (sorted for a stable vector layout). They used to
# be overwritten with partitioned_terms, which made the class names the only
# vector dimensions.
# NOTE(review): with threshold = 0 nearly the whole vocabulary is kept, so the
# dense vectors below can be very large; raise threshold if memory is tight.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Log of RR until train dataset with 82.14% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token found in
    NLTK's English stopword list is dropped.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text with stopwords removed, tokens joined by single
        spaces.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied, so every token
    # was returned; filter the tokens against it here.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a corpus laid out as one sub-directory per class, with stopword removal.

    Args:
        data_dir: The directory containing one folder per class; every regular
            file inside a class folder is read as one document.

    Returns:
        A list of (preprocessed document text, class label) tuples, and a
        sorted list of the unique class labels.
    """
    L = []
    classes = set()
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order and the class order reproducible between runs.
    class_folders = sorted(f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f)))

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        classes.add(class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue  # Nested directories are not documents
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text: the raw text used to be appended,
            # which silently discarded the result of stopword_removal.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, sorted(classes)

def create_document_vectors(documents, significant_terms):
    """
    Builds one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            position of each term in every vector.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())  # Unique whitespace tokens of this document
        vectors.append([int(term in present) for term in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, per document and class, the two normalized weighted term sums.

    Args:
        document_vectors: List of term-presence vectors, positions following
            the iteration order of significant_terms.
        term_weights_class: Mapping class -> {term: weight}; missing terms
            count as weight 0.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where scores[doc_index][k] holds the keys
        'greater' and 'less_equal' (each sum divided by the sum of the
        vector, or 0.0 when that sum is not positive) and term_index maps
        each term to its vector position.
    """
    term_index = {name: position for position, name in enumerate(significant_terms)}

    def _weighted_sum(vector, weights, k, part):
        # A class or partition that is absent contributes a zero numerator.
        if k not in partitioned_terms or part not in partitioned_terms[k]:
            return 0
        total = 0
        for term in partitioned_terms[k][part]:
            if term in term_index:
                total += vector[term_index[term]] * weights.get(term, 0)
        return total

    scores = defaultdict(lambda: defaultdict(dict))
    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            greater_total = _weighted_sum(vector, weights, k, 'greater')  # terms in Z^k
            less_equal_total = _weighted_sum(vector, weights, k, 'less_equal')  # terms not in Z^k

            active_terms = sum(vector)
            entry = scores[doc_index][k]
            if active_terms > 0:
                entry['greater'] = greater_total / active_terms  # score^k(x)
                entry['less_equal'] = less_equal_total / active_terms  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the largest discriminant value for one document.

    Args:
        doc_vector: Term-presence vector, positions following significant_terms.
        term_weights_class: Mapping class -> {term: weight}.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from significant_terms.
        classes: Iterable of candidate class labels.

    Returns:
        (predicted_label, percent) where percent is a hard-coded constant
        that is returned unchanged to keep the return shape callers unpack.
    """
    # The incoming term_index is discarded and rebuilt on every call.
    term_index = {name: position for position, name in enumerate(significant_terms)}
    percent = 230
    discriminants = {}

    for k in classes:
        greater_total = 0
        for term in partitioned_terms[k]['greater']:
            if term in term_index:
                greater_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        less_equal_total = 0
        for term in partitioned_terms[k]['less_equal']:
            if term in term_index:
                less_equal_total += doc_vector[term_index[term]] * term_weights_class[k][term]

        active_terms = sum(doc_vector)
        if active_terms > 0:
            score_k, score_not_k = greater_total / active_terms, less_equal_total / active_terms
        else:
            score_k, score_not_k = 0.0, 0.0

        # f_k(x): first coefficient times score_k, minus second times score_not_k, plus intercept
        discriminants[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(discriminants, key=discriminants.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one binary logistic regression per class on the two score features.

    Args:
        scores: Mapping doc_index -> {class: {'greater': float, 'less_equal': float}}.
        labels: Sequence of true labels, indexed by doc_index.
        classes: Iterable of class labels to fit.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Both are zero for a class that has no samples or whose
        targets contain fewer than two distinct values.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # Keep only the documents that carry a score entry for this class.
        rows = [(doc_index, per_class[k]) for doc_index, per_class in scores.items() if k in per_class]
        X = np.array([[entry.get('greater', 0.0), entry.get('less_equal', 0.0)] for _, entry in rows])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in rows])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            alpha_k[k] = np.array([0.0, 0.0])  # Default parameters
            alpha_0[k] = 0.0
        elif len(np.unique(y)) < 2:
            # A single target value cannot be fitted by logistic regression.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
        else:
            model = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = model.coef_[0]
            alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Computes the classification accuracy, in percent, over labeled documents.

    Args:
        L: Sequence of (text, true_label) pairs.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        100 * correct / len(L) as a float in [0, 100]; 0.0 when L is empty.
    """
    if not L:
        return 0.0  # Nothing to evaluate; avoids dividing by zero

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label is
        # relevant for accuracy.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # The previous formula added 100 to the hit count and multiplied by a
    # hard-coded factor, which is not an accuracy.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a class label for every document in U.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair; unpack it so
        # the result holds labels rather than tuples.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/review_polarity/txt_sentoken"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # Minimum |a_j - b_j| for a term to be kept
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # Number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
total_documents = len(documents)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts plus vocabulary size - confirm this is the intended estimate.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # The "all other categories" totals are obtained by subtraction instead
    # of re-summing over every category for each (term, category) pair.
    term_total = sum(category_dict.values())
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log of the ratio of the larger to the smaller probability)
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Weights are stored as term_weights[category][term], the layout
            # calculate_scores and classify_document read; they used to be
            # stored with the two keys swapped.
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log(a_j / b_j)
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log(b_j / a_j)

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Keep the collected terms (sorted for a stable vector layout). They used to
# be overwritten with partitioned_terms, which made the class names the only
# vector dimensions.
# NOTE(review): with threshold = 0 nearly the whole vocabulary is kept, so the
# dense vectors below can be very large; raise threshold if memory is tight.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#OR until train dataset with 80.36% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token found in
    NLTK's English stopword list is dropped.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text with stopwords removed, tokens joined by single
        spaces.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied, so every token
    # was returned; filter the tokens against it here.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a corpus laid out as one sub-directory per class, with stopword removal.

    Args:
        data_dir: The directory containing one folder per class; every regular
            file inside a class folder is read as one document.

    Returns:
        A list of (preprocessed document text, class label) tuples, and a
        sorted list of the unique class labels.
    """
    L = []
    classes = set()
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order and the class order reproducible between runs.
    class_folders = sorted(f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f)))

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        classes.add(class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue  # Nested directories are not documents
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text: the raw text used to be appended,
            # which silently discarded the result of stopword_removal.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, sorted(classes)

def create_document_vectors(documents, significant_terms):
    """
    Builds one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            position of each term in every vector.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())  # Unique whitespace tokens of this document
        vectors.append([int(term in present) for term in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """Compute the normalised 'greater' / 'less_equal' scores per document and class.

    For each document vector and each class key of ``term_weights_class``,
    the weights of the terms present in the document are summed separately
    over the class's 'greater' and 'less_equal' term sets and divided by the
    sum of the vector's entries.

    Args:
        document_vectors: Sequence of vectors aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where ``scores[doc_index][class]`` is a dict with
        keys 'greater' and 'less_equal', and ``term_index`` maps each term to
        its position in the vectors.
    """
    term_index = {term: position for position, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        active_terms = sum(vector)  # shared denominator for every class

        for k, weights in term_weights_class.items():
            # A class without a partition entry contributes zero to both scores.
            class_partition = partitioned_terms[k] if k in partitioned_terms else {}

            for bucket in ('greater', 'less_equal'):
                weighted_sum = 0
                if bucket in class_partition:
                    weighted_sum = sum(
                        vector[term_index[term]] * weights.get(term, 0)
                        for term in class_partition[bucket]
                        if term in term_index
                    )
                # A vector whose entries sum to zero scores 0.0 rather than dividing by zero.
                scores[doc_index][k][bucket] = (
                    weighted_sum / active_terms if active_terms > 0 else 0.0
                )

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """Pick the class with the highest discriminant value for one document.

    For every class k the 'greater' and 'less_equal' weighted sums are
    normalised by the sum of ``doc_vector`` and combined as
    ``alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]``.

    Args:
        doc_vector: Vector aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from ``significant_terms``.
        classes: Iterable of class labels to score.

    Returns:
        (predicted_label, percent) where ``percent`` is the hard-coded
        constant 225.
    """
    # The passed-in term_index is deliberately shadowed, as in the original design.
    term_index = {term: slot for slot, term in enumerate(significant_terms)}
    doc_scores = {}

    for k in classes:
        hits_for = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['greater']
            if term in term_index
        )
        hits_against = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['less_equal']
            if term in term_index
        )
        active_terms = sum(doc_vector)

        # A vector whose entries sum to zero gets zero scores instead of a division error.
        score_k = hits_for / active_terms if active_terms > 0 else 0.0
        score_not_k = hits_against / active_terms if active_terms > 0 else 0.0

        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    percent = 225
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """Fit one binary logistic regression per class on the two score features.

    For class k, each document that has an entry for k in ``scores``
    contributes the feature row [greater, less_equal] and the target 1 when
    ``labels[doc_index] == k`` (0 otherwise).

    Args:
        scores: Mapping doc_index -> class -> {'greater': float, 'less_equal': float}.
        labels: Sequence of true labels indexed by doc_index.
        classes: Iterable of class labels.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Classes with no samples, or with only one target value,
        get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    def use_defaults(k):
        # Zero parameters stand in when no model can be fitted.
        alpha_k[k] = np.array([0.0, 0.0])
        alpha_0[k] = 0.0

    for k in classes:
        samples = [
            (doc_index, per_class[k])
            for doc_index, per_class in scores.items()
            if k in per_class
        ]
        # Missing score keys fall back to 0.0.
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            use_defaults(k)
        elif len(np.unique(y)) < 2:
            # Logistic regression needs both target values present.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            use_defaults(k)
        else:
            clf = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = clf.coef_[0]
            alpha_0[k] = clf.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Return the percentage of documents in ``L`` that are classified correctly.

    Args:
        L: Sequence of (document text, true label) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Fix: the result used to be ((correct + 100) / len(L)) * percent, which
    # adds 100 phantom hits and scales by an arbitrary constant, so it was
    # not an accuracy at all.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Predict a label for every document in ``U``.

    Args:
        U: Iterable of (document text, ignored) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # Fix: classify_document returns (label, constant); previously the
        # whole tuple was stored as the "label".
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/review_polarity/txt_sentoken"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
# Minimum |a_j - b_j| for a term to be kept. With 0, every term whose two
# probabilities differ at all is kept, so vectors are close to vocabulary-sized.
threshold = 0
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts; the odds below assume a_j and b_j stay below 1 - confirm.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        b_j = (sum(category_dict[c] for c in categories if c != category) + 1) / (sum(category_counts[c] for c in categories if c != category) + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (odds ratio).
# Fix: weights are stored as term_weights[category][term], the orientation that
# calculate_scores and classify_document index; they used to be stored as
# term_weights[term][category].
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Fix: `a_j / 1-a_j` parsed as (a_j / 1) - a_j == 0, which zeroed
            # every weight; the odds need explicit parentheses.
            odds_a = a_j / (1 - a_j)
            odds_b = b_j / (1 - b_j)
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = odds_a / odds_b
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = odds_b / odds_a

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Fix: the term set used to be overwritten with the partitioned_terms dict, so
# the "terms" were the category names. A sorted list gives stable positions.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#OR until train dataset with 79.64% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token that appears
    in NLTK's English stopword list is dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Fix: the stopword set used to be built but never applied, so every
    # word was returned unchanged.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a one-folder-per-class text dataset with stopword removal.

    Every sub-directory of ``data_dir`` is treated as a class label and every
    entry inside it is read as one document of that class.

    Args:
        data_dir: The directory containing one sub-directory per class.

    Returns:
        A list of (preprocessed document, class label) tuples, and a sorted
        list of the unique class labels.
    """
    L = []
    # Sorted so that document order and class order are reproducible.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in os.listdir(class_path):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' drops undecodable bytes instead of aborting the load.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Fix: store the preprocessed text; previously it was computed
            # and thrown away while the raw text was appended.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """Build one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def presence_row(text):
        # Set membership keeps each per-term lookup O(1).
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [presence_row(text) for text, _label in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """Compute the normalised 'greater' / 'less_equal' scores per document and class.

    For each document vector and each class key of ``term_weights_class``,
    the weights of the terms present in the document are summed separately
    over the class's 'greater' and 'less_equal' term sets and divided by the
    sum of the vector's entries.

    Args:
        document_vectors: Sequence of vectors aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where ``scores[doc_index][class]`` is a dict with
        keys 'greater' and 'less_equal', and ``term_index`` maps each term to
        its position in the vectors.
    """
    term_index = {term: position for position, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        active_terms = sum(vector)  # shared denominator for every class

        for k, weights in term_weights_class.items():
            # A class without a partition entry contributes zero to both scores.
            class_partition = partitioned_terms[k] if k in partitioned_terms else {}

            for bucket in ('greater', 'less_equal'):
                weighted_sum = 0
                if bucket in class_partition:
                    weighted_sum = sum(
                        vector[term_index[term]] * weights.get(term, 0)
                        for term in class_partition[bucket]
                        if term in term_index
                    )
                # A vector whose entries sum to zero scores 0.0 rather than dividing by zero.
                scores[doc_index][k][bucket] = (
                    weighted_sum / active_terms if active_terms > 0 else 0.0
                )

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """Pick the class with the highest discriminant value for one document.

    For every class k the 'greater' and 'less_equal' weighted sums are
    normalised by the sum of ``doc_vector`` and combined as
    ``alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]``.

    Args:
        doc_vector: Vector aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from ``significant_terms``.
        classes: Iterable of class labels to score.

    Returns:
        (predicted_label, percent) where ``percent`` is the hard-coded
        constant 223.
    """
    # The passed-in term_index is deliberately shadowed, as in the original design.
    term_index = {term: slot for slot, term in enumerate(significant_terms)}
    doc_scores = {}

    for k in classes:
        hits_for = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['greater']
            if term in term_index
        )
        hits_against = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['less_equal']
            if term in term_index
        )
        active_terms = sum(doc_vector)

        # A vector whose entries sum to zero gets zero scores instead of a division error.
        score_k = hits_for / active_terms if active_terms > 0 else 0.0
        score_not_k = hits_against / active_terms if active_terms > 0 else 0.0

        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    percent = 223
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """Fit one binary logistic regression per class on the two score features.

    For class k, each document that has an entry for k in ``scores``
    contributes the feature row [greater, less_equal] and the target 1 when
    ``labels[doc_index] == k`` (0 otherwise).

    Args:
        scores: Mapping doc_index -> class -> {'greater': float, 'less_equal': float}.
        labels: Sequence of true labels indexed by doc_index.
        classes: Iterable of class labels.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Classes with no samples, or with only one target value,
        get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    def use_defaults(k):
        # Zero parameters stand in when no model can be fitted.
        alpha_k[k] = np.array([0.0, 0.0])
        alpha_0[k] = 0.0

    for k in classes:
        samples = [
            (doc_index, per_class[k])
            for doc_index, per_class in scores.items()
            if k in per_class
        ]
        # Missing score keys fall back to 0.0.
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            use_defaults(k)
        elif len(np.unique(y)) < 2:
            # Logistic regression needs both target values present.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            use_defaults(k)
        else:
            clf = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = clf.coef_[0]
            alpha_0[k] = clf.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Return the percentage of documents in ``L`` that are classified correctly.

    Args:
        L: Sequence of (document text, true label) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Fix: the result used to be ((correct + 100) / len(L)) * percent, which
    # adds 100 phantom hits and scales by an arbitrary constant, so it was
    # not an accuracy at all.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Predict a label for every document in ``U``.

    Args:
        U: Iterable of (document text, ignored) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # Fix: classify_document returns (label, constant); previously the
        # whole tuple was stored as the "label".
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/review_polarity/txt_sentoken"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
# Minimum |a_j - b_j| for a term to be kept. With 0, every term whose two
# probabilities differ at all is kept, so vectors are close to vocabulary-sized.
threshold = 0
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts; the odds below assume a_j and b_j stay below 1 - confirm.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        b_j = (sum(category_dict[c] for c in categories if c != category) + 1) / (sum(category_counts[c] for c in categories if c != category) + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log odds ratio).
# Fix: weights are stored as term_weights[category][term], the orientation that
# calculate_scores and classify_document index; they used to be stored as
# term_weights[term][category].
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Fix: `np.log(a_j / 1-a_j)` parsed as np.log((a_j / 1) - a_j),
            # i.e. np.log(0) = -inf with a divide-by-zero warning; the log is
            # now taken of the properly parenthesised odds ratio.
            odds_a = a_j / (1 - a_j)
            odds_b = b_j / (1 - b_j)
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log(odds_a / odds_b)
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log(odds_b / odds_a)

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Fix: the term set used to be overwritten with the partitioned_terms dict, so
# the "terms" were the category names. A sorted list gives stable positions.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  term_weights[term][category] = np.log(b_j / 1-b_j)/(a_j/(1-a_j))
  term_weights[term][category] = np.log(a_j / 1-a_j)/(b_j/(1-b_j))


In [None]:
#KLD until train dataset with 78.214% of accuracy
from google.colab import drive
drive.mount('/content/drive')
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token that appears
    in NLTK's English stopword list is dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Fix: the stopword set used to be built but never applied, so every
    # word was returned unchanged.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a one-folder-per-class text dataset with stopword removal.

    Every sub-directory of ``data_dir`` is treated as a class label and every
    entry inside it is read as one document of that class.

    Args:
        data_dir: The directory containing one sub-directory per class.

    Returns:
        A list of (preprocessed document, class label) tuples, and a sorted
        list of the unique class labels.
    """
    L = []
    # Sorted so that document order and class order are reproducible.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in os.listdir(class_path):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' drops undecodable bytes instead of aborting the load.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Fix: store the preprocessed text; previously it was computed
            # and thrown away while the raw text was appended.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """Build one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def presence_row(text):
        # Set membership keeps each per-term lookup O(1).
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [presence_row(text) for text, _label in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """Compute the normalised 'greater' / 'less_equal' scores per document and class.

    For each document vector and each class key of ``term_weights_class``,
    the weights of the terms present in the document are summed separately
    over the class's 'greater' and 'less_equal' term sets and divided by the
    sum of the vector's entries.

    Args:
        document_vectors: Sequence of vectors aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.

    Returns:
        (scores, term_index) where ``scores[doc_index][class]`` is a dict with
        keys 'greater' and 'less_equal', and ``term_index`` maps each term to
        its position in the vectors.
    """
    term_index = {term: position for position, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        active_terms = sum(vector)  # shared denominator for every class

        for k, weights in term_weights_class.items():
            # A class without a partition entry contributes zero to both scores.
            class_partition = partitioned_terms[k] if k in partitioned_terms else {}

            for bucket in ('greater', 'less_equal'):
                weighted_sum = 0
                if bucket in class_partition:
                    weighted_sum = sum(
                        vector[term_index[term]] * weights.get(term, 0)
                        for term in class_partition[bucket]
                        if term in term_index
                    )
                # A vector whose entries sum to zero scores 0.0 rather than dividing by zero.
                scores[doc_index][k][bucket] = (
                    weighted_sum / active_terms if active_terms > 0 else 0.0
                )

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """Pick the class with the highest discriminant value for one document.

    For every class k the 'greater' and 'less_equal' weighted sums are
    normalised by the sum of ``doc_vector`` and combined as
    ``alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]``.

    Args:
        doc_vector: Vector aligned with ``significant_terms``.
        term_weights_class: Mapping class -> mapping term -> weight.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored; the index is rebuilt from ``significant_terms``.
        classes: Iterable of class labels to score.

    Returns:
        (predicted_label, percent) where ``percent`` is the hard-coded
        constant 219.
    """
    # The passed-in term_index is deliberately shadowed, as in the original design.
    term_index = {term: slot for slot, term in enumerate(significant_terms)}
    doc_scores = {}

    for k in classes:
        hits_for = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['greater']
            if term in term_index
        )
        hits_against = sum(
            doc_vector[term_index[term]] * term_weights_class[k][term]
            for term in partitioned_terms[k]['less_equal']
            if term in term_index
        )
        active_terms = sum(doc_vector)

        # A vector whose entries sum to zero gets zero scores instead of a division error.
        score_k = hits_for / active_terms if active_terms > 0 else 0.0
        score_not_k = hits_against / active_terms if active_terms > 0 else 0.0

        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    percent = 219
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """Fit one binary logistic regression per class on the two score features.

    For class k, each document that has an entry for k in ``scores``
    contributes the feature row [greater, less_equal] and the target 1 when
    ``labels[doc_index] == k`` (0 otherwise).

    Args:
        scores: Mapping doc_index -> class -> {'greater': float, 'less_equal': float}.
        labels: Sequence of true labels indexed by doc_index.
        classes: Iterable of class labels.

    Returns:
        (alpha_k, alpha_0): per-class coefficient pair and per-class
        intercept. Classes with no samples, or with only one target value,
        get zeros and a printed warning.
    """
    alpha_k, alpha_0 = {}, {}

    def use_defaults(k):
        # Zero parameters stand in when no model can be fitted.
        alpha_k[k] = np.array([0.0, 0.0])
        alpha_0[k] = 0.0

    for k in classes:
        samples = [
            (doc_index, per_class[k])
            for doc_index, per_class in scores.items()
            if k in per_class
        ]
        # Missing score keys fall back to 0.0.
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            print(f"Warning: No samples for class {k}. Skipping logistic regression.")
            use_defaults(k)
        elif len(np.unique(y)) < 2:
            # Logistic regression needs both target values present.
            print(f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
            use_defaults(k)
        else:
            clf = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = clf.coef_[0]
            alpha_0[k] = clf.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Return the percentage of documents in ``L`` that are classified correctly.

    Args:
        L: Sequence of (document text, true label) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, constant); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Fix: the result used to be ((correct + 100) / len(L)) * percent, which
    # adds 100 phantom hits and scales by an arbitrary constant, so it was
    # not an accuracy at all.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """Predict a label for every document in ``U``.

    Args:
        U: Iterable of (document text, ignored) pairs.
        term_weights_class: Mapping class -> mapping term -> weight.
        classes: Iterable of class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining vector positions.
        partitioned_terms: Mapping class -> {'greater': terms, 'less_equal': terms}.
        term_index: Passed through to ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # Fix: classify_document returns (label, constant); previously the
        # whole tuple was stored as the "label".
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/content/drive/MyDrive/IR Project Dataset/review_polarity/txt_sentoken"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
# Minimum |a_j - b_j| for a term to be kept. With 0, every term whose two
# probabilities differ at all is kept, so vectors are close to vocabulary-sized.
threshold = 0
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
# Compute probabilities a_j and b_j with add-one Laplace smoothing
# NOTE(review): numerators are term occurrence counts while denominators are
# document counts; the logs below assume a_j and b_j stay below 1 - confirm.
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        b_j = (sum(category_dict[c] for c in categories if c != category) + 1) / (sum(category_counts[c] for c in categories if c != category) + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (KL divergence between the two Bernoulli parameters).
# Fix: weights are stored as term_weights[category][term], the orientation that
# calculate_scores and classify_document index; they used to be stored as
# term_weights[term][category].
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)
            else:
                partitioned_terms[category]['less_equal'].add(term)
            term_weights[category][term] = (a_j * np.log(a_j / b_j)) + ((1 - a_j) * np.log((1 - a_j) / (1 - b_j)))

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Fix: the term set used to be overwritten with the partitioned_terms dict, so
# the "terms" were the category names. A sorted list gives stable positions.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# SRAA Dataset

In [None]:
#RR until train dataset with 89.3% of accuracy
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Removes stopwords from a given text string.

    The text is lowercased and split on whitespace; every token that appears
    in NLTK's English stopword list is dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        text: The text string to process.

    Returns:
        The lowercased text string with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Fix: the stopword set used to be built but never applied, so every
    # word was returned unchanged.
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)  # Join the filtered words back into text

def load_20newsgroups_data(data_dir):
    """
    Loads a one-folder-per-class text dataset with stopword removal.

    Every sub-directory of ``data_dir`` is treated as a class label and every
    entry inside it is read as one document of that class.

    Args:
        data_dir: The directory containing one sub-directory per class.

    Returns:
        A list of (preprocessed document, class label) tuples, and a sorted
        list of the unique class labels.
    """
    L = []
    # Sorted so that document order and class order are reproducible.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in os.listdir(class_path):
            file_path = os.path.join(class_path, file_name)
            # errors='ignore' drops undecodable bytes instead of aborting the load.
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Fix: store the preprocessed text; previously it was computed
            # and thrown away while the raw text was appended.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """Build one binary term-presence vector per document.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def presence_row(text):
        # Set membership keeps each per-term lookup O(1).
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [presence_row(text) for text, _label in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, for every document and class, the two normalised weighted scores.

    Args:
        document_vectors: Sequence of occurrence vectors aligned with the
            iteration order of ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.

    Returns:
        A tuple ``(scores, term_index)``. ``scores[doc_index][class]`` holds
        the 'greater' and 'less_equal' scores (weighted sums divided by the
        vector's sum, or 0.0 when that sum is not positive); ``term_index``
        maps each term to its vector position.
    """
    term_index = {term: position for position, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    def weighted_sum(vector, weights, bucket):
        # Add up the weights of the bucket's terms, scaled by the vector entry.
        return sum(
            vector[term_index[term]] * weights.get(term, 0)
            for term in bucket
            if term in term_index
        )

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            buckets = partitioned_terms[k] if k in partitioned_terms else {}
            totals = {
                name: weighted_sum(vector, weights, buckets[name]) if name in buckets else 0
                for name in ('greater', 'less_equal')
            }

            active_terms = sum(vector)
            entry = scores[doc_index][k]
            if active_terms > 0:
                entry['greater'] = totals['greater'] / active_terms  # score^k(x)
                entry['less_equal'] = totals['less_equal'] / active_terms  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the highest discriminant value for one document.

    Args:
        doc_vector: Occurrence vector aligned with ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``; missing
            classes or terms contribute a weight of 0.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.
        alpha_k: Mapping ``class -> (coef_greater, coef_less_equal)``.
        alpha_0: Mapping ``class -> intercept``.
        term_index: Ignored; the index is rebuilt from ``significant_terms``
            so it always matches ``doc_vector``. Kept for call compatibility.
        classes: Iterable of candidate class labels.

    Returns:
        A tuple ``(predicted_label, percent)`` where ``percent`` is the
        constant 250 that existing callers unpack.

    Raises:
        ValueError: If ``classes`` is empty.
    """
    percent = 250  # legacy constant, kept only because callers unpack it
    term_index = {term: i for i, term in enumerate(significant_terms)}
    denominator = sum(doc_vector)  # identical for every class, so computed once

    def partition_score(k, side):
        # Weighted share of the document's terms that fall in one partition of k.
        if denominator <= 0:
            return 0.0
        # .get lookups mirror calculate_scores: no KeyError for missing
        # entries and no keys inserted into defaultdict inputs.
        weights = term_weights_class.get(k, {})
        terms = partitioned_terms.get(k, {}).get(side, ())
        numerator = sum(
            doc_vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )
        return numerator / denominator

    doc_scores = {}
    for k in classes:
        score_k = partition_score(k, 'greater')
        score_not_k = partition_score(k, 'less_equal')
        # NOTE(review): learn_discriminant_params fits both coefficients on
        # [greater, less_equal] directly, so subtracting the second term flips
        # its learned sign - confirm this is intended.
        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one logistic regression per class on the two per-document scores.

    Args:
        scores: Mapping ``doc_index -> {class: {'greater': x, 'less_equal': y}}``.
        labels: Sequence of true labels indexed by ``doc_index``.
        classes: Iterable of class labels to fit.

    Returns:
        A tuple ``(alpha_k, alpha_0)``: per-class coefficient pairs and
        intercepts. Classes with no samples, or with only one target value,
        get zero coefficients and a zero intercept (a warning is printed).
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # One (features, target) row per document that has scores for class k.
        rows = [
            (
                [per_class[k].get('greater', 0.0), per_class[k].get('less_equal', 0.0)],
                1 if labels[idx] == k else 0,
            )
            for idx, per_class in scores.items()
            if k in per_class
        ]
        X = np.array([features for features, _ in rows])
        y = np.array([target for _, target in rows])

        if X.size == 0:
            warning = f"Warning: No samples for class {k}. Skipping logistic regression."
        elif len(np.unique(y)) < 2:
            warning = f"Warning: Class imbalance for class {k}. Setting parameters to default values."
        else:
            warning = None

        if warning is not None:
            # Nothing to fit: fall back to a neutral discriminant.
            print(warning)
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
            continue

        model = LogisticRegression(fit_intercept=True).fit(X, y)
        alpha_k[k] = model.coef_[0]
        alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measures classification accuracy on labelled documents.

    Args:
        L: List of ``(text, true_label)`` pairs.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        The percentage (0-100) of documents whose predicted label equals the
        true label, or 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain hit rate. The former "((correct + 100) / len(L)) * percent" added
    # 100 phantom hits and rescaled by an arbitrary constant.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a label for every document in ``U``.

    Args:
        U: Iterable of ``(text, label)`` pairs; the label is ignored.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); previously the whole
        # tuple was appended instead of the label.
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/sarroashimax/IR Project Dataset/sraa/sraa"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # minimum |a_j - b_j| for a term to be kept as significant
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
total_documents = sum(category_counts[c] for c in categories)
# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # Sum over all categories once per term; the "other categories" totals
    # are then obtained by subtraction instead of being re-summed each time.
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj, stored as term_weights[category][term]: this is the
# layout calculate_scores and classify_document read. The former
# [term][category] layout made every score evaluate to 0.
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = a_j / b_j
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = b_j / a_j

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Freeze into a sorted list so vector positions are stable. This used to be
# overwritten with the partitioned_terms dict, which indexed the vectors by
# class names instead of terms.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
# NOTE: one dense list of len(significant_terms) flags is built per document;
# raise `threshold` to shrink the term list on large corpora.
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


In [None]:
# Log relative risk (log RR) weighting, evaluated on the training set: 81.39% reported accuracy
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Lowercases a text string and removes English stopwords from it.

    Args:
        text: The text string to process.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied; filter with it here.
    return " ".join(word for word in words if word not in stop_words)

def load_20newsgroups_data(data_dir):
    """
    Loads a dataset stored as one sub-directory per class, applying stopword removal.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes;
            every entry inside a class directory is read as one document.

    Returns:
        A tuple ``(L, classes)``: ``L`` is a list of
        ``(preprocessed_text, class_label)`` tuples and ``classes`` is the
        alphabetically sorted list of class labels.
    """
    L = []
    # Sorted so that class and document order are reproducible between runs
    # (os.listdir order is arbitrary, and so was the former list(set(...))).
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Keep the preprocessed text: the raw text used to be stored,
            # which silently discarded the preprocessing step.
            L.append((stopword_removal(document_text), class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Turns each document into a list of 0/1 term-presence flags.

    Args:
        documents: Iterable of ``(text, label)`` pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list of vectors, one per document, where position ``i`` is 1 when
        the ``i``-th significant term is one of the document's
        whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())
        vectors.append([int(candidate in present) for candidate in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Scores every document against every class for both term partitions.

    Args:
        document_vectors: Sequence of occurrence vectors aligned with the
            iteration order of ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.

    Returns:
        A tuple ``(scores, term_index)``. ``scores[doc_index][class]`` maps
        'greater' and 'less_equal' to the weighted sum of matching terms
        divided by the vector's sum (0.0 when that sum is not positive);
        ``term_index`` maps each term to its vector position.
    """
    positions = {}
    for slot, term in enumerate(significant_terms):
        positions[term] = slot

    result = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            # Numerators stay 0 for a partition the class does not define.
            numerators = {'greater': 0, 'less_equal': 0}
            if k in partitioned_terms:
                for side in numerators:
                    if side in partitioned_terms[k]:
                        numerators[side] = sum(
                            vector[positions[term]] * weights.get(term, 0)
                            for term in partitioned_terms[k][side]
                            if term in positions
                        )

            hits = sum(vector)
            cell = result[doc_index][k]
            for side, numerator in numerators.items():
                # 'greater' is score^k(x), 'less_equal' is score^C/k(x)
                cell[side] = numerator / hits if hits > 0 else 0.0

    return result, positions

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the highest discriminant value for one document.

    Args:
        doc_vector: Occurrence vector aligned with ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``; missing
            classes or terms contribute a weight of 0.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.
        alpha_k: Mapping ``class -> (coef_greater, coef_less_equal)``.
        alpha_0: Mapping ``class -> intercept``.
        term_index: Ignored; the index is rebuilt from ``significant_terms``
            so it always matches ``doc_vector``. Kept for call compatibility.
        classes: Iterable of candidate class labels.

    Returns:
        A tuple ``(predicted_label, percent)`` where ``percent`` is the
        constant 245 that existing callers unpack.

    Raises:
        ValueError: If ``classes`` is empty.
    """
    percent = 245  # legacy constant, kept only because callers unpack it
    term_index = {term: i for i, term in enumerate(significant_terms)}
    denominator = sum(doc_vector)  # identical for every class, so computed once

    def partition_score(k, side):
        # Weighted share of the document's terms that fall in one partition of k.
        if denominator <= 0:
            return 0.0
        # .get lookups mirror calculate_scores: no KeyError for missing
        # entries and no keys inserted into defaultdict inputs.
        weights = term_weights_class.get(k, {})
        terms = partitioned_terms.get(k, {}).get(side, ())
        numerator = sum(
            doc_vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )
        return numerator / denominator

    doc_scores = {}
    for k in classes:
        score_k = partition_score(k, 'greater')
        score_not_k = partition_score(k, 'less_equal')
        # NOTE(review): learn_discriminant_params fits both coefficients on
        # [greater, less_equal] directly, so subtracting the second term flips
        # its learned sign - confirm this is intended.
        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Learns per-class discriminant coefficients with logistic regression.

    Args:
        scores: Mapping ``doc_index -> {class: {'greater': x, 'less_equal': y}}``.
        labels: Sequence of true labels indexed by ``doc_index``.
        classes: Iterable of class labels to fit.

    Returns:
        A tuple ``(alpha_k, alpha_0)`` of per-class coefficient pairs and
        intercepts. When a class has no samples or only one target value, a
        warning is printed and zeros are stored instead of fitting.
    """
    alpha_k = {}
    alpha_0 = {}

    for k in classes:
        # Documents that actually carry scores for this class.
        relevant = [(idx, per_doc[k]) for idx, per_doc in scores.items() if k in per_doc]
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in relevant
        ])
        y = np.array([1 if labels[idx] == k else 0 for idx, _ in relevant])

        message = None
        if X.size == 0:
            message = f"Warning: No samples for class {k}. Skipping logistic regression."
        elif len(np.unique(y)) < 2:
            message = f"Warning: Class imbalance for class {k}. Setting parameters to default values."

        if message is None:
            fitted = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = fitted.coef_[0]
            alpha_0[k] = fitted.intercept_[0]
        else:
            # Cannot fit a two-class model: store neutral parameters.
            print(message)
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measures classification accuracy on labelled documents.

    Args:
        L: List of ``(text, true_label)`` pairs.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        The percentage (0-100) of documents whose predicted label equals the
        true label, or 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain hit rate. The former "((correct + 100) / len(L)) * percent" added
    # 100 phantom hits and rescaled by an arbitrary constant.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a label for every document in ``U``.

    Args:
        U: Iterable of ``(text, label)`` pairs; the label is ignored.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); previously the whole
        # tuple was appended instead of the label.
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/sarroashimax/IR Project Dataset/sraa/sraa"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # minimum |a_j - b_j| for a term to be kept as significant
# Preprocess documents: tokenize and count term frequencies
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = doc.split()  # Tokenize the document by whitespace
    term_counts = Counter(terms)
    for term, count in term_counts.items():
        term_category_counts[term][category] += count
        vocabulary.add(term)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
total_documents = sum(category_counts[c] for c in categories)
# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # Sum over all categories once per term; the "other categories" totals
    # are then obtained by subtraction instead of being re-summed each time.
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log of the probability ratio), stored as
# term_weights[category][term]: this is the layout calculate_scores and
# classify_document read. The former [term][category] layout made every
# score evaluate to 0.
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log(a_j / b_j)
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log(b_j / a_j)

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Freeze into a sorted list so vector positions are stable. This used to be
# overwritten with the partitioned_terms dict, which indexed the vectors by
# class names instead of terms.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
# NOTE: one dense list of len(significant_terms) flags is built per document;
# raise `threshold` to shrink the term list on large corpora.
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


In [None]:
# Kullback-Leibler divergence (KLD) weighting, evaluated on the training set: 82.14% reported accuracy
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Lowercases a text string and removes English stopwords from it.

    Args:
        text: The text string to process.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied; filter with it here.
    return " ".join(word for word in words if word not in stop_words)

def load_20newsgroups_data(data_dir):
    """
    Loads a dataset stored as one sub-directory per class, applying stopword removal.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes;
            every entry inside a class directory is read as one document.

    Returns:
        A tuple ``(L, classes)``: ``L`` is a list of
        ``(preprocessed_text, class_label)`` tuples and ``classes`` is the
        alphabetically sorted list of class labels.
    """
    L = []
    # Sorted so that class and document order are reproducible between runs
    # (os.listdir order is arbitrary, and so was the former list(set(...))).
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Keep the preprocessed text: the raw text used to be stored,
            # which silently discarded the preprocessing step.
            L.append((stopword_removal(document_text), class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Builds one binary term-occurrence vector per document.

    Args:
        documents: Iterable of ``(text, label)`` pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list with one list of 0/1 flags per document, 1 meaning the term is
        among the document's whitespace-separated tokens.
    """
    def occurrence_vector(text):
        tokens = set(text.split())
        return [int(term in tokens) for term in significant_terms]

    return [occurrence_vector(text) for text, _label in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, for every document and class, the two normalised weighted scores.

    Args:
        document_vectors: Sequence of occurrence vectors aligned with the
            iteration order of ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.

    Returns:
        A tuple ``(scores, term_index)``. ``scores[doc_index][class]`` holds
        the 'greater' and 'less_equal' scores (weighted sums divided by the
        vector's sum, or 0.0 when that sum is not positive); ``term_index``
        maps each term to its vector position.
    """
    term_index = {term: position for position, term in enumerate(significant_terms)}
    scores = defaultdict(lambda: defaultdict(dict))

    def weighted_sum(vector, weights, bucket):
        # Add up the weights of the bucket's terms, scaled by the vector entry.
        return sum(
            vector[term_index[term]] * weights.get(term, 0)
            for term in bucket
            if term in term_index
        )

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            buckets = partitioned_terms[k] if k in partitioned_terms else {}
            totals = {
                name: weighted_sum(vector, weights, buckets[name]) if name in buckets else 0
                for name in ('greater', 'less_equal')
            }

            active_terms = sum(vector)
            entry = scores[doc_index][k]
            if active_terms > 0:
                entry['greater'] = totals['greater'] / active_terms  # score^k(x)
                entry['less_equal'] = totals['less_equal'] / active_terms  # score^C/k(x)
            else:
                entry['greater'] = 0.0
                entry['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Picks the class with the highest discriminant value for one document.

    Args:
        doc_vector: Occurrence vector aligned with ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``; missing
            classes or terms contribute a weight of 0.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.
        alpha_k: Mapping ``class -> (coef_greater, coef_less_equal)``.
        alpha_0: Mapping ``class -> intercept``.
        term_index: Ignored; the index is rebuilt from ``significant_terms``
            so it always matches ``doc_vector``. Kept for call compatibility.
        classes: Iterable of candidate class labels.

    Returns:
        A tuple ``(predicted_label, percent)`` where ``percent`` is the
        constant 230 that existing callers unpack.

    Raises:
        ValueError: If ``classes`` is empty.
    """
    percent = 230  # legacy constant, kept only because callers unpack it
    term_index = {term: i for i, term in enumerate(significant_terms)}
    denominator = sum(doc_vector)  # identical for every class, so computed once

    def partition_score(k, side):
        # Weighted share of the document's terms that fall in one partition of k.
        if denominator <= 0:
            return 0.0
        # .get lookups mirror calculate_scores: no KeyError for missing
        # entries and no keys inserted into defaultdict inputs.
        weights = term_weights_class.get(k, {})
        terms = partitioned_terms.get(k, {}).get(side, ())
        numerator = sum(
            doc_vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )
        return numerator / denominator

    doc_scores = {}
    for k in classes:
        score_k = partition_score(k, 'greater')
        score_not_k = partition_score(k, 'less_equal')
        # NOTE(review): learn_discriminant_params fits both coefficients on
        # [greater, less_equal] directly, so subtracting the second term flips
        # its learned sign - confirm this is intended.
        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one logistic regression per class on the two per-document scores.

    Args:
        scores: Mapping ``doc_index -> {class: {'greater': x, 'less_equal': y}}``.
        labels: Sequence of true labels indexed by ``doc_index``.
        classes: Iterable of class labels to fit.

    Returns:
        A tuple ``(alpha_k, alpha_0)``: per-class coefficient pairs and
        intercepts. Classes with no samples, or with only one target value,
        get zero coefficients and a zero intercept (a warning is printed).
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # One (features, target) row per document that has scores for class k.
        rows = [
            (
                [per_class[k].get('greater', 0.0), per_class[k].get('less_equal', 0.0)],
                1 if labels[idx] == k else 0,
            )
            for idx, per_class in scores.items()
            if k in per_class
        ]
        X = np.array([features for features, _ in rows])
        y = np.array([target for _, target in rows])

        if X.size == 0:
            warning = f"Warning: No samples for class {k}. Skipping logistic regression."
        elif len(np.unique(y)) < 2:
            warning = f"Warning: Class imbalance for class {k}. Setting parameters to default values."
        else:
            warning = None

        if warning is not None:
            # Nothing to fit: fall back to a neutral discriminant.
            print(warning)
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
            continue

        model = LogisticRegression(fit_intercept=True).fit(X, y)
        alpha_k[k] = model.coef_[0]
        alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measures classification accuracy on labelled documents.

    Args:
        L: List of ``(text, true_label)`` pairs.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        The percentage (0-100) of documents whose predicted label equals the
        true label, or 0.0 when ``L`` is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); only the label matters here.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    # Plain hit rate. The former "((correct + 100) / len(L)) * percent" added
    # 100 phantom hits and rescaled by an arbitrary constant.
    return 100.0 * correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a label for every document in ``U``.

    Args:
        U: Iterable of ``(text, label)`` pairs; the label is ignored.
        term_weights_class, classes, alpha_k, alpha_0, significant_terms,
        partitioned_terms, term_index: Passed through unchanged to
            ``classify_document``.

    Returns:
        A list of predicted labels, one per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns (label, percent); previously the whole
        # tuple was appended instead of the label.
        predicted_label, _percent = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/sarroashimax/IR Project Dataset/sraa/sraa"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0  # minimum |a_j - b_j| for a term to be kept as significant
# Preprocess documents: tokenize and count, per category, the number of
# documents containing each term. Document frequency (not raw term frequency)
# keeps a_j and b_j below 1, which the (1 - a_j) / (1 - b_j) terms of the
# divergence below require.
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)  # number of documents per category
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = set(doc.split())  # Distinct whitespace-separated tokens
    for term in terms:
        term_category_counts[term][category] += 1
    vocabulary.update(terms)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]
total_documents = sum(category_counts[c] for c in categories)
# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_category_counts.items():
    # Sum over all categories once per term; the "other categories" totals
    # are then obtained by subtraction instead of being re-summed each time.
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        in_category = category_dict[category]
        a_j = (in_category + 1) / (category_counts[category] + vocabulary_size)
        b_j = (term_total - in_category + 1) / (total_documents - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj, stored as term_weights[category][term] (the layout
# calculate_scores and classify_document read).
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Index by the loop's own `category`; the former `k` was not
            # bound by this loop (undefined, or stale from earlier code).
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)
            else:
                partitioned_terms[category]['less_equal'].add(term)
            term_weights[category][term] = (a_j * np.log(a_j / b_j)) + ((1 - a_j) * np.log((1 - a_j) / (1 - b_j)))

significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
# Freeze into a sorted list so vector positions are stable. This used to be
# overwritten with the partitioned_terms dict, which indexed the vectors by
# class names instead of terms.
significant_terms = sorted(significant_terms)
# Document vectors with term occurrence
# NOTE: one dense list of len(significant_terms) flags is built per document;
# raise `threshold` to shrink the term list on large corpora.
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

#parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

#accuracy
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


 # Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for term, category_dict in term_weights.items():
#     for category, weight in category_dict.items():
 #        print(f"Term: {term}, Category: {category}, Weight: {weight}")


In [None]:
# Odds ratio (OR) weighting, evaluated on the training set: 88.57% reported accuracy
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Lowercases a text string and removes English stopwords from it.

    Args:
        text: The text string to process.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
    words = text.lower().split()  # Convert to lowercase and split into words
    # The stopword set used to be built but never applied; filter with it here.
    return " ".join(word for word in words if word not in stop_words)

def load_20newsgroups_data(data_dir):
    """
    Loads a dataset stored as one sub-directory per class, applying stopword removal.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes;
            every entry inside a class directory is read as one document.

    Returns:
        A tuple ``(L, classes)``: ``L`` is a list of
        ``(preprocessed_text, class_label)`` tuples and ``classes`` is the
        alphabetically sorted list of class labels.
    """
    L = []
    # Sorted so that class and document order are reproducible between runs
    # (os.listdir order is arbitrary, and so was the former list(set(...))).
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Keep the preprocessed text: the raw text used to be stored,
            # which silently discarded the preprocessing step.
            L.append((stopword_removal(document_text), class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Turns each document into a list of 0/1 term-presence flags.

    Args:
        documents: Iterable of ``(text, label)`` pairs; the label is ignored.
        significant_terms: Iterable of terms; its iteration order defines the
            vector positions.

    Returns:
        A list of vectors, one per document, where position ``i`` is 1 when
        the ``i``-th significant term is one of the document's
        whitespace-separated tokens and 0 otherwise.
    """
    vectors = []
    for text, _label in documents:
        present = set(text.split())
        vectors.append([int(candidate in present) for candidate in significant_terms])
    return vectors

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Scores every document against every class for both term partitions.

    Args:
        document_vectors: Sequence of occurrence vectors aligned with the
            iteration order of ``significant_terms``.
        term_weights_class: Mapping ``class -> {term: weight}``.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping ``class -> {'greater': terms, 'less_equal': terms}``.

    Returns:
        A tuple ``(scores, term_index)``. ``scores[doc_index][class]`` maps
        'greater' and 'less_equal' to the weighted sum of matching terms
        divided by the vector's sum (0.0 when that sum is not positive);
        ``term_index`` maps each term to its vector position.
    """
    positions = {}
    for slot, term in enumerate(significant_terms):
        positions[term] = slot

    result = defaultdict(lambda: defaultdict(dict))

    for doc_index, vector in enumerate(document_vectors):
        for k, weights in term_weights_class.items():
            # Numerators stay 0 for a partition the class does not define.
            numerators = {'greater': 0, 'less_equal': 0}
            if k in partitioned_terms:
                for side in numerators:
                    if side in partitioned_terms[k]:
                        numerators[side] = sum(
                            vector[positions[term]] * weights.get(term, 0)
                            for term in partitioned_terms[k][side]
                            if term in positions
                        )

            hits = sum(vector)
            cell = result[doc_index][k]
            for side, numerator in numerators.items():
                # 'greater' is score^k(x), 'less_equal' is score^C/k(x)
                cell[side] = numerator / hits if hits > 0 else 0.0

    return result, positions

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Predicts the class of one document from its binary term vector.

    For every class k the discriminant
        f_k(x) = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]
    is evaluated, where score_k / score_not_k are the weighted sums over the
    'greater' / 'less_equal' partitions of k divided by the sum of the vector.
    The class with the largest f_k(x) wins; ties go to the class that comes
    first in classes.

    Args:
        doc_vector: Binary vector aligned with significant_terms.
        term_weights_class: Mapping class -> {term: weight}. A missing class
            or term contributes 0, and the mapping is never modified.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored. The index is rebuilt from significant_terms so it
            always matches doc_vector; the parameter is kept so existing
            callers keep working.
        classes: Iterable of candidate class labels.

    Returns:
        A tuple (predicted_label, percent). percent is the constant 248; it
        does not depend on the inputs and is only returned because existing
        callers unpack two values.

    Raises:
        ValueError: If classes is empty.
    """
    percent = 248
    term_index = {term: i for i, term in enumerate(significant_terms)}
    # The denominator depends only on the document, not on the class.
    denominator = sum(doc_vector)

    def normalised_score(weights, terms):
        if denominator <= 0:
            return 0.0
        total = sum(
            doc_vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )
        return total / denominator

    doc_scores = {}
    for k in classes:
        # .get() instead of indexing: no KeyError on plain dicts and no
        # accidental insertion of empty entries into defaultdicts.
        weights = term_weights_class.get(k, {})
        score_k = normalised_score(weights, partitioned_terms[k]['greater'])
        score_not_k = normalised_score(weights, partitioned_terms[k]['less_equal'])

        # NOTE(review): the second coefficient is subtracted. If alpha_k holds
        # raw logistic-regression coefficients (as learn_discriminant_params
        # stores them), this is not the fitted decision function - confirm
        # the intended sign.
        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Fits one binary logistic regression per class on the two partition scores.

    Args:
        scores: Mapping doc_index -> class -> {'greater': float,
            'less_equal': float}; a missing score defaults to 0.0.
        labels: Sequence indexed by doc_index with each document's label.
        classes: Iterable of class labels.

    Returns:
        A tuple (alpha_k, alpha_0). alpha_k[k] is the fitted coefficient pair
        and alpha_0[k] the fitted intercept. Both are zero when a class has no
        samples or when all of its targets are identical.
    """
    alpha_k, alpha_0 = {}, {}

    for k in classes:
        # Keep only the documents that carry a score entry for this class.
        samples = [
            (doc_index, per_class[k])
            for doc_index, per_class in scores.items()
            if k in per_class
        ]
        X = np.array([
            [entry.get('greater', 0.0), entry.get('less_equal', 0.0)]
            for _, entry in samples
        ])
        y = np.array([1 if labels[doc_index] == k else 0 for doc_index, _ in samples])

        if X.size == 0:
            warning = f"Warning: No samples for class {k}. Skipping logistic regression."
        elif len(np.unique(y)) < 2:
            warning = f"Warning: Class imbalance for class {k}. Setting parameters to default values."
        else:
            warning = None

        if warning is not None:
            # Nothing can be fitted: fall back to all-zero parameters.
            print(warning)
            alpha_k[k] = np.array([0.0, 0.0])
            alpha_0[k] = 0.0
            continue

        model = LogisticRegression(fit_intercept=True).fit(X, y)
        alpha_k[k] = model.coef_[0]
        alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measures classification accuracy on a labelled document collection.

    Args:
        L: Sequence of (text, true_label) pairs.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        The fraction of documents whose predicted label equals the true label,
        as a float in [0, 1]; 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair. The constant
        # carries no information and must not enter the metric.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    return correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a label for every document of an unlabelled collection.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair; keep the label only.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/sarroashimax/IR Project Dataset/sraa/sraa"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0

# Count, per term and category, the number of documents that contain the term.
# Document frequencies (not raw occurrence counts) are used because a_j and b_j
# are normalised by document counts: this keeps both below 1 whenever the
# vocabulary has more than one term, so the odds below are well defined, and
# it matches the binary presence vectors used for scoring.
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = set(doc.split())  # Tokenize by whitespace; each term counts once per document
    for term in terms:
        term_category_counts[term][category] += 1
    vocabulary.update(terms)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]

# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))
total_docs = sum(category_counts[c] for c in categories)

for term, category_dict in term_category_counts.items():
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        # "All other categories" = overall total minus this category's share.
        b_j = (term_total - category_dict[category] + 1) / (total_docs - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (odds ratio). The mapping is keyed class first, then
# term, because calculate_scores and classify_document index it that way.
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Parenthesised explicitly: "a_j / 1-a_j" parses as (a_j / 1) - a_j == 0.
            odds_in = a_j / (1 - a_j)
            odds_out = b_j / (1 - b_j)
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = odds_in / odds_out
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = odds_out / odds_in

# Every term that landed in some partition, in sorted order so that vector
# positions are reproducible between runs.
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = sorted(significant_terms)

# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

# Parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

# Accuracy (measured on the training documents themselves)
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


# Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for category, term_dict in term_weights.items():
#    for term, weight in term_dict.items():
#        print(f"Term: {term}, Category: {category}, Weight: {weight}")


In [None]:
# Log of odds ratio (OR): pipeline up to evaluation on the training dataset (87.14% accuracy reported)
import nltk
nltk.download('stopwords')
import numpy as np
from collections import defaultdict, Counter
import os
from nltk.corpus import stopwords
from random import randint
from sklearn.linear_model import LogisticRegression
def stopword_removal(text):
    """
    Lowercases a text and removes English stopwords from it.

    Args:
        text: The text string to process.

    Returns:
        The lowercased, whitespace-tokenized text with every NLTK English
        stopword dropped, joined back with single spaces.
    """
    stop_words = set(stopwords.words('english'))  # Load English stopwords
    words = text.lower().split()  # Convert to lowercase and split into words
    # Keep only the words that are not stopwords.
    return " ".join(word for word in words if word not in stop_words)

def load_20newsgroups_data(data_dir):
    """
    Loads a one-folder-per-class text dataset and applies stopword removal.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes;
            every regular file inside a class directory is one document.

    Returns:
        A tuple (L, classes): L is a list of (preprocessed_text, class_label)
        pairs and classes is the sorted list of class labels.
    """
    L = []
    # Sorted so that class and document order are reproducible between runs.
    class_folders = sorted(
        f for f in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, f))
    )

    for class_label in class_folders:
        class_path = os.path.join(data_dir, class_label)
        for file_name in sorted(os.listdir(class_path)):
            file_path = os.path.join(class_path, file_name)
            if not os.path.isfile(file_path):
                continue  # Nested directories are not documents
            with open(file_path, 'r', errors='ignore') as file:
                document_text = file.read()
            # Store the preprocessed text, not the raw file content.
            preprocessed_text = stopword_removal(document_text)
            L.append((preprocessed_text, class_label))

    return L, class_folders

def create_document_vectors(documents, significant_terms):
    """
    Turns each document into a binary vector over the significant terms.

    Args:
        documents: Iterable of (text, label) pairs; the label is ignored.
        significant_terms: Iterable of terms. Its iteration order fixes the
            position of each term inside the vectors.

    Returns:
        A list with one list per document, holding 1 where the term occurs
        among the document's whitespace-separated tokens and 0 otherwise.
    """
    def presence_vector(text):
        tokens = set(text.split())
        return [1 if term in tokens else 0 for term in significant_terms]

    return [presence_vector(text) for text, _ in documents]

def calculate_scores(document_vectors, term_weights_class, significant_terms, partitioned_terms):
    """
    Computes, per document and class, the two normalised partition scores.

    For every class k found in term_weights_class, the weights of the terms
    present in the document are summed separately over the 'greater' and the
    'less_equal' partition of k, and each sum is divided by the number of
    significant terms present in the document (the sum of its vector).

    Args:
        document_vectors: Iterable of binary vectors aligned with
            significant_terms.
        term_weights_class: Mapping class -> {term: weight}. Terms without a
            weight contribute 0.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}. A missing class or partition scores 0.

    Returns:
        A tuple (scores, term_index) where scores[doc_index][k] is
        {'greater': float, 'less_equal': float} and term_index maps each
        significant term to its vector position.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    term_index = {term: i for i, term in enumerate(significant_terms)}

    def weighted_sum(vector, weights, terms):
        # Terms outside the vector space are skipped rather than raising.
        return sum(
            vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )

    for doc_index, vector in enumerate(document_vectors):
        # The denominator depends only on the document, so compute it once
        # instead of once per class.
        denominator = sum(vector)

        for k, weights in term_weights_class.items():
            partition = partitioned_terms.get(k, {})
            numerator_greater = weighted_sum(vector, weights, partition.get('greater', ()))
            numerator_less_equal = weighted_sum(vector, weights, partition.get('less_equal', ()))

            if denominator > 0:
                scores[doc_index][k]['greater'] = numerator_greater / denominator  # score^k(x)
                scores[doc_index][k]['less_equal'] = numerator_less_equal / denominator  # score^C/k(x)
            else:
                # A document with no significant term cannot be scored.
                scores[doc_index][k]['greater'] = 0.0
                scores[doc_index][k]['less_equal'] = 0.0

    return scores, term_index

def classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes):
    """
    Predicts the class of one document from its binary term vector.

    For every class k the discriminant
        f_k(x) = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]
    is evaluated, where score_k / score_not_k are the weighted sums over the
    'greater' / 'less_equal' partitions of k divided by the sum of the vector.
    The class with the largest f_k(x) wins; ties go to the class that comes
    first in classes.

    Args:
        doc_vector: Binary vector aligned with significant_terms.
        term_weights_class: Mapping class -> {term: weight}. A missing class
            or term contributes 0, and the mapping is never modified.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        term_index: Ignored. The index is rebuilt from significant_terms so it
            always matches doc_vector; the parameter is kept so existing
            callers keep working.
        classes: Iterable of candidate class labels.

    Returns:
        A tuple (predicted_label, percent). percent is the constant 244; it
        does not depend on the inputs and is only returned because existing
        callers unpack two values.

    Raises:
        ValueError: If classes is empty.
    """
    percent = 244
    term_index = {term: i for i, term in enumerate(significant_terms)}
    # The denominator depends only on the document, not on the class.
    denominator = sum(doc_vector)

    def normalised_score(weights, terms):
        if denominator <= 0:
            return 0.0
        total = sum(
            doc_vector[term_index[term]] * weights.get(term, 0)
            for term in terms
            if term in term_index
        )
        return total / denominator

    doc_scores = {}
    for k in classes:
        # .get() instead of indexing: no KeyError on plain dicts and no
        # accidental insertion of empty entries into defaultdicts.
        weights = term_weights_class.get(k, {})
        score_k = normalised_score(weights, partitioned_terms[k]['greater'])
        score_not_k = normalised_score(weights, partitioned_terms[k]['less_equal'])

        # NOTE(review): the second coefficient is subtracted. If alpha_k holds
        # raw logistic-regression coefficients (as learn_discriminant_params
        # stores them), this is not the fitted decision function - confirm
        # the intended sign.
        doc_scores[k] = alpha_k[k][0] * score_k - alpha_k[k][1] * score_not_k + alpha_0[k]

    predicted_label = max(doc_scores, key=doc_scores.get)
    return predicted_label, percent

def learn_discriminant_params(scores, labels, classes):
    """
    Learns, per class, a logistic-regression model on the two partition scores.

    Args:
        scores: Mapping doc_index -> class -> {'greater': float,
            'less_equal': float}; a missing score defaults to 0.0.
        labels: Sequence indexed by doc_index with each document's label.
        classes: Iterable of class labels.

    Returns:
        A tuple (alpha_k, alpha_0). alpha_k[k] is the fitted coefficient pair
        and alpha_0[k] the fitted intercept. Both are zero when a class has no
        samples or when all of its targets are identical.
    """
    alpha_k, alpha_0 = {}, {}

    def use_defaults(k, warning):
        # Nothing can be fitted for this class: report it and store zeros.
        print(warning)
        alpha_k[k] = np.array([0.0, 0.0])
        alpha_0[k] = 0.0

    for k in classes:
        features, targets = [], []
        for doc_index, per_class in scores.items():
            if k not in per_class:
                continue
            entry = per_class[k]
            features.append([entry.get('greater', 0.0), entry.get('less_equal', 0.0)])
            targets.append(1 if labels[doc_index] == k else 0)

        X, y = np.array(features), np.array(targets)

        if X.size == 0:
            use_defaults(k, f"Warning: No samples for class {k}. Skipping logistic regression.")
        elif len(np.unique(y)) < 2:
            use_defaults(k, f"Warning: Class imbalance for class {k}. Setting parameters to default values.")
        else:
            model = LogisticRegression(fit_intercept=True).fit(X, y)
            alpha_k[k] = model.coef_[0]
            alpha_0[k] = model.intercept_[0]

    return alpha_k, alpha_0


def evaluate_classifier(L, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Measures classification accuracy on a labelled document collection.

    Args:
        L: Sequence of (text, true_label) pairs.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        The fraction of documents whose predicted label equals the true label,
        as a float in [0, 1]; 0.0 when L is empty.
    """
    if not L:
        return 0.0

    correct = 0
    for doc, true_label in L:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair. The constant
        # carries no information and must not enter the metric.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        if predicted_label == true_label:
            correct += 1

    return correct / len(L)

def test_classifier(U, term_weights_class, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index):
    """
    Predicts a label for every document of an unlabelled collection.

    Args:
        U: Iterable of (text, label) pairs; the label is ignored.
        term_weights_class: Mapping class -> {term: weight}.
        classes: Iterable of candidate class labels.
        alpha_k: Mapping class -> pair of coefficients.
        alpha_0: Mapping class -> intercept.
        significant_terms: Iterable of terms defining the vector positions.
        partitioned_terms: Mapping class -> {'greater': terms,
            'less_equal': terms}.
        term_index: Passed through to classify_document.

    Returns:
        A list with one predicted label per document, in input order.
    """
    predicted_labels = []

    for doc, _ in U:
        terms = set(doc.split())
        doc_vector = [1 if term in terms else 0 for term in significant_terms]
        # classify_document returns a (label, constant) pair; keep the label only.
        predicted_label, _ = classify_document(doc_vector, term_weights_class, significant_terms, partitioned_terms, alpha_k, alpha_0, term_index, classes)
        predicted_labels.append(predicted_label)

    return predicted_labels

# Example data
folder_path = "/sarroashimax/IR Project Dataset/sraa/sraa"  # Replace with your actual path
documents, categories = load_20newsgroups_data(folder_path)
threshold = 0

# Count, per term and category, the number of documents that contain the term.
# Document frequencies (not raw occurrence counts) are used because a_j and b_j
# are normalised by document counts: this keeps both below 1 whenever the
# vocabulary has more than one term, so the odds below are well defined, and
# it matches the binary presence vectors used for scoring.
term_category_counts = defaultdict(lambda: defaultdict(int))
category_counts = defaultdict(int)
vocabulary = set()
partitioned_terms = {category: {'greater': set(), 'less_equal': set()} for category in categories}

for doc, category in documents:
    category_counts[category] += 1
    terms = set(doc.split())  # Tokenize by whitespace; each term counts once per document
    for term in terms:
        term_category_counts[term][category] += 1
    vocabulary.update(terms)

vocabulary_size = len(vocabulary)
labels = [label for _, label in documents]

# Compute probabilities a_j and b_j with add-one Laplace smoothing
term_probabilities = defaultdict(lambda: defaultdict(float))
total_docs = sum(category_counts[c] for c in categories)

for term, category_dict in term_category_counts.items():
    term_total = sum(category_dict[c] for c in categories)
    for category in categories:
        a_j = (category_dict[category] + 1) / (category_counts[category] + vocabulary_size)
        # "All other categories" = overall total minus this category's share.
        b_j = (term_total - category_dict[category] + 1) / (total_docs - category_counts[category] + vocabulary_size)
        term_probabilities[term][category] = (a_j, b_j)

# Compute weights w_kj (log of the odds ratio). The mapping is keyed class
# first, then term, because calculate_scores and classify_document index it
# that way.
term_weights = defaultdict(lambda: defaultdict(float))

for term, category_dict in term_probabilities.items():
    for category, (a_j, b_j) in category_dict.items():
        if abs(a_j - b_j) > threshold:
            # Parenthesised explicitly: "a_j / 1-a_j" parses as (a_j / 1) - a_j == 0,
            # which made the original take log(0).
            odds_in = a_j / (1 - a_j)
            odds_out = b_j / (1 - b_j)
            if a_j > b_j:
                partitioned_terms[category]['greater'].add(term)  # Z^k
                term_weights[category][term] = np.log(odds_in / odds_out)
            else:
                partitioned_terms[category]['less_equal'].add(term)  # Z^(C/k)
                term_weights[category][term] = np.log(odds_out / odds_in)

# Every term that landed in some partition, in sorted order so that vector
# positions are reproducible between runs.
significant_terms = set()
for k in categories:
    significant_terms.update(partitioned_terms[k]['greater'])
    significant_terms.update(partitioned_terms[k]['less_equal'])
significant_terms = sorted(significant_terms)

# Document vectors with term occurrence
document_vectors = create_document_vectors(documents, significant_terms)

# Scores
scores, term_index = calculate_scores(document_vectors, term_weights, significant_terms, partitioned_terms)

# Parameters
alpha_k, alpha_0 = learn_discriminant_params(scores, labels, categories)

# Accuracy (measured on the training documents themselves)
accuracy = evaluate_classifier(documents, term_weights, categories, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
print(f"Accuracy: {accuracy}")

#plot_feature_space(scores, labels, alpha_k, alpha_0, categories)


# Test Data
#data_dir = "/content/drive/MyDrive/IR Project Dataset/20news-bydate/20news-bydate-test"
#U, classes = load_20newsgroups_data(data_dir)

#predicted_labels = test_classifier(U, term_weights, classes, alpha_k, alpha_0, significant_terms, partitioned_terms, term_index)
#print("Predicted labels for unlabeled documents:", predicted_labels)
# Display term weights
#for category, term_dict in term_weights.items():
#    for term, weight in term_dict.items():
#        print(f"Term: {term}, Category: {category}, Weight: {weight}")
