*بخش اول:*
---
### 1_1
Using n-grams as features for classification is common in NLP. After tokenization, all n-grams of each text are extracted and counted, and each text is then represented by a feature vector of its n-gram frequencies. These feature vectors, together with their labels, are then used to train a classifier.

### 1_2

When using higher n-grams (like trigrams) with a small dataset, several problems can occur:

        1-Data sparsity: Many n-grams might appear rarely, leading to weaker models.
        
        2-Overfitting: The model may learn patterns specific to the training data and fail to generalize.
        
        3-Slower training: The feature space becomes too large, making training slower.

*بخش دوم:*
---

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from collections import Counter, defaultdict
import numpy as np
import random
from tokenizers import ByteLevelBPETokenizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm


# Read the dataset and print summary statistics of its numeric columns.
data = pd.read_csv('data/digikala.csv')
print(data.describe())

# The 'Text' column is the model input; the 'Suggestion' column is the label.
texts = data['Text'].tolist()
labels = data['Suggestion'].tolist()

# 3_2: hold out 20% of the samples for testing; the fixed seed keeps the
# split identical between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# 2_1: train the byte-level BPE tokenizer on the training texts only, so the
# held-out texts do not influence the learned vocabulary.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(train_texts, vocab_size=20000, min_frequency=10)


def _encode_all(corpus):
    """Return the token strings of every text in *corpus*, one list per text."""
    return [tokenizer.encode(document).tokens for document in corpus]


# 2_2: tokenize both splits with the trained tokenizer.
train_tokens = _encode_all(train_texts)
test_tokens = _encode_all(test_texts)


             Score  Suggestion
count  3261.000000  3261.00000
mean     74.719411     1.41061
std      21.514015     0.72408
min       0.000000     1.00000
25%      60.000000     1.00000
50%      80.000000     1.00000
75%      92.000000     2.00000
max     100.000000     3.00000


*بخش سوم:*
---
### 3_4
Due to the imbalance of the dataset, the classifier prioritizes the majority class and ignores the minority classes. Therefore, it leads to high precision and low recall. 
Precision being high means when the model makes a positive prediction, it is usually correct.
Recall being low means our model has missed many true positives.

### 3_5
In these situations, it is better to use ngram classifiers instead of complex classifiers with many parameters:

        1- When the data is limited

        2- When computational cost and speed are more important

        3- When the patterns are simple and strict

In [None]:
# 3_1

class NGramClassifier:
    """Text classifier that feeds n-gram count vectors to a wrapped estimator.

    Each document (a sequence of tokens) is turned into a vector holding the
    count of every vocabulary n-gram it contains. The vocabulary is the set of
    n-grams seen by ``fit``. The wrapped ``classifier`` only needs
    ``fit(X, y)`` and ``predict(X)`` methods.

    Attributes:
        n: Length of the n-grams.
        vocab: N-grams known to the model. An empty set before fitting;
            a sorted list (one entry per feature column) afterwards.
        classifier: The wrapped estimator.
        ngram_probabilities: Empty dict; not used by any method of this class.
    """

    def __init__(self, n=2, classifier=None):
        """Create a classifier over n-grams of length *n*.

        Args:
            n: N-gram length; must be at least 1.
            classifier: Estimator with ``fit``/``predict``. When omitted, a new
                ``svm.SVC()`` is created for this instance.

        Raises:
            ValueError: If *n* is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n
        self.vocab = set()
        # The default estimator is created here rather than in the signature:
        # a default argument is evaluated once, so every instance would
        # otherwise share (and overwrite) the same fitted estimator.
        self.classifier = svm.SVC() if classifier is None else classifier
        self.ngram_probabilities = {}

    def count_ngrams(self, tokens, n):
        """Return a ``Counter`` mapping each n-gram tuple in *tokens* to its count.

        Sequences shorter than *n* yield an empty ``Counter``.
        """
        return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

    def _freeze_vocab(self):
        """Fix the column order of the vocabulary and return an n-gram -> column map."""
        if not isinstance(self.vocab, list):
            # Sorting makes the feature layout reproducible between runs;
            # plain set iteration order is not.
            self.vocab = sorted(self.vocab)
        return {ngram: column for column, ngram in enumerate(self.vocab)}

    def _build_matrix(self, all_ngrams):
        """Return a (documents x vocabulary) integer matrix of n-gram counts."""
        index = self._freeze_vocab()
        matrix = np.zeros((len(all_ngrams), len(index)), dtype=int)
        for row, ngrams in enumerate(all_ngrams):
            # Walk the n-grams of the document instead of the whole vocabulary;
            # n-grams outside the vocabulary are ignored.
            for ngram, count in ngrams.items():
                column = index.get(ngram)
                if column is not None:
                    matrix[row, column] = count
        return matrix

    def generate_feature_vector(self, ngrams):
        """Return the count vector of one document.

        Args:
            ngrams: Mapping from n-gram tuple to count, as produced by
                ``count_ngrams``.

        Returns:
            1-D integer array with one entry per vocabulary n-gram.
        """
        return self._build_matrix([ngrams])[0]

    def fit(self, text_tokens, labels):
        """Build the vocabulary from *text_tokens* and train the estimator.

        Args:
            text_tokens: Iterable of token sequences, one per document.
            labels: One label per document, in the same order.
        """
        # Start from an empty vocabulary so the model can be fitted again.
        self.vocab = set()
        all_ngrams = []
        for tokens in text_tokens:
            ngrams = self.count_ngrams(tokens, self.n)
            all_ngrams.append(ngrams)
            self.vocab.update(ngrams)

        X = self._build_matrix(all_ngrams)

        # Diagnostic: largest count of any single n-gram within one document.
        # Computed over the whole matrix, whatever the number of documents.
        if X.size:
            print(X.max())

        y = np.array(labels)

        self.classifier.fit(X, y)

    def predict(self, texts):
        """Predict a label for every token sequence in *texts*.

        Args:
            texts: Iterable of token sequences (already tokenized documents).

        Returns:
            Whatever the wrapped estimator's ``predict`` returns.
        """
        all_ngrams = [self.count_ngrams(tokens, self.n) for tokens in texts]
        X = self._build_matrix(all_ngrams)
        return self.classifier.predict(X)

    # 3_3
    def evaluate(self, texts, labels):
        """Return ``(accuracy, precision, recall)`` of the model on *texts*.

        Precision and recall are macro-averaged over the classes.
        """
        predictions = self.predict(texts)
        accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, average='macro')
        recall = recall_score(labels, predictions, average='macro')

        return accuracy, precision, recall


# 3_2
# Train and evaluate a 2-gram and a 3-gram model on the same train/test split.
classifier = LogisticRegression()

bigram_model = NGramClassifier(n=2, classifier=classifier)
bigram_model.fit(train_tokens, train_labels)

accuracy_2gram, precision_2gram, recall_2gram = bigram_model.evaluate(test_tokens, test_labels) # 3_3
print(f"2-Gram Model - Accuracy: {accuracy_2gram:.2f}, Precision: {precision_2gram:.2f}, Recall: {recall_2gram:.2f}")


# The 3-gram model gets its own estimator. Reusing `classifier` would refit
# the object held by `bigram_model` on 3-gram features, leaving the 2-gram
# model unusable for any later prediction.
threegram_model = NGramClassifier(n=3, classifier=LogisticRegression())
threegram_model.fit(train_tokens, train_labels)

accuracy_3gram, precision_3gram, recall_3gram = threegram_model.evaluate(test_tokens, test_labels) # 3_3
print(f"3-Gram Model - Accuracy: {accuracy_3gram:.2f}, Precision: {precision_3gram:.2f}, Recall: {recall_3gram:.2f}")

# from collections import defaultdict, Counter
# import numpy as np

# class NGramClassifier:
#     def __init__(self, n=2):
#         self.n = n
#         self.vocab = set()
#         self.ngram_counts_per_class = defaultdict(Counter)
#         self.total_ngrams_per_class = defaultdict(int)
#         self.class_counts = Counter()
#         self.classes = []

#     def count_ngrams(self, tokens, n):
#         """Count the n-grams in the given tokens."""
#         ngrams = Counter([tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)])
#         return ngrams

#     def fit(self, text_tokens, labels):
#         """Train the n-gram classifier by counting n-grams per class."""
#         self.classes = list(set(labels))  # Store unique classes
#         for tokens, label in zip(text_tokens, labels):
#             ngrams = self.count_ngrams(tokens, self.n)
#             self.ngram_counts_per_class[label].update(ngrams)
#             self.total_ngrams_per_class[label] += sum(ngrams.values())
#             self.vocab.update(ngrams)
#             self.class_counts[label] += 1

#     def calculate_class_prob(self, ngrams, class_label):
#         """Calculate the log probability of the class given the n-grams."""
#         total_ngrams = self.total_ngrams_per_class[class_label]
#         class_prob = np.log(self.class_counts[class_label] / sum(self.class_counts.values()))  # P(class)

#         log_prob = class_prob  # Start with prior
#         for ngram, count in ngrams.items():
#             # Use Laplace smoothing for unseen n-grams
#             ngram_count = self.ngram_counts_per_class[class_label].get(ngram, 0) + 1
#             prob = ngram_count / (total_ngrams + len(self.vocab))  # P(ngram | class)
#             log_prob += count * np.log(prob)  # log(P(ngram | class)) * count

#         return log_prob

#     def predict(self, texts):
#         """Predict the class for each input text based on n-grams."""
#         predictions = []
#         for tokens in texts:
#             ngrams = self.count_ngrams(tokens, self.n)
#             class_probs = {cls: self.calculate_class_prob(ngrams, cls) for cls in self.classes}
#             predicted_class = max(class_probs, key=class_probs.get)
#             predictions.append(predicted_class)

#         return predictions

#     def evaluate(self, texts, labels):
#         """Evaluate the classifier on a test set."""
#         predictions = self.predict(texts)
#         accuracy = np.mean(np.array(predictions) == np.array(labels))
#         return accuracy

# # Example usage
# ngram_model = NGramClassifier(n=2)
# ngram_model.fit(train_tokens, train_labels)

# # Evaluate model
# accuracy = ngram_model.evaluate(test_tokens, test_labels)
# print(f"Accuracy: {accuracy:.2f}")


18
2-Gram Model - Accuracy: 0.75, Precision: 0.67, Recall: 0.42
