In [42]:
#Unigram CLASSIFIER

from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import math
import re

# English stop words; U_preprocess discards any token found in this set.
# NOTE(review): U_preprocess tokenizes with r'\b\w+\b', which does not match apostrophes, so the
# contraction entries below (e.g. "don't") look unreachable from that tokenizer — confirm.
stop_words = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing',
    "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't",
    'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself',
    'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',
    "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no',
    'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so',
    'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're",
    "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while',
    'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll",
    "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
}

def U_preprocess(text):
    """Turn a description into the unigram tokens used by the unigram model.

    The text is lowercased and split into runs of word characters. A token is
    kept only if it is not in ``stop_words`` and is longer than two characters.

    Returns:
        list of surviving tokens, in their original order (duplicates kept).
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        if len(token) <= 2:
            continue
        kept.append(token)
    return kept

def UnigramClassifier(alpha, X_train, X_val, y_train):
    """Naive Bayes classifier over the unigram tokens produced by U_preprocess.

    Class priors are inverse-frequency weights (1 / documents in the class,
    normalised to sum to 1), so rarer classes receive a larger prior.

    Args:
        alpha: additive smoothing constant applied to every token count.
        X_train: sequence of training description strings.
        X_val: iterable of description strings to classify.
        y_train: sequence of labels aligned with X_train; each label must be
            one of "A", "S", "G", "W".

    Returns:
        list containing one predicted label per element of X_val, in order.

    Raises:
        ValueError: if X_train and y_train differ in length, or if there are
            no training examples at all.
        KeyError: if a training label is not one of the four known classes.
    """
    projs = ["A", "S", "G", "W"]

    total_instances = len(X_train)
    if total_instances != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    word_counts = {proj: Counter() for proj in projs}
    total_words = dict.fromkeys(projs, 0)
    num_projs = dict.fromkeys(projs, 0)
    vocab = set()

    # Accumulate per-class token counts and the global vocabulary.
    for desc, label in zip(X_train, y_train):
        tokens = U_preprocess(desc)
        word_counts[label].update(tokens)
        total_words[label] += len(tokens)
        vocab.update(tokens)
        num_projs[label] += 1

    # A class with no training documents cannot be scored (its inverse-frequency
    # weight would be 1/0), so it is simply never predicted.
    present = [proj for proj in projs if num_projs[proj] > 0]
    if not present:
        raise ValueError("at least one training example is required")

    inv_freq = {proj: 1.0 / num_projs[proj] for proj in present}
    total_weight = sum(inv_freq.values())
    log_prior = {proj: np.log(inv_freq[proj] / total_weight) for proj in present}

    # NOTE(review): textbook additive smoothing adds alpha * len(vocab) here. The
    # document count is kept because the alpha values used by callers were
    # presumably tuned against this form — revisit both together.
    denom = {proj: total_words[proj] + alpha * total_instances for proj in present}

    classifications = []
    for desc in X_val:
        # Tokens never seen in training carry no class-specific evidence (they would
        # add the same constant to every class), so they are skipped.
        tokens = [tok for tok in U_preprocess(desc) if tok in vocab]

        scores = {}
        for proj in present:
            counts = word_counts[proj]  # Counter: unseen-in-class tokens read as 0
            score = log_prior[proj]
            for tok in tokens:
                score += np.log((counts[tok] + alpha) / denom[proj])
            scores[proj] = score

        # Ties resolve to the earliest class in `projs` order.
        classifications.append(max(scores, key=scores.get))

    return classifications




In [43]:
#Bigram CLASSIFIER

from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import math
import re

# Stop-word set, re-declared in this cell. B_preprocess below does not read it, so bigrams are
# built from unfiltered tokens.
stop_words = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing',
    "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't",
    'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself',
    'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',
    "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no',
    'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so',
    'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're",
    "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while',
    'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll",
    "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
}

def B_preprocess(text):
    """Return the adjacent word pairs of ``text`` as ``"first_second"`` strings.

    The text is lowercased and split into runs of word characters; no stop-word
    or length filtering is applied. Fewer than two words yields an empty list.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [f"{left}_{right}" for left, right in zip(tokens, tokens[1:])]

def BigramClassifier(alpha, X_train, X_val, y_train):
    """Naive Bayes classifier over the word bigrams produced by B_preprocess.

    Class priors are inverse-frequency weights (1 / documents in the class,
    normalised to sum to 1), so rarer classes receive a larger prior.

    Args:
        alpha: additive smoothing constant applied to every bigram count.
        X_train: sequence of training description strings.
        X_val: iterable of description strings to classify.
        y_train: sequence of labels aligned with X_train; each label must be
            one of "A", "S", "G", "W".

    Returns:
        list containing one predicted label per element of X_val, in order.

    Raises:
        ValueError: if X_train and y_train differ in length, or if there are
            no training examples at all.
        KeyError: if a training label is not one of the four known classes.
    """
    projs = ["A", "S", "G", "W"]

    total_instances = len(X_train)
    if total_instances != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    word_counts = {proj: Counter() for proj in projs}
    total_words = dict.fromkeys(projs, 0)
    num_projs = dict.fromkeys(projs, 0)
    vocab = set()

    # Accumulate per-class bigram counts and the global bigram vocabulary.
    for desc, label in zip(X_train, y_train):
        bigrams = B_preprocess(desc)
        word_counts[label].update(bigrams)
        total_words[label] += len(bigrams)
        vocab.update(bigrams)
        num_projs[label] += 1

    # A class with no training documents cannot be scored (its inverse-frequency
    # weight would be 1/0), so it is simply never predicted.
    present = [proj for proj in projs if num_projs[proj] > 0]
    if not present:
        raise ValueError("at least one training example is required")

    inv_freq = {proj: 1.0 / num_projs[proj] for proj in present}
    total_weight = sum(inv_freq.values())
    log_prior = {proj: np.log(inv_freq[proj] / total_weight) for proj in present}

    # NOTE(review): textbook additive smoothing adds alpha * len(vocab) here. The
    # document count is kept because the alpha values used by callers were
    # presumably tuned against this form — revisit both together.
    denom = {proj: total_words[proj] + alpha * total_instances for proj in present}

    classifications = []
    for desc in X_val:
        # Bigrams never seen in training carry no class-specific evidence (they would
        # add the same constant to every class), so they are skipped.
        bigrams = [bg for bg in B_preprocess(desc) if bg in vocab]

        scores = {}
        for proj in present:
            counts = word_counts[proj]  # Counter: unseen-in-class bigrams read as 0
            score = log_prior[proj]
            for bg in bigrams:
                score += np.log((counts[bg] + alpha) / denom[proj])
            scores[proj] = score

        # Ties resolve to the earliest class in `projs` order.
        classifications.append(max(scores, key=scores.get))

    return classifications



In [44]:
#Trigram CLASSIFIER (exposed as UniBigramClassifier; its features are the word trigrams built by T_preprocess)

from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import math
import re

# Stop-word set, re-declared in this cell. The only use in T_preprocess below is commented out,
# so trigrams are currently built from unfiltered tokens.
stop_words = {
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",
    'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing',
    "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't",
    'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself',
    'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',
    "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no',
    'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so',
    'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're",
    "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while',
    'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll",
    "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
}

def T_preprocess(text):
    """Return the word trigrams of ``text`` as ``"w1_w2_w3"`` strings.

    The text is lowercased and split into runs of word characters; no stop-word
    or length filtering is applied. Fewer than three words yields an empty list.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    trigrams = []
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        trigrams.append("_".join((first, second, third)))
    return trigrams

def UniBigramClassifier(alpha, X_train, X_val, y_train):
    """Naive Bayes classifier over the features produced by T_preprocess.

    Despite the function name, the features are word trigrams, because that is
    what T_preprocess emits. Class priors are inverse-frequency weights
    (1 / documents in the class, normalised to sum to 1).

    Args:
        alpha: additive smoothing constant applied to every feature count.
        X_train: sequence of training description strings.
        X_val: iterable of description strings to classify.
        y_train: sequence of labels aligned with X_train; each label must be
            one of "A", "S", "G", "W".

    Returns:
        list containing one predicted label per element of X_val, in order.

    Raises:
        ValueError: if X_train and y_train differ in length, or if there are
            no training examples at all.
        KeyError: if a training label is not one of the four known classes.
    """
    projs = ["A", "S", "G", "W"]

    total_instances = len(X_train)
    if total_instances != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    word_counts = {proj: Counter() for proj in projs}
    total_words = dict.fromkeys(projs, 0)
    num_projs = dict.fromkeys(projs, 0)
    vocab = set()

    # Accumulate per-class feature counts and the global feature vocabulary.
    for desc, label in zip(X_train, y_train):
        feats = T_preprocess(desc)
        word_counts[label].update(feats)
        total_words[label] += len(feats)
        vocab.update(feats)
        num_projs[label] += 1

    # A class with no training documents cannot be scored (its inverse-frequency
    # weight would be 1/0), so it is simply never predicted.
    present = [proj for proj in projs if num_projs[proj] > 0]
    if not present:
        raise ValueError("at least one training example is required")

    inv_freq = {proj: 1.0 / num_projs[proj] for proj in present}
    total_weight = sum(inv_freq.values())
    log_prior = {proj: np.log(inv_freq[proj] / total_weight) for proj in present}

    # NOTE(review): textbook additive smoothing adds alpha * len(vocab) here. The
    # document count is kept because the alpha values used by callers were
    # presumably tuned against this form — revisit both together.
    denom = {proj: total_words[proj] + alpha * total_instances for proj in present}

    classifications = []
    for desc in X_val:
        # Features never seen in training carry no class-specific evidence (they would
        # add the same constant to every class), so they are skipped.
        feats = [ft for ft in T_preprocess(desc) if ft in vocab]

        scores = {}
        for proj in present:
            counts = word_counts[proj]  # Counter: unseen-in-class features read as 0
            score = log_prior[proj]
            for ft in feats:
                score += np.log((counts[ft] + alpha) / denom[proj])
            scores[proj] = score

        # Ties resolve to the earliest class in `projs` order.
        classifications.append(max(scores, key=scores.get))

    return classifications




In [45]:
def EnsembleClassifier(alpha1, alpha2, alpha3, X_train, X_val, y_train):
    """Combine the three classifiers by majority vote.

    alpha1, alpha2 and alpha3 are passed to UnigramClassifier, BigramClassifier
    and UniBigramClassifier respectively. For each validation item the label
    chosen by at least two models wins; when all three disagree, the
    BigramClassifier prediction is used.

    Returns:
        list with one label per prediction returned by UnigramClassifier.
    """
    unigram_preds = UnigramClassifier(alpha1, X_train, X_val, y_train)
    bigram_preds = BigramClassifier(alpha2, X_train, X_val, y_train)
    third_preds = UniBigramClassifier(alpha3, X_train, X_val, y_train)

    classifications = []
    for idx, unigram_vote in enumerate(unigram_preds):
        votes = Counter((unigram_vote, bigram_preds[idx], third_preds[idx]))
        if len(votes) != 3:
            # At least two models agree: take the most frequent label.
            winner = max(votes, key=votes.get)
        else:
            # Three-way disagreement: fall back to the bigram model's answer.
            winner = bigram_preds[idx]
        classifications.append(winner)

    return classifications


In [46]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

#5 Fold Cross-Validation Method
def CheckAccuracy(classifications, y_val):
    """Return the fraction of predictions equal to the label at the same index.

    Only the first ``len(classifications)`` entries of ``y_val`` are consulted.
    An empty ``classifications`` raises ZeroDivisionError.
    """
    hits = sum(1 for idx, guess in enumerate(classifications) if guess == y_val[idx])
    return hits / len(classifications)
    

def CrossValidate():
    """Run stratified 5-fold cross-validation of EnsembleClassifier on train.csv.

    Reads the "Description" and "Class" columns of train.csv, scores each fold
    with CheckAccuracy, and prints the per-fold and mean accuracy.
    Nothing is returned.
    """
    print(f"5 Fold Cross-Validation Test on Classifier(s):\n")

    frame = pd.read_csv("train.csv")
    descriptions = frame["Description"].values
    labels = frame["Class"].values

    # Fixed seed keeps the fold assignment reproducible between runs.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(descriptions, labels), start=1):
        # To score a single model instead, swap in one of the settings tried earlier:
        # UnigramClassifier(6.5, ...), BigramClassifier(0.007, ...) or UniBigramClassifier(1, ...).
        predictions = EnsembleClassifier(
            6.5, 0.007, 2,
            descriptions[train_idx], descriptions[val_idx], labels[train_idx],
        )
        score = CheckAccuracy(predictions, labels[val_idx])
        fold_scores.append(score)

        print(f"Fold {fold_no} Accuracy: {score:.2%}.")

    print(f"\nAverage cross-validated accuracy is {np.mean(fold_scores):.2%}\n")

# Run the evaluation; CrossValidate prints the per-fold and average accuracy itself.
CrossValidate()

5 Fold Cross-Validation Test on Classifier(s):

Fold 1 Accuracy: 98.41%.
Fold 2 Accuracy: 97.39%.
Fold 3 Accuracy: 97.39%.
Fold 4 Accuracy: 98.18%.
Fold 5 Accuracy: 98.41%.

Average cross-validated accuracy is 97.95%

