In [1]:
#imports
from typing import Text, Tuple, List, Union, Iterable
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix, spmatrix
import pandas as pd
import os

# Type alias accepting either a dense NumPy array or a SciPy sparse matrix.
NDArray = Union[np.ndarray, spmatrix]

In [2]:
# Define the TextToFeatures class
class TextToFeatures:
    """TF-IDF feature extractor with a feature-name -> column-index lookup.

    The wrapped ``TfidfVectorizer`` is configured with ``ngram_range=(1, 2)``.
    ``index_map`` is ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        # Filled in by fit(); None marks the extractor as not yet fitted.
        self.index_map = None
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2))

    def fit(self, training_texts: Iterable[Text]) -> None:
        """Fit the vectorizer on ``training_texts`` and rebuild ``index_map``.

        Args:
            training_texts: Documents handed to the vectorizer's ``fit``.
        """
        self.vectorizer.fit(training_texts)

        # Map each feature name to its position in the vectorizer's output.
        lookup = {}
        for column, name in enumerate(self.vectorizer.get_feature_names_out()):
            lookup[name] = column
        self.index_map = lookup

    def index(self, feature: Text) -> Union[None, int]:
        """Return the column index recorded for ``feature``, or ``None`` if unknown.

        Raises:
            AttributeError: If called before ``fit`` (``index_map`` is still ``None``).
        """
        column = self.index_map.get(feature)
        return column

    def transform(self, texts: Iterable[Text]) -> csr_matrix:
        """Return the fitted vectorizer's transform of ``texts``."""
        return self.vectorizer.transform(texts)

In [3]:
# Define the TextToLabels class
class TextToLabels:
    """Text classifier: a ``CountVectorizer`` feeding a ``LogisticRegression``.

    Args:
        random_state: Optional seed forwarded to ``LogisticRegression``. The
            default ``None`` leaves the solver unseeded; pass an integer to
            make training repeatable.
    """

    def __init__(self, random_state: Union[None, int] = None):
        self.vectorizer = CountVectorizer()
        self.model = LogisticRegression(
            max_iter=100000,
            solver='saga',
            random_state=random_state,
        )

    def fit(self, training_text: Iterable[Text], training_labels: Iterable[int]) -> None:
        """Learn the vocabulary and train the classifier.

        Args:
            training_text: Training documents; any iterable, including a
                one-shot generator.
            training_labels: One label per training document, in the same order.
        """
        # fit_transform reads the documents exactly once. Calling fit() and then
        # transform() would hand an already-exhausted iterator to transform()
        # whenever the caller passes a generator.
        train_features = self.vectorizer.fit_transform(training_text)
        # Materialise the labels so a lazy iterable is accepted here as well.
        self.model.fit(train_features, list(training_labels))

    def transform(self, test_text: Iterable[Text]) -> np.ndarray:
        """Predict a label for each document in ``test_text``.

        Args:
            test_text: Documents to classify with the fitted vectorizer and model.

        Returns:
            The model's predictions, one per input document.
        """
        test_features = self.vectorizer.transform(test_text)
        return self.model.predict(test_features)

In [4]:
#read in data & pre-process
def process_text(input_file_path: Text, output_file_path: Text, is_train_data: bool,
                 encoding: Text = 'utf-8') -> List[Tuple[Text, Text, Text]]:
    """Parse a comma-separated review file into ``(id, text, label)`` tuples.

    Each non-blank line is stripped and matched against a fixed pattern:
    ``<digits>,<text>,<digit>`` for training data, ``<digits>,<text>`` for test
    data. Lines that do not match (for example a header whose first field is
    not numeric) are silently skipped. Test rows get an empty-string label.

    NOTE(review): parsing is line-based and not CSV-quote aware, so quoted
    fields keep their quotes and a record spanning several physical lines
    would not match. Confirm the input never contains such rows.

    Args:
        input_file_path: Path of the file to read.
        output_file_path: Where to write the cleaned rows, joined by commas,
            one row per line and with no trailing newline. A falsy value
            skips writing.
        is_train_data: ``True`` to expect a trailing single-digit label column.
        encoding: Text encoding used for both reading and writing.

    Returns:
        The parsed rows in file order; every field is a string.
    """
    # Pick and compile the row pattern once instead of on every line.
    if is_train_data:
        row_pattern = re.compile(r'^(\d+),(.*),(\d)$')
    else:
        row_pattern = re.compile(r'^(\d+),(.*)$')

    processed_data = []
    with open(input_file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            match = row_pattern.match(line)
            if match is None:
                continue
            record = match.groups()
            if not is_train_data:
                # Test data has no label yet; pad so every row has three fields.
                record = record + ("",)
            processed_data.append(record)

    if output_file_path:
        with open(output_file_path, 'w', encoding=encoding) as out_file:
            out_file.write('\n'.join(','.join(item) for item in processed_data))
    return processed_data

In [5]:
def main(data_dir: Text = '/home/rofljen/repos/ling-539-class-competition-code-rofljen-main/class-competition/files/Data',
         results_dir: Text = '/home/rofljen/repos/ling-539-class-competition-code-rofljen-main/class-competition/files') -> None:
    """Run the pipeline: preprocess data, train the classifier, write predictions.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir``, writes the cleaned
    rows next to them as ``train.txt`` and ``test.txt``, fits ``TextToLabels``
    on the training rows, and saves the predicted test labels to
    ``results.csv`` in ``results_dir`` with columns ``ID`` and ``LABEL``.
    Feature extraction happens inside ``TextToLabels``; no separate feature
    matrices are built here.

    Args:
        data_dir: Directory holding the input CSVs and receiving the cleaned
            text files.
        results_dir: Directory that receives ``results.csv``; created if missing.

    Raises:
        ValueError: If no usable rows were parsed from the train or test file.
    """
    train_data_input = os.path.join(data_dir, 'train.csv')
    train_data_output = os.path.join(data_dir, 'train.txt')
    test_data_input = os.path.join(data_dir, 'test.csv')
    test_data_output = os.path.join(data_dir, 'test.txt')

    # Process train and test data
    processed_train_reviews = process_text(train_data_input, train_data_output, is_train_data=True)
    processed_test_reviews = process_text(test_data_input, test_data_output, is_train_data=False)

    # process_text skips malformed lines silently, so a wrongly formatted file
    # shows up as an empty list; fail here with a message that names the file.
    if not processed_train_reviews:
        raise ValueError(f"No training rows could be parsed from {train_data_input}")
    if not processed_test_reviews:
        raise ValueError(f"No test rows could be parsed from {test_data_input}")

    # Extract text and labels from processed reviews
    train_texts = [review[1] for review in processed_train_reviews]
    train_labels = [int(review[2]) for review in processed_train_reviews]
    test_texts = [review[1] for review in processed_test_reviews]

    # Initialize TextToLabels, fit with training data, and predict test labels
    t2l = TextToLabels()
    t2l.fit(train_texts, train_labels)
    test_labels = t2l.transform(test_texts)

    # Combine test IDs with predicted labels
    labeled_test_data = [
        (review[0], label)
        for review, label in zip(processed_test_reviews, test_labels)
    ]

    # Save results to a DataFrame and CSV file
    df = pd.DataFrame(labeled_test_data, columns=['ID', 'LABEL'])
    # exist_ok avoids the check-then-create race of exists() followed by makedirs().
    os.makedirs(results_dir, exist_ok=True)
    df.to_csv(os.path.join(results_dir, 'results.csv'), index=False)

    # Print lengths
    print("Length of processed_test_reviews:", len(processed_test_reviews))
    print("Length of test_labels:", len(test_labels))


# Run the pipeline: preprocess the train/test files, fit the classifier, and write results.csv.
main()


Length of processed_test_reviews: 17581
Length of test_labels: 17581
