# Tarea 3

### Sentiment Analysis

In [1]:
import numpy as np
import zipfile
import tarfile
import os
from tqdm import tqdm 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import re
import warnings

# Filter out FutureWarnings from scikit-learn
warnings.filterwarnings("ignore", category=DeprecationWarning)


c:\Users\USUARIO\anaconda3\envs\anteia\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\USUARIO\anaconda3\envs\anteia\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


## Extracción Dataset


In [2]:
compressed_file = "Datasets.zip"
with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
    folder_name = os.path.splitext(compressed_file)[0]  # Remove the ".zip" extension
    target_folder = os.path.join(folder_name)
    
    if not os.path.exists(target_folder):
        # Create the folder within the target directory
        os.mkdir(target_folder)

        # Extract all files to the target folder
        zip_ref.extractall(target_folder)

print("Extracción completada")

Extracción completada


In [3]:
compressed_file = "Datasets\\20news-18828.tar.gz"
folder_name = os.path.splitext(os.path.splitext(compressed_file)[0])[0]  # Remove the ".tar.gz" extension

if not os.path.exists(folder_name):
    os.mkdir(folder_name)

# Extract all files from the TAR.GZ archive to the target folder without creating an additional subfolder
with tarfile.open(compressed_file, 'r:gz') as tar_ref:
    members = tar_ref.getmembers()
    tar_ref.extractall(path=folder_name, members=members)

print("Extracción completada")

Extracción completada


In [4]:
compressed_file = "Datasets\\Multi Domain Sentiment\\processed_acl.tar.gz"
folder_name = os.path.splitext(os.path.splitext(compressed_file)[0])[0]  # Remove the ".tar.gz" extension

if not os.path.exists(folder_name):
    os.mkdir(folder_name)

# Extract all files from the TAR.GZ archive to the target folder without creating an additional subfolder
with tarfile.open(compressed_file, 'r:gz') as tar_ref:
    members = tar_ref.getmembers()
    tar_ref.extractall(path=folder_name, members=members)

print("Extracción completada")

Extracción completada


## Extracción de datos y preparación de dataset

In [5]:
# Define the path to the dataset and categories
data_path = "Datasets\\Multi Domain Sentiment\\processed_acl\\processed_acl"
categories = ["books", "dvd", "electronics", "kitchen"]

In [10]:
# Define a function to extract reviews and their labels with a specified limit
def extract_reviews_and_labels(category, sentiment, max_reviews=None):
    reviews = []
    labels = []
    folder_path = os.path.join(data_path, category)
    filename = f"{sentiment}.review"
    file_path = os.path.join(folder_path, filename)
    
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            # se tiene en cuenta la entrada, se pone en formato de texto limpio, repitiendo las palabras cuando en la estructura "str:num" num!=1
            line_clean = re.sub(r'(\w+):(\d+)', lambda x: (x.group(1) + ' ') * (int(x.group(2)) - 1) + x.group(1), line)
            line_clean = line_clean.strip()
            if line_clean:
                # Split the line into features and label parts
                parts = line_clean.split("#label#:")
                if len(parts) == 2:
                    review = parts[0].strip()
                    label = parts[1].strip()
                    reviews.append(review)
                    labels.append(label)
                    if max_reviews is not None and len(reviews) >= max_reviews:
                        break  # Stop extracting if the maximum number of reviews is reached
    return reviews, labels

In [11]:
# Feature representation strategies
vectorizers = {
    "tf": CountVectorizer(), 
    "tfidf": TfidfVectorizer(),
}

# Initialize results dataframe
results_df = pd.DataFrame(
    columns=["Category", "Algorithm", "Representation", "Precision", "Recall", "F1", "Accuracy"]
)

# Filter out all warnings
warnings.filterwarnings("ignore")

In [12]:
for category in categories:
    positive_reviews, positive_labels = extract_reviews_and_labels(category, "positive", max_reviews=None)
    negative_reviews, negative_labels = extract_reviews_and_labels(category, "negative", max_reviews=None)
    X_test, y_test = extract_reviews_and_labels(category, "unlabeled", max_reviews=None)
    X_train = positive_reviews + negative_reviews
    y_train = positive_labels + negative_labels

    # # Split the data into train and validation sets
    # X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


    # Perform sentiment analysis for each category, algorithm, and feature representation

    for algorithm in ["NB", "LR"]:
        for representation, vectorizer in vectorizers.items():
            print(f"Processing Category: {category}, Algorithm: {algorithm}, Representation: {representation}")
            # Initialize the vectorizer
            X_train_vectorized = vectorizer.fit_transform(X_train)
            X_test_vectorized = vectorizer.transform(X_test)
            
            # Train the classifier
            if algorithm == "NB":
                classifier = MultinomialNB()
            elif algorithm == "LR":
                classifier = LogisticRegression(max_iter=500, random_state=13)  #se reduce numero maximo de iteraciones pq se demora muchísimo
                
            classifier.fit(X_train_vectorized, y_train)
            
            # Predict sentiment
            y_pred = classifier.predict(X_test_vectorized)
            
            # Calculate evaluation metrics
            precision = precision_score(y_test, y_pred, average="weighted")
            recall = recall_score(y_test, y_pred, average="weighted")
            f1 = f1_score(y_test, y_pred, average="weighted")
            accuracy = accuracy_score(y_test, y_pred)
            
            # Store results in the dataframe
            results_df = results_df.append(
                {
                    "Category": category,
                    "Algorithm": algorithm,
                    "Representation": representation,
                    "Precision": precision,
                    "Recall": recall,
                    "F1": f1,
                    "Accuracy": accuracy,
                },
                ignore_index=True,
            )
            if algorithm=='NB':
                print('_________________________')
                continue
            # Get the top N features (words) for each class (positive and negative)
            feature_names = vectorizer.get_feature_names()
            top_n = 10  # Adjust the number of top features to display

            coef = classifier.coef_[0]  # For binary classification, there's only one set of coefficients
            top_features_indices = np.argsort(coef)[-top_n:]
            top_features = [feature_names[i] for i in top_features_indices]
            print(f"Category: {category}")
            print(f"Top {top_n} features: {top_features}")
            print()

            print('_________________________')

Processing Category: books, Algorithm: NB, Representation: tf
_________________________
Processing Category: books, Algorithm: NB, Representation: tfidf
_________________________
Processing Category: books, Algorithm: LR, Representation: tf
Category: books
Top 10 features: ['enjoyed', 'everything', 'a_must', 'wonderful', 'must', 'the_best', 'loved', 'easy', 'great', 'excellent']

_________________________
Processing Category: books, Algorithm: LR, Representation: tfidf
Category: books
Top 10 features: ['a_great', 'love', 'the_best', 'best', 'must', 'my', 'you', 'easy', 'excellent', 'great']

_________________________
Processing Category: dvd, Algorithm: NB, Representation: tf
_________________________
Processing Category: dvd, Algorithm: NB, Representation: tfidf
_________________________
Processing Category: dvd, Algorithm: LR, Representation: tf
Category: dvd
Top 10 features: ['season', 'well', 'still', 'wonderful', 'enjoy', 'loved', 'love', 'best', 'excellent', 'great']

___________

In [14]:
# Display the results
print("Results:")
print(results_df)

Results:
       Category Algorithm Representation  Precision    Recall        F1  \
0         books        NB             tf   0.834947  0.826652  0.825808   
1         books        NB          tfidf   0.839174  0.823964  0.822292   
2         books        LR             tf   0.824635  0.824636  0.824635   
3         books        LR          tfidf   0.833220  0.832027  0.831960   
4           dvd        NB             tf   0.820063  0.819855  0.819847   
5           dvd        NB          tfidf   0.848160  0.846068  0.845892   
6           dvd        LR             tf   0.832617  0.832404  0.832355   
7           dvd        LR          tfidf   0.845131  0.844953  0.844914   
8   electronics        NB             tf   0.854781  0.854779  0.854777   
9   electronics        NB          tfidf   0.866608  0.865517  0.865441   
10  electronics        LR             tf   0.858618  0.858476  0.858451   
11  electronics        LR          tfidf   0.859836  0.859708  0.859685   
12      kitchen 

In [18]:
# Sort the results_df by 'F1' column in ascending order
sorted_results_df = results_df.sort_values(by='F1')

# Get the rows with the lowest F1 scores
lowest_f1_categories = sorted_results_df[sorted_results_df['F1'] == sorted_results_df['F1']]

# Print the categories with the lowest F1 scores
print("Categories with the lowest F1 scores:")
print(lowest_f1_categories[["Algorithm", "Representation", 'Category', 'F1']])

Categories with the lowest F1 scores:
   Algorithm Representation     Category        F1
4         NB             tf          dvd  0.819847
1         NB          tfidf        books  0.822292
2         LR             tf        books  0.824635
0         NB             tf        books  0.825808
3         LR          tfidf        books  0.831960
6         LR             tf          dvd  0.832355
7         LR          tfidf          dvd  0.844914
5         NB          tfidf          dvd  0.845892
8         NB             tf  electronics  0.854777
10        LR             tf  electronics  0.858451
11        LR          tfidf  electronics  0.859685
9         NB          tfidf  electronics  0.865441
15        LR          tfidf      kitchen  0.868775
12        NB             tf      kitchen  0.877869
13        NB          tfidf      kitchen  0.880380
14        LR             tf      kitchen  0.881910


Segun la métrica F1, lasd clases más dificiles de clasificar son dvd y books, debido a que independientemente de la representación o el algoritmo de clasificación, el desempeño es menor comparado con las clases electronics y kitchen.

In [20]:
# Combine data from all categories
X_train = []
y_train = []
X_test = []
y_test = []
for category in categories:
    positive_reviews, positive_labels = extract_reviews_and_labels(category, "positive", max_reviews=None)
    negative_reviews, negative_labels = extract_reviews_and_labels(category, "negative", max_reviews=None)
    X_train += positive_reviews + negative_reviews
    y_train += positive_labels + negative_labels

    # Load the test data
    X_test_cat, y_test_cat = extract_reviews_and_labels(category, "unlabeled", max_reviews=None)
    X_test += X_test_cat
    y_test += y_test_cat

# Perform sentiment analysis on the merged dataset for NB and LR
for algorithm in ["NB", "LR"]:
    for representation, vectorizer in vectorizers.items():
        print(f"Algorithm: {algorithm}, Representation: {representation}")
        # Initialize the vectorizer
        X_train_vectorized = vectorizer.fit_transform(X_train)
        X_test_vectorized = vectorizer.transform(X_test)
        
        # Train the classifier
        if algorithm == "NB":
            classifier = MultinomialNB()
        elif algorithm == "LR":
            classifier = LogisticRegression(max_iter=500, random_state=13)
        
        classifier.fit(X_train_vectorized, y_train)
        
        # Predict sentiment
        y_pred = classifier.predict(X_test_vectorized)
        
        # Calculate evaluation metrics
        precision = precision_score(y_test, y_pred, average="binary")  # Assuming binary classification
        recall = recall_score(y_test, y_pred, average="binary")
        f1 = f1_score(y_test, y_pred, average="binary")
        
        # Store results in the dataframe
        results_df = results_df.append(
            {
                "Algorithm": algorithm,
                "Representation": representation,
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
            },
            ignore_index=True,
        )

# Display the results
print("Results:")
print(results_df)

Algorithm: NB, Representation: tf


ValueError: pos_label=1 is not a valid label. It should be one of ['negative', 'positive']