# Análisis de sentimientos con dataset personalizado

## Lectura del dataset

In [2]:
import pandas as pd

df = pd.read_csv(r'sample-tucson-separado.csv', encoding='latin-1')

In [3]:
# Crea una copia del dataset eliminando las respuestas clasificadas como 
# neutrales y reemplazando las etiquetas de los sentimientos a las esperadas
# por la librería textblob.
df2 = df.copy()
df2 = df2[['Answer', 'Sentiment']]
df2.drop(df2[df2['Sentiment'] == 'Neutral'].index, inplace=True)
df2['Sentiment'] = df2['Sentiment'].replace({'Positive': 'pos', 'Neutral': 'neu', 'Negative': 'neg'})
df2['Sentiment'].unique()

array(['pos', 'neg'], dtype=object)

## Creación de datos de entrenamiento y de prueba

In [4]:
from sklearn.model_selection import train_test_split


y = df2['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(df2, y, test_size=0.25, random_state=25)

## Análisis de datos

In [37]:
from textblob.classifiers import DecisionTreeClassifier


cl1 = DecisionTreeClassifier(X_train.values)
cl1.accuracy(X_test.values)

0.45454545454545453

In [6]:
from textblob.classifiers import MaxEntClassifier


cl2 = MaxEntClassifier(X_train.values)
cl2.accuracy(X_test.values)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.636
             2          -0.05046        1.000
             3          -0.00956        1.000
             4          -0.00251        1.000
             5          -0.00077        1.000
             6          -0.00026        1.000
             7          -0.00009        1.000
             8          -0.00004        1.000
             9          -0.00001        1.000
            10          -0.00001        1.000
            11          -0.00000        1.000
            12          -0.00000        1.000
            13          -0.00000        1.000
            14          -0.00000        1.000
            15          -0.00000        1.000
            16          -0.00000        1.000
            17          -0.00000        1.000
            18          -0.00000        1.000
            19          -0.00000        1.000
 

0.7272727272727273

In [24]:
from textblob.classifiers import NaiveBayesClassifier


cl3 = NaiveBayesClassifier(X_train.values)
cl3.accuracy(X_test.values)

0.6818181818181818

In [8]:
import random
from textblob.classifiers import PositiveNaiveBayesClassifier

answers = set(df2['Answer'].values)
pos_answers = set(df2[df2['Sentiment'] == 'pos']['Answer'].values)
train_size = int(len(pos_answers) * .25)
pos_answers = random.sample(pos_answers, train_size)
unlabeled_answers = set(df2['Answer'].values) - set(pos_answers)
unlabeled_answers = random.sample(unlabeled_answers, train_size)
cl4 = PositiveNaiveBayesClassifier(positive_set=pos_answers, unlabeled_set=unlabeled_answers)
test_answers = X_test.copy()
test_answers['Sentiment'] = test_answers['Sentiment'].replace({'pos': True, 'neu': False, 'neg': False})
cl4.accuracy(test_answers.values)

0.8181818181818182

In [36]:
import random
from textblob.classifiers import PositiveNaiveBayesClassifier


random.seed(5) # Con 3 da mejores resultados.

answers = set(df2['Answer'].values)
pos_answers = set(df2[df2['Sentiment'] == 'pos']['Answer'].values)
neg_answers = set(df2[df2['Sentiment'] == 'neg']['Answer'].values)
print(f'ANS: {len(answers)}, POS: {len(pos_answers)}, NEG: {len(neg_answers)}')

train_size = int(len(answers) * .15)
print(f'TRAIN SIZE: {train_size}')
train_pos_answers = random.sample(pos_answers, train_size)
train_unlabeled_answers = set(df2['Answer'].values) - set(train_pos_answers)
train_unlabeled_answers = random.sample(unlabeled_answers, train_size)

cl4 = PositiveNaiveBayesClassifier(
    positive_set=train_pos_answers,
    unlabeled_set=train_unlabeled_answers
)
test_answers = X_test.copy()
test_answers['Sentiment'] = test_answers['Sentiment'].replace(
    {'pos': True, 'neu': False, 'neg': False})
acc = cl4.accuracy(test_answers.values)
print(f'ACCURACY: {acc}')

ANS: 88, POS: 58, NEG: 30
TRAIN SIZE: 13
ACCURACY: 0.7272727272727273
