In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from scipy.sparse import csr_matrix
from textblob import TextBlob
from sklearn.feature_selection import chi2

In [2]:
# Read the file
df = pd.read_csv("BestBuy_Data.csv",na_filter=False)

In [3]:
# Drop the NA values
df1 = df.dropna() 

In [4]:
positive_rationale = []
negative_rationale = []

In [5]:
# Get all positive rationales
df1_pos = df1[df1['Sentiment'] == "Positive"]
positive_rationale = [pos_rationale.lower() for pos_rationale in df1_pos['Rationale']]
print(positive_rationale)

['available', '', 'sneaking', '', 'easy', 'thank', '', 'early', '', 'happy', 'love', 'fun', 'better', 'better', 'interesting', 'know', '', '', '', 'good', 'cheap', 'deals', 'released', 'great', 'good', 'real', 'great', 'right', 'perfectly', "can't hurt", 'sure', 'run', 'gift', 'early', 'epic', 'really nice', '', 'new', 'right', 'same price', 'know', 'deals', 'top', 'cute', 'glad', 'connected', 'great', 'good deal', 'proud', 'excitement', 'thanks', 'free', 'need', 'exciting', 'easy', 'new', 'new', 'manage', 'deals', 'perfect', 'discount', 'unlimited', 'great find', 'well', 'thank', 'save', 'seeing', 'fun', 'treat', 'thanks', 'advantage', 'lit', 'need', 'hottest', 'thanks', 'tempting', 'extra', 'quick', 'right', 'want', 'saving', 'great', 'nice', 'respect', 'funny', 'good deal', 'sales', 'big', 'learn', '', 'pretty', 'perfect', 'welcome', 'profit', 'great', 'thanks', 'deal', 'clean', 'please', 'hottest', 'wise', 'new', 'great', 'deals', 'cheaper', 'glory', 'better', 'awesome', 'thanks', 

In [6]:
# Get all Negative Rationales
df1_neg = df1[df1['Sentiment'] == "Negative"]
negative_rationale = [neg_rationale.lower() for neg_rationale in df1_neg['Rationale']]
print(negative_rationale)

['frustrating', 'capitalism', 'crying', 'difficult', 'never buy', 'expensive', 'sorry', 'late', 'worse', 'delay', 'terrible', 'wait', 'pissed', 'barely understand', 'stop', 'omg', 'pathetic', 'outdated', 'waiting', 'anywhere', 'not received', 'breaks', 'bad', 'more', 'disappointed', 'charges', 'but', 'expect', 'no', 'sucks', 'learn', 'screwed', 'shit', 'down', 'dammit', 'cheap', 'bad', 'resolved', 'waited', 'issues', 'lose', 'away', '', 'scammed', 'shame', 'never buy', 'confused', 'no one', 'late', 'failed', 'fuck', 'suck', 'fuck', 'wait', 'restock', 'ignore us', 'elsewhere', 'never left', 'worst', 'slammed', 'lost', 'elsewhere', 'frustrating', 'worst', 'thief', 'damage', 'knows anything', 'no shipping', 'cancelled', 'sell', 'worst', 'dented', 'denied', 'no one', 'wait', 'bad', 'worst', 'never', 'horrible', 'sue', 'need', 'late', 'ignore', 'dealbreaker', 'disliked', 'pun', 'mad', 'hatred', 'not', 'not profound', 'mad', 'trash', 'sucks', 'ago', 'trash', 'waiting', 'exchanging', 'cost', 

<h2>Use chi square to find rationales</h2>

In [None]:
active_learning = 0
# user to label each tweet
# active_learning = 1

In [7]:
df_chi =  df1[df1['Sentiment'].isin(["Positive","Negative"])]
cv = CountVectorizer()
X = cv.fit_transform(df_chi['Text'])

chi2score = chi2(X, df_chi['Sentiment'])[0]
# print((chi2score))
features = cv.get_feature_names()
# print(features)

In [8]:
# In case more than one rationale choose using chi square

In [9]:
def choose_rationale_ChiSquare(sentence,pos_rat,neg_rat,active_learning):
    p_rat_list = list(pos_rat)
    n_rat_list = list(neg_rat)
    label =""
    rationale =""
    p_chi = []
    p_chi_val=[]
    n_chi = []
    n_chi_val =[]
    for r in p_rat_list:
        #print(r)
        arg = features.index(r)
        p_chi.append(r)
        p_chi_val.append(chi2score[arg])
        #print(p_chi)
        #print(p_chi_val)
    for r in n_rat_list:
        arg = features.index(r)
        n_chi.append(r)
        n_chi_val.append(chi2score[arg])
        #print(n_chi)
        #print(n_chi_val)
    if sum(p_chi_val)>sum(n_chi_val):
        lab = "Positive"
        rat = p_chi[np.argmax(p_chi_val)]
    elif sum(p_chi_val)<sum(n_chi_val):
        lab = "Negative"
        rat = n_chi[np.argmax(n_chi_val)]
    else:
        if active_learning == 0:
            lab ="NA"
            rat ="NA"

# Active learning for label and rationale
        if active_learning == 1:
            if (len(pos_rat) == 0 and len(neg_rat) == 0) or (sum(p_chi_val)==sum(n_chi_val)):
                print("If not able to give label or rationale - type NA")
                lab = input("Label to the tweet '"+sentence+"'")
                rat = input("Rationale to the tweet '"+sentence+"'")
                if lab == "Positive":
                    if rat not in positive_rationale:
                        positive_rationale.append(rat)
                    else:
                        print("Already exist")
                elif lab=="Negative":
                    if rat not in negative_rationale:
                        negative_rationale.append(rat)
                    else:
                        print("Already exist")
            
    return lab,rat

In [None]:
# Get the rationale for the tweet

In [10]:
def find_ask_Label_Rationale(sentence,active_learning):
    words = set(sentence.split())
    pos_rat = {}
    neg_rat = {}
    if words & set(positive_rationale):
        pos_rat = words & set(positive_rationale)
    if words & set(negative_rationale):
        neg_rat = words & set(negative_rationale)
    
    
    label,rationale = choose_rationale_ChiSquare(sentence,pos_rat,neg_rat,active_learning)
    return label,rationale

In [None]:
#Uncomment following so that active learning can be implemented
# goahead = "Y"
#active_learning = 1

In [11]:
count=0
df.head()
label =""
rationale = ""

for index, row in df.iterrows():
    
    sentence=row['Text'].lower()
    sentiment = row['Sentiment']
    row_rationale = row['Rationale']
    
    if row_rationale == "" and sentiment == "" and len(sentence)>2:
        #print(sentence)
        label,rationale = find_ask_Label_Rationale(sentence,active_learning)
        df['Sentiment'][index] = label
        df['Rationale'][index] = rationale
        #print("Label ="+label)
        #print("Rationale="+rationale)
    
        if (label == "NA") and (rationale == "NA"):
            #print(df.index[index])
            df.drop(df.index[index])
            if goahead =="Y":
                goahead = input('Do you want to label more (Y/N)')
                if goahead =='Y' and active_learning == 1:
                    continue
                else:
                    #Using Chi Square to get rationale
                    active_learning=0
    count+=1
    

In [21]:
# An example to show active learning
# active_learning = 1
label,rationale = find_ask_Label_Rationale("That does sound frustrating. @BestBuyCanHelp can you assist? ^Jessica",active_learning=1)
print(label)
print(rationale)


If not able to give label or rationale - type NA
Label to the tweet 'That does sound frustrating. @BestBuyCanHelp can you assist? ^Jessica'Negative
Rationale to the tweet 'That does sound frustrating. @BestBuyCanHelp can you assist? ^Jessica'frustrating
Already exist
Negative
frustrating


In [18]:
# Get all positive and negative sentiment tweets and create a new csv

In [13]:
df_new =  df[df['Sentiment'].isin(["Positive","Negative"])]

In [15]:
df_new.to_csv("labelleddata.csv",index=False)