In [1]:
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 280

import matplotlib.pyplot as plt
from collections import defaultdict


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
sw = stopwords.words('english')

In [2]:
# Folder holding the raw CSV inputs, given relative to the notebook's working directory.
dataFolder_path = '../../data/'

In [11]:
# def get_wordnet_pos(treebank_tag):
#     '''
#     Translate nltk POS to wordnet tags
#     '''
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN


def doc_preparer(doc, stem=False, stop_words=sw):
    '''
    Normalize one raw document (here: a tweet) into a cleaned, space-joined string.

    Steps, in order:
        1. keep only alphabetic tokens (a trailing ’-suffix such as ’s stays
           attached to its word); digits and punctuation are discarded,
        2. lowercase every token,
        3. drop tokens found in ``stop_words``,
        4. optionally reduce every remaining token to its Porter stem.

    :param doc: raw text of a single document
    :param stem: if True, Porter-stem the tokens that survive stop-word removal
    :param stop_words: collection of words to remove. Tokens are lowercased
            before the check, so the entries should be lowercase. Defaults to
            the NLTK English stop-word list loaded at module level; pass None
            or an empty collection to keep every token.
    :return: the processed tokens joined by single spaces
    '''
    # Stemming seems to work better than lemmatizing here: the lemmatizer
    # can't identify plurals of product names.

    # NOTE: the pattern only recognises the typographic apostrophe (’); a
    # straight ' splits the word, e.g. "isn't" -> "isn", "t".
    tokenizer = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")

    # Honour the caller's stop-word collection (it used to be ignored in favour
    # of the global list) and use a set so each membership test is O(1).
    if isinstance(stop_words, (set, frozenset)):
        excluded = stop_words
    else:
        excluded = set(stop_words or ())

    tokens = [token.lower() for token in tokenizer.tokenize(doc)]
    tokens = [token for token in tokens if token not in excluded]

    if stem:
        stemmer = nltk.stem.PorterStemmer()
        # Stem each token once, then discard any stem that came back empty.
        stemmed = (stemmer.stem(token) for token in tokens)
        tokens = [token for token in stemmed if token]
    return ' '.join(tokens)

def cv_printScores(cv_metric):
    '''
    Print a short train/test report of mean cross-validation scores.

    :param cv_metric: mapping that provides the entries 'train_accuracy',
            'test_accuracy', 'train_f1_macro' and 'test_f1_macro'; each value
            must expose ``.mean()`` (e.g. the per-fold score arrays).
    :return: None; the report is written to stdout with scores shown to
            three decimals.
    '''
    rule_width = 32
    # (section heading, ((row label, key in cv_metric), ...)) in print order.
    report_layout = (
        ('Accuracy', (
            ('Training accuracy: ', 'train_accuracy'),
            ('Test accuracy:     ', 'test_accuracy'),
        )),
        ('F-1 Score', (
            ('Training F1 score: ', 'train_f1_macro'),
            ('Test F1 score:     ', 'test_f1_macro'),
        )),
    )

    print('CV Results')
    print('=' * rule_width)
    for heading, rows in report_layout:
        print(heading)
        print('-' * rule_width)
        for label, key in rows:
            print(f"{label}{cv_metric[key].mean():.3f}")

In [12]:
# Load the labelled tweet dataset from the data folder.
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [13]:
# Discard rows that have no tweet text; inplace=True modifies data_df itself.
data_df.dropna(subset=['tweet_text'],inplace=True)

In [14]:
# Drop the rows labelled "I can't tell". The explicit .copy() makes the result
# an independent frame, so the later column assignment
# (data_df['sentiment_target'] = ...) does not raise SettingWithCopyWarning.
data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell"].copy()

In [15]:
# Encode the sentiment labels as integers in a new column, sentiment_target.
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
# Show the fitted labels; the position of a label in classes_ is its integer code.
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

## Adding Translations to Address Class Imbalance

In [None]:
#pip install googletrans==4.0.0-rc1

In [7]:
import googletrans
import time

from googletrans import Translator
# One shared googletrans client, used by German_translation and English_translation.
translator = Translator()

def German_translation(x, src='auto', pause=1):
    """Translate ``x`` into German using the shared googletrans ``translator``.

    :param x: text to translate. Non-string or blank values (for example NaN
        coming from a DataFrame column) are returned unchanged instead of
        being sent to the translation service.
    :param src: language code of ``x``; 'auto' (googletrans' own default)
        lets the service detect it, pass e.g. 'en' to pin it.
    :param pause: seconds to sleep after the request, the same throttle that
        ``English_translation`` applies; pass 0 to disable.
    :return: the translated text, or ``x`` itself when nothing was translated
    """
    if not isinstance(x, str) or not x.strip():
        return x
    german_translation = translator.translate(x, src=src, dest='de')
    if pause:
        # Space out consecutive requests when applied over a whole column.
        time.sleep(pause)
    return german_translation.text

def English_translation(x, src='auto', pause=1):
    """Translate ``x`` into English using the shared googletrans ``translator``.

    :param x: text to translate. Non-string or blank values are returned
        unchanged instead of being sent to the translation service; this
        matters for text read back from a CSV, where empty cells load as NaN.
    :param src: language code of ``x``; 'auto' (googletrans' own default)
        lets the service detect it, pass e.g. 'de' to pin it.
    :param pause: seconds to sleep after the request so consecutive calls are
        spaced out; pass 0 to disable.
    :return: the translated text, or ``x`` itself when nothing was translated
    """
    if not isinstance(x, str) or not x.strip():
        return x
    english_translation = translator.translate(x, src=src, dest='en')
    if pause:
        time.sleep(pause)
    return english_translation.text


In [16]:
data_df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,sentiment_target
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,0
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion,2
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion,2
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,0
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion,2


In [17]:
# Keep only the tweets whose encoded sentiment is 0 (the negative class).
is_negative = data_df['sentiment_target'] == 0
neg_df = data_df[is_negative]

In [18]:
neg_df.shape

(545, 4)

In [19]:
neg_df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,sentiment_target
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion,0
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion,0
17,I just noticed DST is coming this weekend. How many iPhone users will be an hour late at SXSW come Sunday morning? #SXSW #iPhone,iPhone,Negative emotion,0
38,@mention - False Alarm: Google Circles Not Coming Now���and Probably Not Ever? - {link} #Google #Circles #Social #SXSW,Google,Negative emotion,0
62,Again? RT @mention Line at the Apple store is insane.. #sxsw,,Negative emotion,0


In [None]:
# ger_df = neg_df.copy()

In [None]:
# ger_df.tweet_text = ger_df.tweet_text.apply(lambda x: German_translation(x))

In [None]:
# ger_df.head()

In [4]:
# ger_df.to_csv(r'neg_ger.csv')
# Load the previously saved German translations from disk.
# NOTE(review): no index_col is given, so the index written by to_csv comes
# back as an ordinary "Unnamed: 0" column next to a fresh default index.
ger_df = pd.read_csv('neg_ger.csv')

In [5]:
# Preview only: set_index returns a new frame and the result is not assigned,
# so ger_df keeps "Unnamed: 0" as a regular column.
ger_df.set_index("Unnamed: 0")

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,sentiment_target
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,".@Wesley83 Ich habe ein 3G -iPhone.Nach 3 Stunden twitterte es bei #Rise_austin, es war tot!Ich muss upgraden.Plugin -Stationen bei #SXSW.",iPhone,Negative emotion,0
3,"@SXSW Ich hoffe, das diesjährige Festival ist nicht so unklar wie die diesjährige iPhone -App.#SXSW",iPad or iPhone App,Negative emotion,0
17,"Ich habe gerade bemerkt, dass DST dieses Wochenende kommt.Wie viele iPhone -Nutzer werden am Sonntagmorgen eine Stunde später bei SXSW sein?#SXSW #iphone",iPhone,Negative emotion,0
38,@mention - Falschalarm: Google -Kreise kommen jetzt nicht. Und wahrscheinlich nie?- {link} #google #circles #social #sxsw,Google,Negative emotion,0
62,Wieder?RT @mention Line im Apple Store ist verrückt. #Sxsw,,Negative emotion,0
...,...,...,...,...
8603,"Google Guy von #sxsw Talk erklärt, wie er realistische Twitter -Bots als Experiment gemacht hat.Gee, danke, dass du das gemacht hast.",,Negative emotion,0
8611,"Ich denke, mein Effing Ehemann steht für ein #IPAD 2 in der Schlange. Kann von jemandem in die Line-up für Frau Nummer 2 hinweisen?#SXSWI #SXSW",iPad,Negative emotion,0
8638,"Ich bin mir ziemlich sicher, dass der Diskussionsteilnehmer, der denkt ""Apple, in ihrem Erfolg ertrunken"".ist verdammt verrückt.#SXSW",Apple,Negative emotion,0
8672,"Hey, macht jemand #sxsw, der sich für die Gruppen -SMS -App GroupMe anmeldet?Ich habe es auf meinem iPhone bekommen, aber sonst ist niemand dabei, also ... irgendwie nutzlos.",,Negative emotion,0


In [8]:
# Translate the German tweets back to English. assign() returns a new frame
# with tweet_text replaced, leaving ger_df itself untouched.
neg_tran_df = ger_df.assign(
    tweet_text=ger_df['tweet_text'].apply(English_translation)
)

In [9]:
neg_tran_df.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,sentiment_target
0,0,".@Wesley83 I have a 3G -IPhone. After 3 hours tweeting it at #rise_austin, it was dead! I have to upgrade.plugin stations at #SXSW.",iPhone,Negative emotion,0
1,3,@SXSW I hope this year's festival is not as unclear as this year's iPhone app.#Sxsw,iPad or iPhone App,Negative emotion,0
2,17,I just noticed that DST is coming this weekend. How many iPhone users will be at SXSW an hour later on Sunday morning?,iPhone,Negative emotion,0
3,38,@MENTION - false alarm: Google circles are not coming now.And probably never?- {Link} #google #circles #Social #sxsw,Google,Negative emotion,0
4,62,Again?#SXSW,,Negative emotion,0


In [10]:
# Preview only: the re-indexed frame is displayed but not assigned, so
# neg_tran_df still carries "Unnamed: 0" as a regular column.
neg_tran_df.set_index("Unnamed: 0")

Unnamed: 0_level_0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,sentiment_target
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,".@Wesley83 I have a 3G -IPhone. After 3 hours tweeting it at #rise_austin, it was dead! I have to upgrade.plugin stations at #SXSW.",iPhone,Negative emotion,0
3,@SXSW I hope this year's festival is not as unclear as this year's iPhone app.#Sxsw,iPad or iPhone App,Negative emotion,0
17,I just noticed that DST is coming this weekend. How many iPhone users will be at SXSW an hour later on Sunday morning?,iPhone,Negative emotion,0
38,@MENTION - false alarm: Google circles are not coming now.And probably never?- {Link} #google #circles #Social #sxsw,Google,Negative emotion,0
62,Again?#SXSW,,Negative emotion,0
...,...,...,...,...
8603,Google Guy from #SXSW Talk explains how he did realistic Twitter bots as an experiment.,,Negative emotion,0
8611,I think my Effing husband stands in line for a #ipad 2.Can someone point out the line-up for woman number 2? #Sxswi #sxsw,iPad,Negative emotion,0
8638,"I am pretty sure that the discussion participant who thinks ""Apple drowned in her success"".",Apple,Negative emotion,0
8672,"Hey, does someone do #SXSW who registers for the GroupMe groups? I got it on my iPhone, but nobody else is there, so ... somehow useless.",,Negative emotion,0


In [20]:
# Persist the back-translated negative tweets.
# NOTE(review): the frame's default index is written as well, so the file has
# an unnamed index column in addition to the existing "Unnamed: 0" column -
# confirm what downstream readers expect before changing this.
neg_tran_df.to_csv(r'neg_translated.csv')