### Initial Set-up of DF and functions, adapted from Saad's

In [4]:
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 280

import matplotlib.pyplot as plt
from collections import defaultdict


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

import googletrans
import time
from googletrans import Translator



In [5]:
# Stop words
sw = stopwords.words('english')
sw.extend(['sxsw','rt','quot','austin','sxswi','mention','link'])

In [6]:
dataFolder_path = '../../data/'
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [7]:
# def get_wordnet_pos(treebank_tag):
#     '''
#     Translate nltk POS to wordnet tags
#     '''
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN


def doc_preparer(doc, stem = False, stop_words=sw):
    '''

    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    #Stemming seems to work better. Lemming can't identify plurals of products
    
    
#     lemmed_keywords = ['apple',
#                 'ipad', 'ipads',
#                 'iphone', 'iphones',
#                 'itunes',
#                 'google', 'googled',
#                 'android', 'droid', 'androids', 'droids',
#                 'circle', 'circles'
#                 'app', 'apps']

#     stemmed_keywords = ['appl',
#                         'ipad',
#                         'iphon',
#                         'itun',
#                         'googl',
#                         'android',
#                         'droid',
#                         'circl',
#                         'app']

    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     doc = pos_tag(doc)
#     doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
#     lemmatizer = WordNetLemmatizer()
#     doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     doc = [word for word in doc if word in lemmed_keywords]
    
    
    
    p_stemmer = nltk.stem.PorterStemmer()
    if stem:
        doc = [p_stemmer.stem(word) for word in doc if p_stemmer.stem(word)]
    return ' '.join(doc)

def cv_printScores(cv_metric):
    print('CV Results')
    print('='*32)
    print('Accuracy')
    print('-'*32)
    print(f"Training accuracy: {cv_metric['train_accuracy'].mean():.3f}")
    print(f"Test accuracy:     {cv_metric['test_accuracy'].mean():.3f}")
    print('F-1 Score')
    print('-'*32)
    print(f"Training F1 score: {cv_metric['train_f1_macro'].mean():.3f}")
    print(f"Test F1 score:     {cv_metric['test_f1_macro'].mean():.3f}")
    
# Functions from https://github.com/NandhiniN85/Class-Imbalancing/blob/main/NLP%20-%20Class%20Imbalanced.ipynb
    
def German_translation(x):
    # print(x)    
    german_translation = translator.translate(x, dest='de')    
    return german_translation.text

def English_translation(x):
    # print(x)    
    english_translation = translator.translate(x, dest='en')
    #time.sleep(1)
    return english_translation.text


In [8]:
# Drop Nulls & I can't tell
data_df.dropna(subset=['tweet_text'],inplace=True)
data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell" ]

In [9]:
# Encode targets
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

In [10]:
# Train test split before stemming for purposes of translation
X = data_df['tweet_text']
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42)

In [8]:
# Create a dataframe of negatives from training data
X_train_df = X_train.to_frame()
neg_df = X_train_df.copy()
neg_df['sentiment_target'] = y_train
neg_df = neg_df[neg_df.sentiment_target == 0]

neg_X = neg_df.drop('sentiment_target', axis=1)
neg_y = neg_df.sentiment_target

y_train_over = pd.concat([y_train, neg_y])

In [9]:
X_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5998 entries, 3791 to 7397
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  5998 non-null   object
dtypes: object(1)
memory usage: 93.7+ KB


In [10]:
# Translate to negative tweet text in train to German
translator = Translator()

neg_X.tweet_text = neg_X.tweet_text.apply(lambda x: German_translation(x))

In [11]:
# Save output
# neg_X.tweet_text.to_csv(r'neg_de_train.csv')

In [13]:
# Translate back into English
neg_X.tweet_text = neg_X.tweet_text.apply(lambda x: English_translation(x))

In [17]:
# Save output for easier reuse
# neg_X.tweet_text.to_csv(r'negtrain2.csv')

In [35]:
# Add oversample X to X_train
X_train_over = pd.concat([X_train_df, neg_X])
X_train_over

Unnamed: 0,tweet_text
3791,"Tech Check podcast -- #SxSW #Android passes #BlackBerry, a big Twitter #fail! -- {link} by @mention #sxsw #cnn"
4683,"In honor of Apple's #SXSW pop-up shop, here are some thoughts on how landlords &amp; leasing agents can utilize pop-up shops. {link}"
5800,"RT @mention Hoot! New Blog post: HootSuite Mobile for #SXSW ~ Updates for iPhone, BlackBerry &amp; Android {link}"
4879,RT @mention @mention 3 iPhone Apps We'll Be Using at South By Southwest Interactive {link} #SXSW #SXSWi
2804,"#sxsw: @mention intrvw @mention &quot;Schmidt [Google CEO] told me: u'r good at telling stories; go talk to lots of ppl, tell us what u hear&quot;"
...,...
3218,"Lunch with @mmention at #cnngrill. View from the HTML5 developer trenches: Android is painful, iOS is slim (for what @mmention does) #sxsw"
2501,"New iPhone car correction has already tried to ""change colleagues"". & Quot; vissigots. & Quot;"
3163,@MENTION Google Circles will be lame.#sxsw & lt;3
65,Visitor @MENTION IPAD Design Headache #SXSW {Link}}


In [36]:
X_train_over.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6358 entries, 3791 to 7397
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  6358 non-null   object
dtypes: object(1)
memory usage: 99.3+ KB


In [37]:
# Create stem'd versions
#X_train_over.reset_index(inplace=True)
X_train_over['stemmed_tokens'] = X_train_over['tweet_text'].map(lambda x:doc_preparer(x,stem=True))

In [None]:
'''# RFC with data_df
X = data_df['stemmed_tokens']
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)'''

In [18]:
# Default DF
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)

rfc = RandomForestClassifier()

tvec_rfc_cvResults = cross_validate(rfc,
                                    X_train_vec,
                                    y_train,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_rfc_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:   13.2s remaining:   19.9s


CV Results
Accuracy
--------------------------------
Training accuracy: 0.997
Test accuracy:     0.670
F-1 Score
--------------------------------
Training F1 score: 0.997
Test F1 score:     0.486


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   15.2s finished


In [None]:
# Test with oversampled neg DF. Same params and setup (will be variation from train/test split)
'''X_neg = over_neg_df['stemmed_tokens']
y_neg = over_neg_df['sentiment_target']

X_neg_train, X_neg_test, y_neg_train, y_neg_test = train_test_split(X_neg, y_neg,test_size = 0.3)

tvec_neg = TfidfVectorizer()

X_neg_train_vec = tvec_neg.fit_transform(X_neg_train)
X_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)
X_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)
X_neg_train_vec.set_index(y_neg_train.index, inplace=True)

rfc_neg = RandomForestClassifier()

tvec_neg_rfc_cvResults = cross_validate(rfc_neg,
                                    X_neg_train_vec,
                                    y_neg_train,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_neg_rfc_cvResults)'''

In [38]:
X_train_over

Unnamed: 0,tweet_text,stemmed_tokens
3791,"Tech Check podcast -- #SxSW #Android passes #BlackBerry, a big Twitter #fail! -- {link} by @mention #sxsw #cnn",tech check podcast android pass blackberri big twitter fail cnn
4683,"In honor of Apple's #SXSW pop-up shop, here are some thoughts on how landlords &amp; leasing agents can utilize pop-up shops. {link}",honor appl pop shop thought landlord amp leas agent util pop shop
5800,"RT @mention Hoot! New Blog post: HootSuite Mobile for #SXSW ~ Updates for iPhone, BlackBerry &amp; Android {link}",hoot new blog post hootsuit mobil updat iphon blackberri amp android
4879,RT @mention @mention 3 iPhone Apps We'll Be Using at South By Southwest Interactive {link} #SXSW #SXSWi,iphon app use south southwest interact
2804,"#sxsw: @mention intrvw @mention &quot;Schmidt [Google CEO] told me: u'r good at telling stories; go talk to lots of ppl, tell us what u hear&quot;",intrvw schmidt googl ceo told u r good tell stori go talk lot ppl tell us u hear
...,...,...
3218,"Lunch with @mmention at #cnngrill. View from the HTML5 developer trenches: Android is painful, iOS is slim (for what @mmention does) #sxsw",lunch mmention cnngrill view html develop trench android pain io slim mmention
2501,"New iPhone car correction has already tried to ""change colleagues"". & Quot; vissigots. & Quot;",new iphon car correct alreadi tri chang colleagu vissigot
3163,@MENTION Google Circles will be lame.#sxsw & lt;3,googl circl lame lt
65,Visitor @MENTION IPAD Design Headache #SXSW {Link}},visitor ipad design headach


In [39]:
X_train_over.drop('tweet_text', axis=1, inplace=True)

In [40]:
X_train_over.reset_index(inplace=True)

In [45]:
X_train_over.drop('index', axis=1)

Unnamed: 0,stemmed_tokens
0,tech check podcast android pass blackberri big twitter fail cnn
1,honor appl pop shop thought landlord amp leas agent util pop shop
2,hoot new blog post hootsuit mobil updat iphon blackberri amp android
3,iphon app use south southwest interact
4,intrvw schmidt googl ceo told u r good tell stori go talk lot ppl tell us u hear
...,...
6353,lunch mmention cnngrill view html develop trench android pain io slim mmention
6354,new iphon car correct alreadi tri chang colleagu vissigot
6355,googl circl lame lt
6356,visitor ipad design headach


In [48]:
y_train_over.reset_index(drop=True, inplace=True)

In [50]:
tvec_neg = TfidfVectorizer()

X_neg_train_vec = tvec_neg.fit_transform(X_train_over.stemmed_tokens)
X_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)
X_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)
X_neg_train_vec.set_index(y_train_over.index, inplace=True)

rfc_neg = RandomForestClassifier()

tvec_neg_rfc_cvResults = cross_validate(rfc_neg,
                                    X_neg_train_vec,
                                    y_train_over,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_neg_rfc_cvResults)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:   13.9s remaining:   20.8s


CV Results
Accuracy
--------------------------------
Training accuracy: 0.970
Test accuracy:     0.690
F-1 Score
--------------------------------
Training F1 score: 0.972
Test F1 score:     0.645


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   15.7s finished


## Take Three: Pipeline to avoid data leakage in crossval step

In [1]:
# !pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
Collecting filelock
  Downloading filelock-3.6.0-py3-none-any.whl (10.0 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.2.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895257 sha256=596b460ba05033a7e05c4f27aa74dabea42e3d05dc4c593a55b32f0c3570dca3
  Stored in directory: c:\users\nopto\appdata\local\pip\cache\wheels\82\ab\9b\c15899bf659ba74f623ac776e861cf2eb8608c1825ddec

ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

huggingface-hub 0.5.1 requires packaging>=20.9, but you'll have packaging 20.4 which is incompatible.


In [17]:
def back_translation(X):
    translator = Translator()
    temp = X
    
    temp.tweet_text = temp.tweet_text.apply(lambda x: German_translation(x))
    temp.tweet_text = temp.tweet_text.apply(lambda x: English_translation(x))
    
    return temp

def back_trans_neg(Xtrain, ytrain):
    
    # Create DF of only negatives tweets in train
    X_train_df = Xtrain.to_frame()
    # print(X_train_df.head())
    neg_df = X_train_df.copy()
    neg_df['sentiment_target'] = ytrain
    neg_df = neg_df[neg_df.sentiment_target == 0]
    
    # Translate
    neg_X = back_translation(neg_X)    
    
    neg_y = neg_df.sentiment_target
    y_train_back = pd.concat([y_train, neg_y])
    y_train_back.reset_index(drop=True, inplace=True)
    
    neg_X = neg_df.drop('sentiment_target', axis=1)
    X_train_back = pd.concat([X_train_df, neg_X])
    X_train_back.reset_index(drop=True, inplace=True)
    
    return X_train_back, y_train_back

def stemmatic(Xtrain):
    X_train_stem = Xtrain
    X_train_stem['stemmed_tokens'] = X_train_stem['tweet_text'].map(lambda x:doc_preparer(x,stem=True))
    X_train_stem.drop('tweet_text', axis=1, inplace=True)
    
    return X_train_stem    

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer

trans_func = FunctionTransformer(back_trans_neg)
stem_func = FunctionTransformer(stemmatic)

T_pipe = Pipeline(steps = [
                        ('tf', trans_func),
                        ('stem', stem_func),
                        ('rfc', RandomForestClassifier(random_state=42))
                        ])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=42)
    
param_grid = {'rfc__max_depth':[100, 1000]}
grid_search = GridSearchCV(estimator=T_pipe,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=stratified_kfold,
                           n_jobs=-2)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_

                                                                                                                                              tweet_text
3791                                      Tech Check podcast -- #SxSW #Android passes #BlackBerry, a big Twitter #fail! -- {link} by @mention #sxsw #cnn
4683                In honor of Apple's #SXSW pop-up shop, here are some thoughts on how landlords &amp; leasing agents can utilize pop-up shops. {link}
5800                                   RT @mention Hoot! New Blog post: HootSuite Mobile for #SXSW ~ Updates for iPhone, BlackBerry &amp; Android {link}
4879                                             RT @mention @mention 3 iPhone Apps We'll Be Using at South By Southwest Interactive {link} #SXSW #SXSWi
2804  #sxsw: @mention intrvw @mention &quot;Schmidt [Google CEO] told me: u'r good at telling stories; go talk to lots of ppl, tell us what u hear&quot;


NameError: name 'ytrain' is not defined