### Initial Set-up of DF and functions, adapted from Saad's

In [1]:
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 280

import matplotlib.pyplot as plt
from collections import defaultdict


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

import googletrans
import time
from googletrans import Translator

In [2]:
# Stop words
sw = stopwords.words('english')
sw.extend(['sxsw','rt','quot','austin','sxswi','mention','link'])

In [3]:
dataFolder_path = '../../data/'
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [4]:
def doc_preparer(doc, stem = False, stop_words=sw):
    '''

    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     doc = pos_tag(doc)
#     doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
#     lemmatizer = WordNetLemmatizer()
#     doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     doc = [word for word in doc if word in lemmed_keywords]
    
    
    
    p_stemmer = nltk.stem.PorterStemmer()
    if stem:
        doc = [p_stemmer.stem(word) for word in doc if p_stemmer.stem(word)]
    return ' '.join(doc)

def cv_printScores(cv_metric):
    print('CV Results')
    print('='*32)
    print('Accuracy')
    print('-'*32)
    print(f"Training accuracy: {cv_metric['train_accuracy'].mean():.3f}")
    print(f"Test accuracy:     {cv_metric['test_accuracy'].mean():.3f}")
    print('F-1 Score')
    print('-'*32)
    print(f"Training F1 score: {cv_metric['train_f1_macro'].mean():.3f}")
    print(f"Test F1 score:     {cv_metric['test_f1_macro'].mean():.3f}")
    
# Functions from https://github.com/NandhiniN85/Class-Imbalancing/blob/main/NLP%20-%20Class%20Imbalanced.ipynb
    
def German_translation(x):
    # print(x)
    time.sleep(1)
    german_translation = translator.translate(x, dest='de')
    
    return german_translation.text

def English_translation(x):
    # print(x)
    time.sleep(1)
    english_translation = translator.translate(x, dest='en')
    
    return english_translation.text


In [5]:
# Drop Nulls & I can't tell
data_df.dropna(subset=['tweet_text'],inplace=True)
data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell" ]

In [6]:
# Encode targets
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

In [7]:
'''# Train test split before stemming for purposes of translation
X = data_df[['tweet_text', 'sentiment_target']]
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42)'''

"# Train test split before stemming for purposes of translation\nX = data_df[['tweet_text', 'sentiment_target']]\ny = data_df['sentiment_target']\n\nX_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42)"

In [8]:
# Create a dataframe of negatives from training data
'''X_train_df = X_train.to_frame()
neg_df = X_train_df.copy()
neg_df['sentiment_target'] = y_train
neg_df = neg_df[neg_df.sentiment_target == 0]

neg_X = neg_df.drop('sentiment_target', axis=1)
neg_y = neg_df.sentiment_target

y_train_over = pd.concat([y_train, neg_y])'''

"X_train_df = X_train.to_frame()\nneg_df = X_train_df.copy()\nneg_df['sentiment_target'] = y_train\nneg_df = neg_df[neg_df.sentiment_target == 0]\n\nneg_X = neg_df.drop('sentiment_target', axis=1)\nneg_y = neg_df.sentiment_target\n\ny_train_over = pd.concat([y_train, neg_y])"

In [9]:
#X_train_df.info()

In [10]:
# Translate to negative tweet text in train to German
#translator = Translator()

#neg_X.tweet_text = neg_X.tweet_text.apply(lambda x: German_translation(x))

In [11]:
# Save output
# neg_X.tweet_text.to_csv(r'neg_de_train.csv')

In [12]:
# Translate back into English
#neg_X.tweet_text = neg_X.tweet_text.apply(lambda x: English_translation(x))

In [13]:
# Save output for easier reuse
# neg_X.tweet_text.to_csv(r'negtrain2.csv')

In [14]:
#X_train_over.info()

In [15]:
# Create stem'd versions
#X_train_over.reset_index(inplace=True)
#X_train_over['stemmed_tokens'] = X_train_over['tweet_text'].map(lambda x:doc_preparer(x,stem=True))

In [16]:
'''# RFC with data_df
X = data_df['stemmed_tokens']
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)'''

"# RFC with data_df\nX = data_df['stemmed_tokens']\ny = data_df['sentiment_target']\n\nX_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3)"

In [17]:
'''# Default DF
tvec = TfidfVectorizer()

X_train_vec = tvec.fit_transform(X_train)
X_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)
X_train_vec.columns = sorted(tvec.vocabulary_)
X_train_vec.set_index(y_train.index, inplace=True)

rfc = RandomForestClassifier()

tvec_rfc_cvResults = cross_validate(rfc,
                                    X_train_vec,
                                    y_train,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_rfc_cvResults)'''

"# Default DF\ntvec = TfidfVectorizer()\n\nX_train_vec = tvec.fit_transform(X_train)\nX_train_vec = pd.DataFrame.sparse.from_spmatrix(X_train_vec)\nX_train_vec.columns = sorted(tvec.vocabulary_)\nX_train_vec.set_index(y_train.index, inplace=True)\n\nrfc = RandomForestClassifier()\n\ntvec_rfc_cvResults = cross_validate(rfc,\n                                    X_train_vec,\n                                    y_train,\n                                    scoring=('accuracy', 'f1_macro'),\n                                    cv=5,\n                                    verbose=1,\n                                    n_jobs=-2,\n                                    return_train_score=True)\n\ncv_printScores(tvec_rfc_cvResults)"

In [18]:
# Test with oversampled neg DF. Same params and setup (will be variation from train/test split)
'''X_neg = over_neg_df['stemmed_tokens']
y_neg = over_neg_df['sentiment_target']

X_neg_train, X_neg_test, y_neg_train, y_neg_test = train_test_split(X_neg, y_neg,test_size = 0.3)

tvec_neg = TfidfVectorizer()

X_neg_train_vec = tvec_neg.fit_transform(X_neg_train)
X_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)
X_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)
X_neg_train_vec.set_index(y_neg_train.index, inplace=True)

rfc_neg = RandomForestClassifier()

tvec_neg_rfc_cvResults = cross_validate(rfc_neg,
                                    X_neg_train_vec,
                                    y_neg_train,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_neg_rfc_cvResults)'''

"X_neg = over_neg_df['stemmed_tokens']\ny_neg = over_neg_df['sentiment_target']\n\nX_neg_train, X_neg_test, y_neg_train, y_neg_test = train_test_split(X_neg, y_neg,test_size = 0.3)\n\ntvec_neg = TfidfVectorizer()\n\nX_neg_train_vec = tvec_neg.fit_transform(X_neg_train)\nX_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)\nX_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)\nX_neg_train_vec.set_index(y_neg_train.index, inplace=True)\n\nrfc_neg = RandomForestClassifier()\n\ntvec_neg_rfc_cvResults = cross_validate(rfc_neg,\n                                    X_neg_train_vec,\n                                    y_neg_train,\n                                    scoring=('accuracy', 'f1_macro'),\n                                    cv=5,\n                                    verbose=1,\n                                    n_jobs=-2,\n                                    return_train_score=True)\n\ncv_printScores(tvec_neg_rfc_cvResults)"

In [19]:
#X_train_over

In [20]:
#X_train_over.drop('tweet_text', axis=1, inplace=True)

In [21]:
#X_train_over.reset_index(inplace=True)

In [22]:
#X_train_over.drop('index', axis=1)

In [23]:
# y_train_over.reset_index(drop=True, inplace=True)

In [24]:
'''tvec_neg = TfidfVectorizer()

X_neg_train_vec = tvec_neg.fit_transform(X_train_over.stemmed_tokens)
X_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)
X_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)
X_neg_train_vec.set_index(y_train_over.index, inplace=True)

rfc_neg = RandomForestClassifier()

tvec_neg_rfc_cvResults = cross_validate(rfc_neg,
                                    X_neg_train_vec,
                                    y_train_over,
                                    scoring=('accuracy', 'f1_macro'),
                                    cv=5,
                                    verbose=1,
                                    n_jobs=-2,
                                    return_train_score=True)

cv_printScores(tvec_neg_rfc_cvResults)'''

"tvec_neg = TfidfVectorizer()\n\nX_neg_train_vec = tvec_neg.fit_transform(X_train_over.stemmed_tokens)\nX_neg_train_vec = pd.DataFrame.sparse.from_spmatrix(X_neg_train_vec)\nX_neg_train_vec.columns = sorted(tvec_neg.vocabulary_)\nX_neg_train_vec.set_index(y_train_over.index, inplace=True)\n\nrfc_neg = RandomForestClassifier()\n\ntvec_neg_rfc_cvResults = cross_validate(rfc_neg,\n                                    X_neg_train_vec,\n                                    y_train_over,\n                                    scoring=('accuracy', 'f1_macro'),\n                                    cv=5,\n                                    verbose=1,\n                                    n_jobs=-2,\n                                    return_train_score=True)\n\ncv_printScores(tvec_neg_rfc_cvResults)"

## Take Three: Pipeline to avoid data leakage in crossval step

In [25]:
# Including sentinment target with X_train for use in backtranslation of negatives
X = data_df[['tweet_text', 'sentiment_target']]
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42)

# Remove extraneaous column from X_test
X_test.drop('sentiment_target', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [30]:
def back_translation(df):
    temp = df
    
    temp.tweet_text = temp.tweet_text.apply(lambda x: German_translation(x))
    temp.tweet_text = temp.tweet_text.apply(lambda x: English_translation(x))
    
    return temp

def back_trans_neg(Xtrain):
    
    # Create DF of only negatives tweets in train
    X_train_df = Xtrain
    # print(X_train_df.head())
    neg_df = X_train_df.copy()
    neg_df = neg_df[neg_df.sentiment_target == 0]
    X_train_df = X_train_df.drop('sentiment_target', axis=1)
    
    # Translate
    neg_X = back_translation(neg_df)
    X_train_back = pd.concat([X_train_df, neg_X])
    X_train_back.reset_index(drop=True, inplace=True)
    
    # Update Y in pipeline??
    '''neg_y = neg_df.sentiment_target
    y_train_back = pd.concat([y_train, neg_y])
    y_train_back.reset_index(drop=True, inplace=True)
    y_train = y_train_back'''
    
    return X_train_back

def stemmatic(Xtrain):
    X_train_stem = Xtrain
    X_train_stem['stemmed_tokens'] = X_train_stem['tweet_text'].map(lambda x:doc_preparer(x,stem=True))
    X_train_stem.drop('tweet_text', axis=1, inplace=True)
    
    return X_train_stem    

In [31]:
"""translator = Translator()

functest = back_trans_neg(X_train)
functest"""

# Worked but took ~15 minutes, impractical for pipeline

Unnamed: 0,tweet_text,sentiment_target
0,"Tech Check podcast -- #SxSW #Android passes #BlackBerry, a big Twitter #fail! -- {link} by @mention #sxsw #cnn",
1,"In honor of Apple's #SXSW pop-up shop, here are some thoughts on how landlords &amp; leasing agents can utilize pop-up shops. {link}",
2,"RT @mention Hoot! New Blog post: HootSuite Mobile for #SXSW ~ Updates for iPhone, BlackBerry &amp; Android {link}",
3,RT @mention @mention 3 iPhone Apps We'll Be Using at South By Southwest Interactive {link} #SXSW #SXSWi,
4,"#sxsw: @mention intrvw @mention &quot;Schmidt [Google CEO] told me: u'r good at telling stories; go talk to lots of ppl, tell us what u hear&quot;",
...,...,...
6353,"Lunch with @mmention at #cnngrill. View from the HTML5 developer trenches: Android is painful, iOS is slim (for what @mmention does) #sxsw",0.0
6354,"New iPhone car correction has already tried to ""change colleagues"". & Quot; vissigots. & Quot;",0.0
6355,@MENTION Google Circles will be lame.#sxsw & lt;3,0.0
6356,Visitor @MENTION IPAD Design Headache #SXSW {Link}},0.0


In [32]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer

trans_func = FunctionTransformer(back_trans_neg)
stem_func = FunctionTransformer(stemmatic)

translator = Translator()

T_pipe = Pipeline(steps = [
                        ('tf', trans_func),
                        ('stem', stem_func),
                        ('rfc', RandomForestClassifier(random_state=42))
                        ])

stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=42)
    
param_grid = {'rfc__max_depth':[10, 100]}
grid_search = GridSearchCV(estimator=T_pipe,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=stratified_kfold,
                           n_jobs=-2)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_

PicklingError: Could not pickle the task to send it to the workers.