### Initial Set-up of DF and functions, adapted from Saad's

In [1]:
import pandas as pd
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 280

import matplotlib.pyplot as plt
from collections import defaultdict


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

import googletrans
import time
from googletrans import Translator

In [2]:
# Stop words
sw = stopwords.words('english')
sw.extend(['sxsw','rt','quot','austin','sxswi','mention','link'])

In [3]:
dataFolder_path = '../../data/'
data_df = pd.read_csv(dataFolder_path+'judge_1377884607_tweet_product_company.csv')

In [4]:
def doc_preparer(doc, stem = False, stop_words=sw):
    '''

    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     doc = pos_tag(doc)
#     doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
#     lemmatizer = WordNetLemmatizer()
#     doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     doc = [word for word in doc if word in lemmed_keywords]
    
    
    
    p_stemmer = nltk.stem.PorterStemmer()
    if stem:
        doc = [p_stemmer.stem(word) for word in doc if p_stemmer.stem(word)]
    return ' '.join(doc)

def cv_printScores(cv_metric):
    print('CV Results')
    print('='*32)
    print('Accuracy')
    print('-'*32)
    print(f"Training accuracy: {cv_metric['train_accuracy'].mean():.3f}")
    print(f"Test accuracy:     {cv_metric['test_accuracy'].mean():.3f}")
    print('F-1 Score')
    print('-'*32)
    print(f"Training F1 score: {cv_metric['train_f1_macro'].mean():.3f}")
    print(f"Test F1 score:     {cv_metric['test_f1_macro'].mean():.3f}")
    
# Functions from https://github.com/NandhiniN85/Class-Imbalancing/blob/main/NLP%20-%20Class%20Imbalanced.ipynb
    
def German_translation(x):
    # print(x)
    time.sleep(1)
    german_translation = translator.translate(x, dest='de')
    
    return german_translation.text

def English_translation(x):
    # print(x)
    time.sleep(1)
    english_translation = translator.translate(x, dest='en')
    
    return english_translation.text


In [5]:
# Drop Nulls & I can't tell
data_df.dropna(subset=['tweet_text'],inplace=True)
data_df = data_df[data_df.is_there_an_emotion_directed_at_a_brand_or_product != "I can't tell" ]

In [6]:
# Encode targets
le = LabelEncoder()
data_df['sentiment_target'] = le.fit_transform(data_df.is_there_an_emotion_directed_at_a_brand_or_product)
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

## Take Three: Pipeline to avoid data leakage in crossval step

In [7]:
# Including sentinment target with X_train for use in backtranslation of negatives
X = data_df[['tweet_text', 'sentiment_target']]
y = data_df['sentiment_target']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=42)

# Remove extraneaous column from X_test
X_test.drop('sentiment_target', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [61]:
def back_translation(df):
    temp = df
    translator = Translator()
    
    temp.tweet_text = temp.tweet_text.apply(lambda x: German_translation(x))
    temp.tweet_text = temp.tweet_text.apply(lambda x: English_translation(x))
    
    return temp

def back_trans_neg(Xtrain):
    # Check index of X_train against index of back translation to see which negatives are in it, 
    Xfromcsv = pd.read_csv('backs_trans_fulltrain.csv')
    Xfromcsv.set_index('Unnamed: 0', inplace=True)
    neg_list = Xfromcsv.index.tolist()
    
    backtransdf = Xtrain[Xtrain.index.isin(neg_list)] 
    print(len(backtransdf.index))
    
    # Append the backtranslated versions of the neg in X_train to X_train
    X_train_back = pd.concat([Xtrain, backtransdf])
    X_train_back.drop('sentiment_target', axis=1, inplace=True)
    
    # Use # of backtranslations added (len), Add that many 0s to y_train  
        
    return X_train_back

def stemmatic(Xtrain):
    #print(type(Xtrain))
    X_train_stem = Xtrain
    X_train_stem['stemmed_tokens'] = X_train_stem['tweet_text'].map(lambda x:doc_preparer(x,stem=True))
    X_train_stem.drop('tweet_text', axis=1, inplace=True)
    #print("Stem end")
    
    return X_train_stem    

In [65]:
X_train.index.nunique()

5998

### Create and Save list of Back Translated Negative Tweets

In [11]:
'''# Make DF of negatives, post train/test split
neg_df = X_train.copy()
neg_df = neg_df[neg_df.sentiment_target == 0]
neg_df = neg_df.drop('sentiment_target', axis=1)

translator = Translator()

neg_X = back_translation(neg_df)'''

In [14]:
# neg_X.to_csv(r'backs_trans_fulltrain.csv')

### Pipeline

In [62]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# Wrap functions
trans_func = FunctionTransformer(back_trans_neg)
stem_func = FunctionTransformer(stemmatic)

# Create CT so tiffdiff gets 1D info
tif_ct = ColumnTransformer(
    [('vec', TfidfVectorizer(), 0)],   # column should be a string or int
    remainder='passthrough'
)


T_pipe = Pipeline(steps = [
                        ('tf', trans_func),
                        ('stem', stem_func),
                        ('tiffydiff', tif_ct),
                        ('rfc', RandomForestClassifier(random_state=42))
                        ], verbose=True)

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=False)
    
param_grid = {'rfc__max_depth':[10, 100]}
grid_search = GridSearchCV(estimator=T_pipe,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=stratified_kfold,
                           n_jobs=-2)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_

360
[Pipeline] ................ (step 1 of 4) Processing tf, total=   0.0s
[Pipeline] .............. (step 2 of 4) Processing stem, total=   2.9s
[Pipeline] ......... (step 3 of 4) Processing tiffydiff, total=   0.1s


ValueError: Found input variables with inconsistent numbers of samples: [6358, 5998]