In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSV files of the email dataset.
train_df_example = pd.read_csv('../input/emaildataset/train.csv')
# NOTE(review): this bare expression is not the last line of the cell, so Jupyter does not display it.
train_df_example
test_df = pd.read_csv('../input/emaildataset/test.csv')
# Show one raw email body (row label 5).
test_df['email_body'][5]
# NOTE(review): the line below is presumably a running list of noise spotted in that email - verify.
# \n, time like 10:30 AM, email nkkidd@axiamed.com, â€œDonovan Ventures", OOO (28),

In [None]:
# Load the tweet-sentiment-extraction training CSV into train_df.
train_df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [None]:
# Keep only the tweet text and its sentiment label. The explicit .copy() makes
# train_df an independent frame, so columns assigned to it afterwards are
# written to a real DataFrame rather than to a slice of the loaded data
# (the situation pandas reports with SettingWithCopyWarning).
train_df = train_df[['text', 'sentiment']].copy()
train_df

In [None]:
# Histogram of the sentiment column: shows how many rows each label has.
train_df['sentiment'].hist()

In [None]:
import spacy
import re
from nltk.corpus import stopwords
# Matches any single character that is not a lowercase a-z, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile('[^a-z #+_]')
# spaCy pipeline "en_core_web_sm"; the cleaning functions read token lemmas and stop-word flags from it.
nlp = spacy.load("en_core_web_sm")
# Cleaning the tweets

def cleanUpTweet(txt):
    """Normalise a raw tweet into a string of lemmatised, non-stop-word tokens.

    The text is lower-cased and stripped of @mentions, '#' signs, retweet
    markers, http(s) URLs and punctuation; every character matched by
    BAD_SYMBOLS_RE is then replaced by a space. The result is run through the
    spaCy pipeline ``nlp``, each token is replaced by its lemma, and lemmas
    that the spaCy vocabulary flags as stop words are dropped.

    Args:
        txt: The raw tweet text (a str).

    Returns:
        The surviving lemmas joined by single spaces. The string can be empty
        when nothing survives the cleaning.
    """
    txt = txt.lower()
    # Remove mentions
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove hashtags (only the '#' sign; the tag word itself is kept)
    txt = re.sub(r'#', '', txt)
    # Remove retweet markers. The text is already lower-cased and the mention
    # is already gone, so "RT @user: ..." reads "rt : ..." at this point; an
    # upper-case pattern could never match here.
    txt = re.sub(r'\brt : ', '', txt)
    # Remove urls
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', txt)
    # Remove punctuation
    txt = re.sub(r'[^\w\s]', '', txt)
    # Remove bad symbols
    txt = BAD_SYMBOLS_RE.sub(' ', txt)

    # Lemmatise every token, then filter out the stop words.
    lemmas = [token.lemma_ for token in nlp(txt)]
    kept = [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]
    return " ".join(kept)

In [None]:
from tqdm.notebook import tqdm
# Register tqdm with pandas; this provides the progress_apply method used in the cells below.
tqdm.pandas()

In [None]:
# Clean every tweet and store the result in a new 'clean_text' column.
# The cast to str is presumably there so non-string entries (e.g. missing text) do not break .lower() - verify.
train_df['clean_text'] = train_df['text'].astype('str').progress_apply(cleanUpTweet)

In [None]:
# Raw text of the row labelled 5, for comparison with its cleaned version.
train_df['text'][5]

In [None]:
# Cleaned text of the same row (label 5).
train_df['clean_text'][5]

In [None]:
# Drop rows whose cleaned text is empty or whitespace-only. Comparing the
# stripped text against '' also removes strings made of several spaces, which
# exact comparisons against ' ' and '' would let through. The .copy() keeps
# the result an independent frame so that columns assigned to it later are
# not written to a slice (SettingWithCopyWarning).
train_df = train_df[train_df['clean_text'].str.strip() != ''].copy()
train_df

In [None]:
# Runs of anything that is not an ASCII letter; each run collapses to one
# space. The pattern has its own name so that it does not overwrite
# BAD_SYMBOLS_RE, which cleanUpTweet looks up every time it is called.
EMAIL_BAD_SYMBOLS_RE = re.compile(r'[^A-Za-z]+')


def cleanUpEmails(txt):
    """Normalise a raw email body into a string of lemmatised content words.

    The text is lower-cased; newlines become spaces; tokens containing '@'
    (e-mail addresses) and tokens containing a literal ".com" are removed;
    a stand-alone "pm" time suffix is removed; every run of non-letters is
    replaced by one space. The result is run through the spaCy pipeline
    ``nlp``, tokens are replaced by their lemmas, and stop words as well as
    one-character lemmas are dropped.

    Args:
        txt: The raw email body (a str).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    txt = txt.lower()
    txt = txt.replace('\n', ' ')
    # E-mail addresses: any whitespace-delimited token containing '@'.
    txt = re.sub(r'\S*@\S*\s?', '', txt)
    # Web addresses: tokens containing a literal ".com". The dot is escaped on
    # purpose - an unescaped '.' matches any character and would also delete
    # ordinary words such as "welcome" or "company".
    txt = re.sub(r'\S*\.com\b\S*', '', txt)
    # Time suffix "pm": only when it is not part of a longer word, so that
    # "3pm" loses its suffix while "development" is left intact.
    txt = re.sub(r'(?<![a-z])pm(?![a-z])', '', txt)
    txt = EMAIL_BAD_SYMBOLS_RE.sub(' ', txt)

    # Lemmatise every token, then filter out stop words and single characters.
    lemmas = [token.lemma_ for token in nlp(txt)]
    kept = [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]
    return " ".join(lemma for lemma in kept if len(lemma) > 1)

In [None]:
# Raw body of the email labelled 4, used as the cleaning example in the next cell.
test_df['email_body'][4]

In [None]:
# Run the email cleaner on that single email to inspect its output.
cleanUpEmails(test_df['email_body'][4])

In [None]:
# Clean every email body and store the result in a new 'clean_email_body' column.
test_df['clean_email_body'] = test_df['email_body'].progress_apply(cleanUpEmails)

In [None]:
# Display the cleaned email bodies.
test_df['clean_email_body']

# Augmentation

## 1. Synonym Replacement

In [None]:
from nltk.corpus import wordnet

def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Every lemma name of every synset of ``word`` is lower-cased, has '_' and
    '-' turned into spaces and is reduced to the characters a-z and space.
    ``word`` itself is excluded, and so are names that end up empty after
    that filtering.

    Args:
        word: The word to look up.

    Returns:
        A list of distinct synonym strings in sorted order; empty when
        WordNet yields nothing usable.
    """
    allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')
    synonyms = set()

    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().replace("_", " ").replace("-", " ").lower()
            name = "".join(char for char in name if char in allowed_chars).strip()
            # A lemma name made only of digits or symbols is empty after the
            # filtering; it must not be offered as a replacement word.
            if name:
                synonyms.add(name)

    synonyms.discard(word)

    # The iteration order of a set of strings can differ between interpreter
    # runs; sorting makes the returned list reproducible.
    return sorted(synonyms)

In [None]:
import random
def synonym_replacement(words, n=2):
    """Replace up to ``n`` distinct words of a sentence with WordNet synonyms.

    Args:
        words: The sentence as a whitespace-separated string.
        n: Maximum number of distinct words to replace (default 2). Every
            occurrence of a chosen word is replaced by the same synonym.

    Returns:
        The augmented sentence with tokens joined by single spaces. Words
        without synonyms are skipped, so fewer than ``n`` words may end up
        replaced.
    """
    tokens = words.split()
    new_words = tokens.copy()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # shuffle below starts from a reproducible order (a set would not).
    candidates = list(dict.fromkeys(tokens))
    random.shuffle(candidates)
    num_replaced = 0

    for candidate in candidates:
        if num_replaced >= n:  # only replace up to n words
            break

        synonyms = get_synonyms(candidate)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1

    return ' '.join(new_words)

In [None]:
# Try the augmentation on a hand-written sentence; the replacement is chosen at random.
print(f" Example of Synonym Replacement: {synonym_replacement('hey man how are you doing')}")

In [None]:
# Build the synonym-replacement variant of every cleaned tweet in a new 'syn_augm' column.
train_df['syn_augm'] = train_df['clean_text'].progress_apply(synonym_replacement)
train_df

## 2. Random Deletion

In [None]:
def random_deletion(words, p=0.2):
    """Delete each word of a sentence independently with probability ``p``.

    Args:
        words: The sentence as a whitespace-separated string.
        p: Probability of deleting any single word (default 0.2).

    Returns:
        The remaining words joined by single spaces - always a string. A
        sentence with fewer than two words is returned unchanged (apart from
        whitespace normalisation); if every word gets deleted, one randomly
        chosen original word is returned instead.
    """
    tokens = words.split()

    # Nothing sensible can be deleted from an empty or one-word sentence.
    # Join the tokens so that callers get a string here as well, not a list.
    if len(tokens) < 2:
        return ' '.join(tokens)

    # Keep a word only when its random draw exceeds the deletion probability.
    kept = [token for token in tokens if random.uniform(0, 1) > p]

    # If every word was deleted, fall back to a single random word.
    if not kept:
        return random.choice(tokens)

    return ' '.join(kept)

In [None]:
# Display the cleaned tweets that the augmentations below take as input.
train_df['clean_text']

In [None]:
# Build the random-deletion variant of every cleaned tweet in a new 'rand_delete' column.
train_df['rand_delete'] = train_df['clean_text'].progress_apply(random_deletion)
train_df

## 3. Random Swap

In [None]:
def swap_word(new_words):
    """Swap two randomly chosen positions of ``new_words`` in place.

    Lists with fewer than two items are returned untouched. The second
    position is redrawn while it equals the first; once the number of draws
    exceeds three the function gives up and returns the list unswapped.

    Args:
        new_words: A list of words; it is modified in place.

    Returns:
        The same list object that was passed in.
    """
    size = len(new_words)
    if size <= 1:
        return new_words

    first = random.randint(0, size - 1)
    second = first
    draws = 0

    while second == first:
        second = random.randint(0, size - 1)
        draws += 1

        # Bail out after too many draws rather than looping indefinitely.
        if draws > 3:
            return new_words

    new_words[first], new_words[second] = new_words[second], new_words[first]
    return new_words

In [None]:
def random_swap(words, n=2):
    """Perform ``n`` random word swaps on a sentence.

    Args:
        words: The sentence as a whitespace-separated string.
        n: Number of swap attempts to perform (default 2). Each attempt is
            delegated to swap_word.

    Returns:
        The sentence after the swaps, with tokens joined by single spaces.
    """
    tokens = words.split()

    for _ in range(n):
        tokens = swap_word(tokens)

    return ' '.join(tokens)

In [None]:
# Build the random-swap variant of every cleaned tweet in a new 'rand_swap' column.
train_df['rand_swap'] = train_df['clean_text'].progress_apply(random_swap)
train_df

## 4. Random Insertion

In [None]:
def random_insertion(words, n=2):
    """Insert up to ``n`` synonyms of the sentence's own words into it.

    Args:
        words: The sentence as a whitespace-separated string.
        n: Number of insertion attempts (default 2). Each attempt is
            delegated to add_word, which may leave the sentence unchanged.

    Returns:
        The augmented sentence with tokens joined by single spaces.
    """
    tokens = words.split()

    for _ in range(n):
        add_word(tokens)

    return ' '.join(tokens)

def add_word(new_words):
    """Insert a synonym of one randomly chosen word into ``new_words`` in place.

    A word is picked at random and looked up with get_synonyms; this is
    retried up to ten times until a word with at least one synonym is found.
    One of its synonyms is then chosen at random and inserted at a random
    position, which may be the very end of the list. Lists with fewer than
    two words are left untouched, and so is the list when no picked word has
    a synonym.

    Args:
        new_words: A list of words; it is modified in place.

    Returns:
        None.
    """
    if len(new_words) <= 1:
        return

    max_attempts = 10
    synonyms = []
    for _ in range(max_attempts):
        synonyms = get_synonyms(random.choice(new_words))
        if synonyms:
            break
    else:
        # No picked word had a synonym: give up without changing the list.
        return

    # Pick the synonym at random instead of always taking the first one.
    random_synonym = random.choice(synonyms)
    # randint includes its upper bound; len(new_words) as the bound makes
    # appending after the last word a possible outcome too.
    random_idx = random.randint(0, len(new_words))
    new_words.insert(random_idx, random_synonym)

In [None]:
# Build the random-insertion variant of every cleaned tweet in a new 'rand_insert' column.
train_df['rand_insert'] = train_df['clean_text'].progress_apply(random_insertion)
train_df

In [None]:
# The first N_VALID rows are held out for validation; the rest is for training.
N_VALID = 6000
train_part = train_df[N_VALID:]
valid_part = train_df[:N_VALID]

# Baseline (un-augmented) inputs and labels.
X_train = train_part['clean_text']
y_train = train_part['sentiment']
X_valid = valid_part['clean_text']
y_valid = valid_part['sentiment']


def _with_augmented(frame, column):
    """Stack the clean texts of ``frame`` on top of its ``column`` texts, as str."""
    # pd.concat is used because Series.append no longer exists in pandas 2.x.
    return pd.concat([frame['clean_text'], frame[column]]).astype('str')


# One augmented input set per technique: original texts followed by their
# augmented counterparts.
# NOTE(review): the validation sets are augmented as well, so scores obtained
# on them are not directly comparable with the un-augmented baseline - confirm
# that this is intended.
X_train_rand_delete = _with_augmented(train_part, 'rand_delete')
X_valid_rand_delete = _with_augmented(valid_part, 'rand_delete')
X_train_syn_augm = _with_augmented(train_part, 'syn_augm')
X_valid_syn_augm = _with_augmented(valid_part, 'syn_augm')
X_train_rand_insert = _with_augmented(train_part, 'rand_insert')
X_valid_rand_insert = _with_augmented(valid_part, 'rand_insert')
X_train_rand_swap = _with_augmented(train_part, 'rand_swap')
X_valid_rand_swap = _with_augmented(valid_part, 'rand_swap')

# Labels repeated once so that they line up with the stacked inputs above.
y_train_augm = pd.concat([y_train, y_train])
y_valid_augm = pd.concat([y_valid, y_valid])
X_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(x_train, x_val):
    """Fit a TF-IDF vectoriser on the training texts and transform both splits.

    The vectoriser uses unigrams and bigrams (ngram_range=(1, 2)), treats
    every run of non-whitespace characters as a token, and is configured
    with min_df=5 and max_df=0.9. It is fitted on ``x_train`` only and then
    applied to ``x_val``.

    Args:
        x_train: Iterable of training texts.
        x_val: Iterable of validation texts.

    Returns:
        A tuple ``(x_train, x_val)`` holding the TF-IDF features of the two
        inputs.
    """
    tfidf_vectorizer = TfidfVectorizer(
        min_df=5,
        max_df=0.9,
        ngram_range=(1, 2),
        # Raw string: a backslash-S in a plain literal is an invalid escape sequence.
        token_pattern=r'(\S+)',
    )
    x_train = tfidf_vectorizer.fit_transform(x_train)
    x_val = tfidf_vectorizer.transform(x_val)

    return x_train, x_val

In [None]:
# TF-IDF features for the baseline (un-augmented) texts.
X_train_tfidf, X_valid_tfidf = tfidf_features(X_train, X_valid)

In [None]:
# TF-IDF features for the texts stacked with their random-deletion variants.
X_train_tfidf_rand_delete, X_valid_tfidf_rand_delete = tfidf_features(X_train_rand_delete, X_valid_rand_delete)

In [None]:
# TF-IDF features for the texts stacked with their synonym-replacement variants.
X_train_tfidf_syn_augm, X_valid_tfidf_syn_augm = tfidf_features(X_train_syn_augm, X_valid_syn_augm)

In [None]:
# TF-IDF features for the texts stacked with their random-insertion variants.
X_train_tfidf_rand_insert, X_valid_tfidf_rand_insert = tfidf_features(X_train_rand_insert, X_valid_rand_insert)

In [None]:
# TF-IDF features for the texts stacked with their random-swap variants.
X_train_tfidf_rand_swap, X_valid_tfidf_rand_swap = tfidf_features(X_train_rand_swap, X_valid_rand_swap)

# Naive Bayes

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

def NB(X_train_tfidf, y_train, X_valid_tfidf, y_valid):
    """Fit a multinomial Naive Bayes classifier and print its evaluation.

    Prints, in this order: the classification report on the validation data,
    the weighted F1 on the validation data, the weighted F1 on the training
    data, and the validation accuracy. Nothing is returned.

    Args:
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.
        X_valid_tfidf: Validation feature matrix.
        y_valid: Validation labels.
    """
    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)
    valid_pred = model.predict(X_valid_tfidf)

    print(classification_report(y_valid, valid_pred))

    valid_f1 = f1_score(y_valid, valid_pred, average='weighted', zero_division=0)
    print("f1: ", valid_f1)

    # The training score is printed next to the validation score for comparison.
    train_pred = model.predict(X_train_tfidf)
    train_f1 = f1_score(y_train, train_pred, average='weighted', zero_division=0)
    print("f1 train: ", train_f1)

    print("accuracy ", accuracy_score(y_valid, valid_pred))

In [None]:
# Naive Bayes on the baseline (un-augmented) features.
NB(X_train_tfidf, y_train, X_valid_tfidf, y_valid)

In [None]:
# Naive Bayes on the random-deletion features.
NB(X_train_tfidf_rand_delete, y_train_augm, X_valid_tfidf_rand_delete, y_valid_augm)

In [None]:
# Naive Bayes on the synonym-replacement features.
NB(X_train_tfidf_syn_augm, y_train_augm, X_valid_tfidf_syn_augm, y_valid_augm)

In [None]:
# Naive Bayes on the random-swap features.
NB(X_train_tfidf_rand_swap, y_train_augm, X_valid_tfidf_rand_swap, y_valid_augm)

In [None]:
# Naive Bayes on the random-insertion features.
NB(X_train_tfidf_rand_insert, y_train_augm, X_valid_tfidf_rand_insert, y_valid_augm)

# SVM

In [None]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def SVM(X_train_tfidf, y_train, X_valid_tfidf, y_valid):
    """Fit a linear-kernel SVC and print its evaluation.

    Prints, in this order: the classification report on the validation data,
    the weighted F1 on the validation data, the weighted F1 on the training
    data, and the validation accuracy. Nothing is returned.

    Args:
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.
        X_valid_tfidf: Validation feature matrix.
        y_valid: Validation labels.
    """
    classifier = SVC(kernel='linear', decision_function_shape='ovo')
    classifier.fit(X_train_tfidf, y_train)
    predictions = classifier.predict(X_valid_tfidf)

    report = classification_report(y_valid, predictions)
    print(report)

    score_valid = f1_score(y_valid, predictions, average='weighted', zero_division=0)
    print("f1: ", score_valid)

    # Score the training data as well, for comparison with the validation score.
    fitted = classifier.predict(X_train_tfidf)
    score_train = f1_score(y_train, fitted, average='weighted', zero_division=0)
    print("f1 train: ", score_train)

    print("accuracy ", accuracy_score(y_valid, predictions))

In [None]:
# SVM on the baseline (un-augmented) features.
SVM(X_train_tfidf, y_train, X_valid_tfidf, y_valid)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

def RF(X_train_tfidf, y_train, X_valid_tfidf, y_valid):
    """Fit a random forest (max_depth=60, random_state=0) and print its evaluation.

    Prints, in this order: the classification report on the validation data,
    the weighted F1 on the validation data, the weighted F1 on the training
    data, and the validation accuracy. Nothing is returned.

    Args:
        X_train_tfidf: Training feature matrix.
        y_train: Training labels.
        X_valid_tfidf: Validation feature matrix.
        y_valid: Validation labels.
    """
    forest = RandomForestClassifier(max_depth=60, random_state=0)
    forest.fit(X_train_tfidf, y_train)
    y_hat_valid = forest.predict(X_valid_tfidf)

    print(classification_report(y_valid, y_hat_valid))

    f1_valid = f1_score(y_valid, y_hat_valid, average='weighted', zero_division=0)
    print("f1: ", f1_valid)

    # Score the training data as well, for comparison with the validation score.
    y_hat_train = forest.predict(X_train_tfidf)
    f1_train = f1_score(y_train, y_hat_train, average='weighted', zero_division=0)
    print("f1 train: ", f1_train)

    print("accuracy ", accuracy_score(y_valid, y_hat_valid))

In [None]:
# Random forest on the baseline (un-augmented) features.
RF(X_train_tfidf, y_train, X_valid_tfidf, y_valid)