# ******* Grupo Laura-Borja-Nehomar *******

1. Preparación del Entorno

In [None]:
#Básicos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import itertools
import datetime

from textblob import TextBlob

# NLTK
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import unidecode
import string

from nltk.probability import FreqDist

import json
use_cuda = False

2. Cargamos los datos

In [None]:
# Load the training split and discard rows whose tweet text is missing.
# reset_index(drop=True) renumbers the remaining rows 0..n-1 WITHOUT inserting
# the old index as a new leading 'index' column; that extra column would shift
# every positional column lookup performed later on np.array(train_df).
train_df = (
    pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
    .dropna(subset=['text'])
    .reset_index(drop=True)
)
train_df.head(10)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the loaded training frame.
train_df.info()

3. StopWords

In [None]:
# English stop words from NLTK, stored as a set; text_preprocess filters tokens
# against it with `not in`.
stop_words = set(stopwords.words('english'))

# Contraction -> expansion table used for negation handling in text_preprocess.
# Lookups are done on whole lower-cased, whitespace-separated tokens.
#
# Each key appears exactly once: a dict literal silently keeps only the LAST
# value of a repeated key, which previously hid two duplicated entries.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "i'd" is ambiguous ("i would" / "i had"); "would" is used to stay
    # consistent with he'd / she'd / they'd / we'd / you'd below.
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "i have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Only matches when "'re" stands alone as a token (e.g. "you 're").
    "'re": " are",
    "wasn't": "was not",
    # Was " will", which dropped the subject.
    "we'll": "we will",
}

4. Función principal de pre-procesado de datos

In [None]:
# Patterns are compiled once here instead of on every call.
_MENTION_HASHTAG_RE = re.compile("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)")
# Raw string: same pattern as before, without invalid "\w" / "\d" / "\+" string escapes.
_LINK_RE = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
# NOTE(review): this pattern looks like an HTML-escaped "</?.*?>"; it runs after
# tokenisation has been re-joined with spaces, so it may rarely match — confirm intent.
_TAG_RE = re.compile("&lt;/?.*?&gt;")
_NON_ALPHA_RE = re.compile(r"(\d|\W)+")

# First letter of the POS tag returned by TextBlob -> POS code passed to
# Word.lemmatize; anything else falls back to noun ('n').
_POS_TO_LEMMA_TAG = {"J": 'a',
                     "N": 'n',
                     "V": 'v',
                     "R": 'r'}


def text_preprocess(text):
    """Clean a single tweet and return it as a space-joined string of lemmas.

    Steps, in order: strip @mentions/#hashtags, strip http(s) links, turn
    backticks into apostrophes, cap character runs at two, lower-case, expand
    contractions via ``appos``, drop ``stop_words``, blank out digits and
    non-word characters, transliterate accents with ``unidecode`` and
    lemmatise with TextBlob using each token's POS tag.

    Parameters
    ----------
    text : any
        The raw tweet; it is coerced with ``str()`` first.

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    text = str(text)

    # Remove mentions and hashtags, collapsing the whitespace left behind.
    text = ' '.join(_MENTION_HASHTAG_RE.sub(" ", text).split())

    # Remove http(s) links. Substituting the matches themselves (rather than
    # str.replace on each matched string) cannot damage a longer link that
    # merely shares a prefix with a shorter one.
    text = _LINK_RE.sub('', text)

    # Backticks are often typed in place of apostrophes; normalise them so
    # contractions can be looked up in `appos`.
    text = text.replace("`", "'")

    # Very basic elongation fix: no character may repeat more than twice in a
    # row ("soooo" -> "soo"). Previously this result was computed but then
    # discarded because the pipeline continued from an earlier variable.
    text = ''.join(''.join(run)[:2] for _, run in itertools.groupby(text))

    # Lower-case, then expand contractions (negation handling).
    words = text.lower().split()
    text_neg = " ".join(appos.get(word, word) for word in words)

    # Remove stop words.
    tokens = word_tokenize(text_neg)
    text_nsw = " ".join(tok for tok in tokens if tok not in stop_words)

    # Remove tags.
    text_tags = _TAG_RE.sub(" &lt;&gt; ", text_nsw)

    # Remove special characters and digits.
    text_alpha = _NON_ALPHA_RE.sub(" ", text_tags)

    # Remove accented characters.
    text = unidecode.unidecode(text_alpha)

    # Lemmatise each word using its POS tag.
    blob = TextBlob(text)
    lemmatized_list = [
        word.lemmatize(_POS_TO_LEMMA_TAG.get(pos[0], 'n'))
        for word, pos in blob.tags
    ]

    return " ".join(lemmatized_list)

In [None]:
# Build the cleaned-text column in a single assignment.
# The previous row-by-row `train_df.processed_text[i] = ...` is chained
# assignment: pandas warns about it (SettingWithCopyWarning) and, with
# Copy-on-Write enabled, it does not write back to train_df at all.
train_df['processed_text'] = train_df['text'].apply(text_preprocess)

5. Vemos si el DataSet está desbalanceado

In [None]:
# Horizontal bar chart of how many samples each sentiment label has.
# `plt` is already imported in the setup cell, so it is not re-imported here.
# An explicit figure/axes pair avoids relying on pyplot's implicit current axes.
fig, ax = plt.subplots()
train_df['sentiment'].value_counts(sort=False).plot(kind='barh', ax=ax)
ax.set_xlabel('Número de muestras')
ax.set_ylabel('Etiqueta');  # trailing ';' hides the Text(...) repr in the cell output

6. Wordcloud

In [None]:
from wordcloud import WordCloud


def plot_sentiment_wordcloud(texts, title, colormap, out_path):
    """Render, show and save a word cloud for one sentiment class.

    Parameters
    ----------
    texts : iterable of str
        Pre-processed tweets; they are joined with spaces into one corpus.
    title : str
        Title drawn above the figure.
    colormap : str
        Matplotlib colormap name handed to ``WordCloud``.
    out_path : str
        File the rendered cloud is written to via ``WordCloud.to_file``.

    Returns
    -------
    WordCloud
        The generated word cloud object.
    """
    cloud = WordCloud(width=1000, height=1000, colormap=colormap,
                      background_color='white').generate(' '.join(texts))
    plt.figure(figsize=(20, 10))
    plt.title(title)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    cloud.to_file(out_path)
    return cloud


# Polarity == negative
train_s0 = train_df[train_df.sentiment == 'negative']
wordcloud_neg = plot_sentiment_wordcloud(
    train_s0.processed_text, 'Negative sentiment - Wordcloud',
    'Reds', 'negative_senti_wordcloud.jpg')

# Polarity == neutral
train_s1 = train_df[train_df.sentiment == 'neutral']
wordcloud_neu = plot_sentiment_wordcloud(
    train_s1.processed_text, 'Neutral sentiment - Wordcloud',
    'Blues', 'neutral_senti_wordcloud.jpg')

# Polarity == positive
train_s2 = train_df[train_df.sentiment == 'positive']
wordcloud_pos = plot_sentiment_wordcloud(
    train_s2.processed_text, 'Positive sentiment - Wordcloud',
    'Wistia', 'positive_senti_wordcloud.jpg')

Cargamos el resto de los datos

In [None]:
# Test tweets to predict on, plus the sample submission frame that later
# receives the predicted 'selected_text' column and is saved as submission.csv.
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
sub_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

In [None]:
train = np.array(train_df)
test = np.array(test_df)

!mkdir -p data

Prepare data in QA format


In [None]:
# Minimal hand-written example of the QA structure produced further below:
# one context, with a question (the sentiment) and the answer span to extract.
# 'answer_start' is the 0-based character offset of 'text' inside 'context'
# ("is great" starts at offset 42; the previous value 43 was off by one).
train_data = [
    {
        'context': "This tweet sentiment extraction challenge is great",
        'qas': [
            {
                'id': "00001",
                'question': "positive",
                'answers': [
                    {
                        'text': "is great",
                        'answer_start': 42
                    }
                ]
            }
        ]
    }
    ]

In [None]:
%%time

"""
Prepare training data in QA-compatible format
"""

# Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    """Return the start offsets of every occurrence of ``search_str``.

    Occurrences may overlap: after each hit the scan resumes one character
    past the hit's start. Offsets are returned in ascending order; the list
    is empty when there is no occurrence.
    """
    positions = []
    cursor = 0
    while cursor < len(input_str):
        hit = input_str.find(search_str, cursor)
        if hit == -1:
            break
        positions.append(hit)
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Convert training rows into the nested QA structure written to train.json.

    Rows are read by position: ``row[0]`` is the id, ``row[1]`` the context,
    ``row[2]`` the answer and ``row[-1]`` the question. Callers are expected
    to pass rows laid out accordingly.

    Parameters
    ----------
    train : iterable of indexable rows
        One row per example.

    Returns
    -------
    list of dict
        One ``{'context', 'qas'}`` entry per kept row. The context is
        lower-cased; the single answer holds the lower-cased answer text and
        the offset of its first occurrence in the original-case context.
        Rows whose context, answer or question is not a string are printed
        for inspection and skipped.
    """
    output = []
    for line in train:
        qid, context, answer, question = line[0], line[1], line[2], line[-1]

        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        if not all(isinstance(value, str) for value in (context, answer, question)):
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        # Only the first occurrence is used, so a single str.find replaces
        # collecting every occurrence. An empty context can hold no answer.
        answer_start = context.find(answer) if context else -1

        # NOTE(review): when the answer is not found the entry is still emitted
        # with an empty 'answers' list and is_impossible=False — confirm the
        # training library accepts that.
        answers = []
        if answer_start != -1:
            answers.append({'answer_start': answer_start, 'text': answer.lower()})

        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})

    return output

# Build the QA-formatted training examples and persist them as JSON; the
# training step reads them back from 'data/train.json'.
qa_train = do_qa_train(train)

with open('data/train.json', 'w') as train_json:
    json.dump(qa_train, train_json)

In [None]:
%%time

"""
Prepare testing data in QA-compatible format
"""

def do_qa_test(test):
    """Convert test rows into the nested QA structure used for prediction.

    Rows are read by position: ``row[0]`` is the id, ``row[1]`` the context
    and ``row[-1]`` the question. Every kept entry carries one placeholder
    answer (``'__None__'`` at offset 1000000).

    Parameters
    ----------
    test : iterable of indexable rows
        One row per example.

    Returns
    -------
    list of dict
        One ``{'context', 'qas'}`` entry per kept row, with the context
        lower-cased. Rows whose context or question is not a string are
        printed for inspection and skipped.
    """
    output = []
    for line in test:
        qid, context, question = line[0], line[1], line[-1]

        if not (isinstance(context, str) and isinstance(question, str)):
            # There is no answer column here; the earlier diagnostic printed an
            # undefined `answer` variable and raised NameError instead of skipping.
            print(context, type(context))
            print(question, type(question))
            continue

        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the QA-formatted test examples and keep a JSON copy on disk; the
# prediction step consumes the in-memory `qa_test` list.
qa_test = do_qa_test(test)

with open('data/test.json', 'w') as test_json:
    json.dump(qa_test, test_json)

In [None]:
# Install seqeval and simpletransformers from wheel files stored under
# /kaggle/input (local paths, not PyPI); -q keeps pip's output quiet.
!pip install '/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q

Train model

In [None]:
%%time


# Imported here because simpletransformers is only installed by the pip cell above.
from simpletransformers.question_answering import QuestionAnsweringModel

# Directory holding the pretrained DistilBERT weights, loaded from a local Kaggle input path.
MODEL_PATH = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'

# Create the QuestionAnsweringModel
# The `args` dict carries the training hyperparameters handed to simpletransformers.
# NOTE(review): `use_cuda` is set to False in the setup cell, so this presumably
# trains on CPU — confirm that is intended.
model = QuestionAnsweringModel('distilbert', 
                               MODEL_PATH, 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 192,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=use_cuda)

# Train on the JSON file written by the training-data preparation cell.
model.train_model('data/train.json')

Submission

In [None]:
%%time

# Run the trained model over the in-memory test examples and tabulate the result.
predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)

# NOTE(review): this assignment aligns predictions with sub_df by row index,
# not by tweet id. do_qa_test skips rows whose context or question is not a
# string, so any skipped row would shift the answers relative to sub_df —
# confirm no test row is skipped, or join on the id instead.
sub_df['selected_text'] = predictions_df['answer']

sub_df.to_csv('submission.csv', index=False)

# This message only reflects that submission.csv was written locally.
print("File submitted successfully.")