In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow import keras
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from nltk.tokenize import TweetTokenizer

import re

def is_not_url(word):
    """Return True when `word` does not begin with "http" or "https"."""
    return not word.startswith(("http", "https"))

# Emoji and pictographic symbols to strip, listed one Unicode block at a time.
# A single catch-all range from U+24C2 up to U+1F251 would also swallow CJK,
# kana, Hangul and other ordinary text that lives between those code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U000024C2"             # circled latin capital letter M
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return `string` with emoji and pictographic symbols removed.

    Only the symbol ranges listed in ``_EMOJI_PATTERN`` are deleted; letters
    of any script (including CJK, kana and Hangul) are preserved.

    Args:
        string: Text to clean.

    Returns:
        A copy of `string` without the matched symbols.
    """
    return _EMOJI_PATTERN.sub(r'', string)


def remove_hashtag(string):
    """Delete every '#' character from `string`, keeping the tag text itself."""
    hash_sign = re.compile(r"#")
    return hash_sign.sub("", string)

def only_special(s):
    """Tell whether `s` is non-empty and made solely of underscores and non-word characters."""
    matched = re.match(r'^[_\W]+$', s)
    return matched is not None

def only_number(s):
    """Report whether the leading run of word characters in `s` contains a digit.

    Despite the name, the token does not have to be purely numeric: values
    such as ``"abc1"`` or ``"10h"`` also yield True.
    """
    digit_in_leading_word = re.match(r'\w*\d\w*', s)
    return digit_in_leading_word is not None


def filter_text(column):
    """Tokenize every text in `column` and keep only the tokens that pass all filters.

    Each text has emojis and '#' characters removed, then is split by a
    TweetTokenizer configured with preserve_case=False, reduce_len=True and
    strip_handles=True. A token is kept when it is longer than 3 characters,
    is accepted by is_not_url, and is rejected by both only_special and
    only_number.

    Args:
        column: Iterable of text strings.

    Returns:
        A list holding one tuple of surviving tokens per input text.
    """
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    def keep(token):
        # Same checks, in the same order, as the successive filtering passes:
        # length, URL, symbols-only, contains-digit.
        return (
            len(token) > 3
            and is_not_url(token)
            and not only_special(token)
            and not only_number(token)
        )

    return [
        tuple(
            token
            for token in tokenizer.tokenize(remove_hashtag(remove_emoji(text)))
            if keep(token)
        )
        for text in column
    ]

#remove empty text lines from dataSet
def remove_empty_text(df):
    """Return the rows of `df` whose 'text' entry has a length greater than zero."""
    has_content = df['text'].map(len) > 0
    return df[has_content]

#filter target values to 1 if Positivo or 0 otherwise
def filter_y(s):
    """Encode a label as an integer: 1 for 'Positivo', 0 for any other value."""
    return 1 if s == 'Positivo' else 0

#preprocessing inputs dataframe
def preprocessing_dataframe (df):
    """Clean a frame with 'text' and 'target' columns and binarize the target.

    Steps, in order: drop rows with empty 'text', drop duplicate rows, drop
    rows whose target is 'Neutro', then map the remaining targets through
    filter_y (1 for 'Positivo', 0 otherwise).

    Args:
        df: DataFrame with 'text' and 'target' columns.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    df = remove_empty_text(df)
    df = df.drop_duplicates()
    # Select with a boolean mask instead of dropping by index label: a
    # label-based drop would also remove non-'Neutro' rows that happen to share
    # a label with a 'Neutro' row when the index is not unique.
    # .copy() gives an independent frame so the column assignment below cannot
    # trigger SettingWithCopyWarning.
    df = df[df['target'] != 'Neutro'].copy()
    df['target'] = df['target'].map(filter_y)
    return df

#preprocessing input csv
def preprocessing_input (all_data):
    """Build the cleaned training frame from the raw tweets table.

    Reads the 'Text' and 'Classificacao' columns of `all_data`, tokenizes the
    text with filter_text and passes the result through
    preprocessing_dataframe. `all_data` itself is left unmodified.

    Args:
        all_data: DataFrame containing 'Text' and 'Classificacao' columns.

    Returns:
        The DataFrame produced by preprocessing_dataframe, with columns
        'text' (token tuples) and 'target'.
    """
    # Keep the tokens in their own Series (aligned on the original index)
    # rather than overwriting the caller's 'Text' column.
    tokens = pd.Series(filter_text(all_data['Text']), index=all_data.index)
    columns = { 'text': tokens, 'target': all_data['Classificacao'] }
    train_data = pd.DataFrame(data=columns)
    return preprocessing_dataframe (train_data)


# Read the raw tweets export, then run the full cleaning pipeline over it.
raw_tweets = pd.read_csv('/kaggle/input/tweets-from-mgbr/Tweets_Mg.csv')
train_data = preprocessing_input(raw_tweets)
train_data.head(50)

In [None]:
# Hold out 33% of the cleaned tweets for evaluation; the fixed random_state
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['text'], train_data['target'], test_size=0.33, random_state=23
)

# TfidfVectorizer works on raw strings, so join each token tuple back into one
# space-separated document.
train_corpus = [" ".join(line) for line in X_train]
test_corpus = [" ".join(line) for line in X_test]

# Uni- to tri-gram TF-IDF, vocabulary capped at 200 features.
tfidf = TfidfVectorizer(min_df=1, ngram_range=(1, 3), use_idf=True, max_features=200)

# The vocabulary is learned from the training documents only; the test
# documents are projected onto that same vocabulary.
features_train = tfidf.fit_transform(train_corpus)
features_test = tfidf.transform(test_corpus)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on older releases that do not provide it yet.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
frame_train = pd.DataFrame(features_train.toarray(), columns=feature_names)
frame_test = pd.DataFrame(features_test.toarray(), columns=feature_names)

In [None]:
import tensorflow as tf

# Small feed-forward classifier: a 16-unit ReLU hidden layer followed by a
# 2-way softmax output.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])

# Targets are integer class ids (0/1), hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 30 epochs, reporting metrics on the held-out frame after each one.
history = model.fit(
    frame_train,
    y_train,
    epochs=30,
    validation_data=(frame_test, y_test),
)

model.summary()

In [None]:
# Score the trained model on the held-out frame and print the two values it
# returns, unpacked here as loss and accuracy.
test_scores = model.evaluate(frame_test, y_test)
val_loss, val_acc = test_scores
print(val_loss, val_acc)

In [None]:
# Model outputs for every held-out sample. The frame is passed directly, the
# same way fit() and evaluate() receive it; wrapping a single input in a list
# is only meaningful for multi-input models.
predictions = model.predict(frame_test)
print (predictions)

In [None]:
# Metrics recorded by model.fit, keyed by metric name; the keys are displayed
# as the cell output to show which series are available.
history_dict = history.history
history_dict.keys()

In [None]:
import matplotlib.pyplot as plt

# Pull the recorded series out of the training history.
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots()
# 'bo' draws blue dots, 'b' draws a solid blue line.
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()

plt.show()
