In [1]:

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:

cols = ['sentiment','id','date','query','user','text']

df = pd.read_csv("/content/gdrive/MyDrive/Cuarto Año/1º Cuatrimestre/Inteligencia artificial en las organizaciones/prácticas/práctica final/modelo/dataset_spanish.csv",header=None, names=cols,encoding="latin")

df.loc[df['sentiment'] == 4, 'sentiment'] = 1

In [7]:
df.shape

(5737, 6)

In [8]:
df[df["sentiment"] == 1]

Unnamed: 0,sentiment,id,date,query,user,text
2,1.0,0.75,@marodriguezb Gracias MAR,,,
5,1.0,1.0,Toca @crackoviadeTV3 . GrabaciÃ³n dl especial ...,,,


In [9]:
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,,sentiment,text,,,
1,0.0,0.5,@PauladeLasHeras No te libraras de ayudar me/n...,,,
2,1.0,0.75,@marodriguezb Gracias MAR,,,
3,2.0,0.0,"Off pensando en el regalito Sinde, la que se v...",,,
4,3.0,1.0,Conozco a alguien q es adicto al drama! Ja ja ...,,,


In [10]:
df['sentiment'].value_counts()

1.0       2
5327.0    1
3601.0    1
1169.0    1
671.0     1
         ..
4895.0    1
5208.0    1
5296.0    1
5456.0    1
0.0       1
Name: sentiment, Length: 5735, dtype: int64

In [11]:
# preprocessing

import re
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

TEXT_CLEANING_REGULAR = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

nltk.download('stopwords')

stop_words = stopwords.words("spanish")
stemmer = SnowballStemmer("spanish")

def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_REGULAR, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

df.text = df.text.apply(lambda x: preprocess(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
#split data into training and validation
from sklearn.model_selection import train_test_split

TRAIN_SIZE = 0.8

train_df, test_df = train_test_split(df, test_size=1-TRAIN_SIZE, random_state=42)

y_train = train_df.sentiment.to_numpy()
y_test = test_df.sentiment.to_numpy()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

#vectorize by turning each text into a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df.text)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)


Total words 290419


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

SEQUENCE_LENGTH = 50

x_train = pad_sequences(tokenizer.texts_to_sequences(train_df.text), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_df.text), maxlen=SEQUENCE_LENGTH)


In [None]:
from tensorflow.keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(vocab_size, 32, input_length=SEQUENCE_LENGTH))
model.add(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(layers.Dense(1,activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            9293408   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 9,318,305
Trainable params: 9,318,305
Non-trainable params: 0
_________________________________________________________________


In [None]:
optim = keras.optimizers.Adam(lr=0.001)

model.compile(loss="binary_crossentropy", optimizer=optim, metrics="accuracy")

  super(Adam, self).__init__(name, **kwargs)


In [None]:
model.fit(x_train, y_train, epochs=5, validation_split=0.1,verbose=1,batch_size=1000)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7b09285250>

In [None]:
score = model.evaluate(x_test, y_test, batch_size=1000)



In [None]:
import pickle

model.save("/content/gdrive/MyDrive/Cuarto Año/1º Cuatrimestre/Inteligencia artificial en las organizaciones/prácticas/práctica final/sentiments.h5")

# saving
with open("/content/gdrive/MyDrive/Cuarto Año/1º Cuatrimestre/Inteligencia artificial en las organizaciones/prácticas/práctica final/tokenizer.pkl", 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
#with open('tokenizer.pickle', 'rb') as handle:
#   tokenizer = pickle.load(handle)


In [None]:
def decode_sentiment(score, include_neutral=True):
    if include_neutral:        
        label = "NEUTRAL"
        if score <= 0.4:
            label = "NEGATIVE"
        elif score >= 0.7:
            label = "POSITIVE"

        return label
    else:
        return "NEGATIVE" if score < 0.5 else "POSITIVE"

def predict(text, include_neutral=True):
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict
    score = model.predict([x_test])[0]
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": float(score)}  

In [None]:
predict("I love apples")