In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from keras.utils import to_categorical
import re
import os

In [11]:
data = pd.read_csv(os.path.join(os.getcwd(),"data/train.csv"),encoding='latin-1')
data.columns = ["polarity","id","date","query","user","text"]

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]
data['text'] = data['text'].apply(lambda x: x.lower())
# def cleanupat(string):
#     newstring = [word for word in string.split() if word[0]!='@']
#     return " ".join(newstring)
data['text'] = data['text'].apply(lambda x: " ".join([word for word in x.split() if word[0]!='@']))
data['text'] = data['text'].apply(lambda x: " ".join([word for word in x.split() if word not in (stopwords)]))
print(data[data['polarity']==0].size)
# print(data[data['polarity']==1].size)
# print(data[data['polarity']==2].size)
# print(data[data['polarity']==3].size)
print(data[data['polarity']==4].size)

4799994
4800000


In [6]:
max_features = 3000
max_len = 50
tokenizer = Tokenizer(num_words=max_features, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, maxlen = max_len)
X.shape

(1599999, 50)

In [7]:
embed_dim = 128
lstm_out = 196

# model = Sequential()
# model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(2,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])


model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])



print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 128)           384000    
                                                                 
 spatial_dropout1d (Spatial  (None, 50, 128)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 2)                 130       
                                                                 
Total params: 433538 (1.65 MB)
Trainable params: 433538 (1.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [8]:
Y = pd.get_dummies(data['polarity']).values
smallsize = 10000
X_small = X[np.random.choice(X.shape[0], smallsize, replace=False)]
Y_small = Y[np.random.choice(Y.shape[0], smallsize, replace=False)]
X_train, X_test, Y_train, Y_test = train_test_split(X_small,Y_small, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(6700, 50) (6700, 2)
(3300, 50) (3300, 2)


In [9]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 12, batch_size=batch_size, verbose = 2)

Epoch 1/12
210/210 - 10s - loss: 0.6937 - accuracy: 0.5033 - 10s/epoch - 46ms/step
Epoch 2/12
210/210 - 9s - loss: 0.6879 - accuracy: 0.5570 - 9s/epoch - 42ms/step
Epoch 3/12
210/210 - 9s - loss: 0.6676 - accuracy: 0.5951 - 9s/epoch - 42ms/step
Epoch 4/12
210/210 - 9s - loss: 0.6398 - accuracy: 0.6328 - 9s/epoch - 42ms/step
Epoch 5/12
210/210 - 9s - loss: 0.6096 - accuracy: 0.6693 - 9s/epoch - 42ms/step
Epoch 6/12
210/210 - 9s - loss: 0.5837 - accuracy: 0.6831 - 9s/epoch - 43ms/step
Epoch 7/12
210/210 - 9s - loss: 0.5647 - accuracy: 0.7006 - 9s/epoch - 43ms/step
Epoch 8/12
210/210 - 9s - loss: 0.5447 - accuracy: 0.7142 - 9s/epoch - 42ms/step
Epoch 9/12
210/210 - 9s - loss: 0.5142 - accuracy: 0.7367 - 9s/epoch - 42ms/step
Epoch 10/12
210/210 - 9s - loss: 0.5019 - accuracy: 0.7454 - 9s/epoch - 42ms/step
Epoch 11/12
210/210 - 9s - loss: 0.4887 - accuracy: 0.7584 - 9s/epoch - 42ms/step
Epoch 12/12
210/210 - 9s - loss: 0.4574 - accuracy: 0.7724 - 9s/epoch - 42ms/step


<keras.src.callbacks.History at 0x299f49b50>

In [18]:
validation_size = 100
def chooserandom(matrix, n):
    return matrix[np.random.choice(matrix.shape[0],n,replace=False)]
# X_validate = chooserandom(X_test,validation_size)
# Y_validate = chooserandom(X_test,validation_size)
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]

score,acc = model.evaluate(X_validate, Y_validate, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

32/32 - 0s - loss: 0.9672 - accuracy: 0.4920 - 310ms/epoch - 10ms/step
score: 0.97
acc: 0.49


In [19]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 0)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 46.07843137254902 %
neg_acc 52.44897959183673 %


In [None]:
sometestthing = tokenizer.texts_to_sequences(["Looks like you're having a good day"])
sometestthing = pad_sequences(sometestthing, maxlen=max_len, dtype='int32', value=0)
sometestthing.shape

In [17]:
testvalue = input()
sometestthing = tokenizer.texts_to_sequences([testvalue])
sometestthing = pad_sequences(sometestthing, maxlen=max_len, dtype='int32', value=0)
sometestthing.shape
sentiment = model.predict(sometestthing, batch_size=1,verbose = 0)
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

i am so proud of you <3
positive


In [None]:
def maketest():
    testvalue = input("Input your prompt:").lower()
    sometestthing = tokenizer.texts_to_sequences([testvalue])
    sometestthing = pad_sequences(sometestthing, maxlen=max_len, dtype='int32', value=0)
    sometestthing.shape
    sentiment = model.predict(sometestthing, batch_size=1,verbose = 0)
    if(np.argmax(sentiment) == 0):
        print("negative")
    elif (np.argmax(sentiment) == 1):
        print("positive")
maketest()
maketest()

In [2]:
import profanity_check