In [433]:
###Code Below based off of Data 100 Project 1'

In [434]:
import pandas as pd
import numpy as np
import re
df = pd.read_csv('uncleanedds.csv')

In [435]:
df = df.drop(columns = ["Unnamed: 0"])
df = df[~df['full_text'].str.startswith('RT')]
df = df[~df['full_text'].str.startswith('LIVE')]
df = df[~df['full_text'].str.startswith('ICYMI')]
df = df[~df['full_text'].str.startswith('WATCH')]
df['no_punc_text'] = df['full_text']

In [436]:
def removeatusernames(tweet):
    return re.sub(r'(@[A-Za-z0-9]+)','',tweet)

In [437]:
def removelinks(tweet):
    return re.sub(r'http\S+','',tweet)

In [438]:
def removehashtags(tweet):
    return re.sub(r'(#[A-Za-z0-9]+)','',tweet)

In [439]:
def removeemojis(tweet):
    return tweet.encode('ascii', 'ignore').decode('ascii')

In [440]:
def removePuncs(tweet):
    return re.sub(r'[^\w\s]','', tweet)

In [441]:
df['full_text'] = df['full_text'].apply(removelinks).apply(removeatusernames).apply(removehashtags)

In [442]:
df['no_punc_text'] = df['no_punc_text'].apply(removelinks).apply(removeatusernames).apply(removehashtags).apply(removePuncs).apply(removeemojis)

In [443]:
df = df[~df['no_punc_text'].str.startswith(' ')]
df = df[~df['full_text'].str.startswith(' ')]

In [444]:
df['full_text'] = df['full_text'].str.lower()
df['no_punc_text'] = df['no_punc_text'].str.lower()

In [445]:
sent = pd.read_csv('vader_lexicon.txt', delimiter="\t", names=['token', 'polarity', 'Col 3', 'Col 4']).loc[:, 'token':'polarity'].set_index('token')

In [447]:
no_duplicate = sent.reset_index()
no_duplicate = no_duplicate.drop_duplicates(subset='token', keep='first').set_index('token')
r = sent.merge(tidy_format, how='inner', left_index=True, right_on='word')
polarity_by_id = r.groupby(r.index)[['polarity']].sum()
polarity = df.merge(polarity_by_id, how='outer', left_index=True, right_index=True).fillna(0)
df['polarity'] = polarity['polarity']

In [448]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores_1(sentence):
    score = analyser.polarity_scores(sentence)
    return score['compound']

In [449]:
df['pol'] = df['full_text'].apply(sentiment_analyzer_scores_1)

In [450]:
df.head()

Unnamed: 0,id,full_text,name,no_punc_text,polarity,pol
0,1113147225239040000,real protective wit my soul where u been,ariana,real protective wit my soul where u been,0.0,0.0
1,1113146485925744641,what’s your fav lyric 🌪🌬,ariana,whats your fav lyric,4.4,0.4588
3,1113140084637786113,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ,ariana,we love u thank u o,4.7,0.8065
4,1113139911106813952,u deserve the world ! only up from here chicoo...,ariana,u deserve the world only up from here chicooo...,0.0,0.0
8,1112865178981416960,a full dad wrote this,ariana,a full dad wrote this,0.0,0.0


In [451]:
from textblob import TextBlob
sentiment_objects = [TextBlob(tweet) for tweet in df['no_punc_text']]
sentiment_values = [[tweet.sentiment.polarity, str(tweet)] for tweet in sentiment_objects]
sentiment_df = pd.DataFrame(sentiment_values, columns=["polarity", "text"])

In [452]:
sentiment_df['sentiment'] = [1 if x >= 0 else 0 for x in sentiment_df['polarity']]

In [453]:
positive_tweets = sentiment_df[sentiment_df['polarity'] >= 0].reset_index()

In [454]:
negative_tweets = sentiment_df[sentiment_df['polarity'] < 0].reset_index()


671

In [455]:
test = [positive_tweets[2200:4400],negative_tweets[300:600]]
train = [positive_tweets[0:2200],negative_tweets[0:300]]


In [471]:
test_sent = pd.concat(test)
train_sent = pd.concat(train)
alltogether = pd.concat([train_sent, test_sent])

In [475]:
X_train = alltogether.iloc[0:2500, 2].values
y_train = alltogether.iloc[0:2500, 3].values
X_test = alltogether.iloc[2500:, 2].values
y_test = alltogether.iloc[2500:,3].values

In [458]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
total_reviews = X_train + X_test
tokenizer.fit_on_texts(total_reviews)
max_length = max([len(s.split()) for s in total_reviews])
vocab_size = len(tokenizer.word_index) + 1
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length, padding = 'post')

In [459]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras import optimizers

EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length = max_length))
model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [460]:
model.fit(X_train_pad, y_train, batch_size = 128, epochs = 25, validation_data=(X_test_pad, y_test), verbose = 1)

Train on 2500 samples, validate on 2500 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a5701dd68>

In [469]:
testsample1 = 'I dont know how to feel right now. I feel very hurt'
testsample2 = 'this movie really sucks! can i get my money back please'
tsts = [testsample1,testsample2]
test_samples_tokens = tokenizer.texts_to_sequences(tsts)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen= max_length)
model.predict(x = test_samples_tokens_pad)

array([[0.5877866 ],
       [0.58541316]], dtype=float32)