In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the debate-tweet sentiment CSV inside the Kaggle input directory.
SENTIMENT_CSV = '/kaggle/input/first-gop-debate-twitter-sentiment/Sentiment.csv'

# Load the CSV and preview its first three rows.
df = pd.read_csv(SENTIMENT_CSV)
df.head(3)

In [None]:
# Columns excluded from the rest of the analysis.
columns_to_drop = ['candidate_gold', 'relevant_yn_gold', 'sentiment_gold',
                   'subject_matter_gold', 'tweet_coord', 'tweet_location', 'user_timezone',
                   'id', 'tweet_created', 'tweet_id', 'name']

# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head(3)

In [None]:
# Discard every row that still has a missing value, then confirm that no
# column reports any nulls.
df = df.dropna()
df.isna().sum()

In [None]:
# Show the distinct sentiment labels present in the data.
df['sentiment'].unique()

# **Preprocessing Text Data**

In [None]:
import nltk
from nltk.corpus import stopwords

# English stop words from NLTK, stored in a set for the membership tests
# performed in remove_stopwords().
stopwords_set = set(stopwords.words("english"))

def remove_stopwords(doc):
    """Lower-case ``doc`` and drop tokens that are not content words.

    A whitespace-separated token is discarded when it contains ``http``,
    starts with ``@`` or ``#``, equals ``rt``, or is in ``stopwords_set``.

    Args:
        doc: Raw text of a single tweet.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    def is_content_word(token):
        # Links and the retweet marker.
        if 'http' in token or token == 'rt':
            return False
        # Mentions and hashtags.
        if token.startswith(('@', '#')):
            return False
        return token not in stopwords_set

    lowered = (token.lower() for token in doc.split())
    return ' '.join(token for token in lowered if is_content_word(token))

In [None]:
# Lower-case every tweet and strip links, @mentions, #hashtags, the 'rt'
# marker and English stop words.
df['text'] = df['text'].apply(remove_stopwords)

In [None]:
#this function is used to remove the punctuation in the text data
def remove_punctuations(doc):
    """Return ``doc`` with every listed punctuation character removed.

    Args:
        doc: Text to clean.

    Returns:
        A copy of ``doc`` without the punctuation characters listed in the
        function body. Characters that are not listed (for example ``+``,
        ``=`` or ``|``) are left untouched.
    """
    # Raw string: the backslash itself is one of the characters to strip, and
    # spelling it as backslash-comma in a normal string literal is an invalid
    # escape sequence.
    punctuations = r"""!()-[]{};:'"\,“”<>./?@#$%^&*_~"""
    # U+FFFD (the replacement character) is added because it occurs many
    # times in the text data.
    punctuations += "\ufffd"
    # Delete all listed characters in a single pass.
    return doc.translate(str.maketrans("", "", punctuations))

In [None]:
# Strip the punctuation characters from every tweet.
df['text'] = df['text'].apply(remove_punctuations)

In [None]:
# This function removes every token that is not purely alphabetic (so, despite its name, it drops more than just digits).
def remove_digits(doc):
    """Keep only the purely alphabetic tokens of ``doc``.

    The text is split on whitespace; any token for which ``str.isalpha`` is
    false (numbers, mixed tokens such as ``a1``, tokens containing an
    apostrophe, ...) is discarded.

    Args:
        doc: Text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    alphabetic_tokens = filter(str.isalpha, doc.split())
    return ' '.join(alphabetic_tokens)

In [None]:
# Keep only purely alphabetic tokens in every tweet.
df['text'] = df['text'].apply(remove_digits)

In [None]:
#importing libraries for stemming
import re
import nltk
from nltk.stem import SnowballStemmer #general stemmer
# Print the languages supported by the Snowball stemmer, separated by spaces.
print(*SnowballStemmer.languages)

In [None]:
# Use the English Snowball stemmer, consistent with the English stop-word
# list used earlier in the notebook.
stemmer = SnowballStemmer("english")


def stem_document(doc):
    """Stem every whitespace-separated token of ``doc`` and re-join them.

    Args:
        doc: Preprocessed text of a single tweet.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # SnowballStemmer.stem() works on ONE word. Passing a whole tweet makes it
    # treat the text as a single long word, so the stemmer must be applied
    # token by token.
    return ' '.join(stemmer.stem(token) for token in doc.split())


df['text'] = df['text'].apply(stem_document)

In [None]:
# Keep only the rows whose sentiment label is not 'Neutral'.
not_neutral = df['sentiment'].ne('Neutral')
df_final = df.loc[not_neutral]

In [None]:
# Encode the target: 1 for 'Positive', 0 for every other remaining label.
# The labels are read from `df`, whose 'sentiment' column is never overwritten,
# so re-running this cell always gives the same result.
is_positive = df.loc[df_final.index, 'sentiment'].eq('Positive')
# assign() returns a new frame instead of writing into a slice of `df`,
# which avoids pandas' SettingWithCopyWarning.
df_final = df_final.assign(sentiment=is_positive.astype('int64'))

In [None]:
# Predictor (tweet text) and target (encoded sentiment) variables.
X, y = df_final['text'], df_final['sentiment']

In [None]:
# Display the predictor Series (preprocessed tweet text).
X

In [None]:
# Display the target Series (encoded sentiment).
y

# **Preparing data for word2vec**
https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [None]:
# Tokenize the preprocessed tweets: one list of whitespace-separated tokens per tweet.
sent = list(map(str.split, df_final['text']))

In [None]:
# Inspect the tokens of the first tweet.
sent[0]

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Fit a gensim Phrases model on the tokenized tweets; min_count and
# progress_per are passed straight through to gensim.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [None]:
# Wrap the fitted Phrases model in a Phraser, which is then used to transform
# the tokenized tweets.
bigram = Phraser(phrases)

In [None]:
# Apply the phraser to the tokenized tweets; the result is the corpus used to
# build the vocabulary of, and train, the Word2Vec model.
sentences = bigram[sent]

In [None]:
from gensim.models import Word2Vec

In [None]:
# Word2Vec hyper-parameters, gathered in one place.
w2v_params = dict(
    min_count=20,
    window=2,
    vector_size=300,
    alpha=0.03,
    min_alpha=0.0007,
)

# No corpus is passed here; the vocabulary is built and the model trained in
# the following cells.
w2v_model = Word2Vec(**w2v_params)

In [None]:
# Build the Word2Vec vocabulary from the phrase-transformed corpus.
w2v_model.build_vocab(sentences, progress_per=10000)

In [None]:
# Train the embeddings for 30 epochs on the same corpus; total_examples reuses
# the corpus_count stored on the model.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

https://stackoverflow.com/questions/42064690/using-pre-trained-word2vec-with-lstm-for-word-generation

https://www.kaggle.com/guichristmann/lstm-classification-model-with-word2vec

https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM

In [None]:
# Inspect the learned word-vector matrix.
w2v_model.wv.vectors

In [None]:
# Display the model's representation.
w2v_model

In [None]:
# w2v_model.wv.get_vecattr(word, "count")  #gives the count of words/occurence of the word.
def word2token(sentence):
    """Map each word of ``sentence`` to its index in the Word2Vec vocabulary.

    Args:
        sentence: Whitespace-separated, preprocessed text.

    Returns:
        A list with one integer per word. Words missing from
        ``w2v_model.wv.key_to_index`` are encoded as 0.
    """
    vocab_index = w2v_model.wv.key_to_index
    # Unknown words fall back to 0. As the original author noted, 0 is also the
    # index of a real vocabulary entry (the most frequent word) and is the
    # value used for padding later on, so the three cases are indistinguishable.
    # NOTE(review): presumably key_to_index is a plain dict (gensim 4) — confirm.
    return [vocab_index.get(word, 0) for word in sentence.split()]

In [None]:
# Convert every preprocessed tweet into its list of vocabulary indices.
temp = df_final['text'].apply(word2token)

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Pad the per-tweet index lists using pad_sequences' default settings.
# NOTE: this rebinds X, which until now held the text Series assigned earlier.
X = pad_sequences(temp)

In [None]:
# Shape of the padded index array.
X.shape

In [None]:
# First ten padded sequences.
X[0:10]

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

In [None]:
# Dimensions of the word-vector matrix: number of vocabulary entries and
# length of each vector.
vocab_size,embedding_size = w2v_model.wv.vectors.shape

In [None]:
# Trained word vectors, used as the initial weights of the Keras Embedding layer.
pretrained_weights = w2v_model.wv.vectors

In [None]:
# Sanity-check the shape of the weight matrix.
pretrained_weights.shape

In [None]:
from keras.initializers import Constant

# Binary sentiment classifier: pretrained embeddings -> LSTM -> sigmoid unit.
model = Sequential()
# Initialise the embedding matrix from the Word2Vec vectors through an
# initializer: the `weights=` constructor argument is not accepted by newer
# Keras releases, whereas `embeddings_initializer` works across versions.
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    embeddings_initializer=Constant(pretrained_weights)))
model.add(LSTM(32))
# Single sigmoid output: probability of the positive class (label 1).
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Split the dataset into train and test sets: 20% is held out for testing and
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [None]:
# Training hyper-parameters.
# NOTE(review): the original remark "output size = 32" presumably refers to
# the LSTM(32) layer of the model — confirm.
batch_size = 64
epochs = 100
# Fit on the training split only; no validation data is passed.
model.fit(X_train, y_train, epochs = epochs, batch_size=batch_size)

In [None]:
# Model outputs (sigmoid activations) for the held-out test set.
y_pred = model.predict(X_test)

In [None]:
# Peek at the first few predictions. Slicing, rather than indexing 0..9,
# also works when fewer than ten predictions exist.
for probability in y_pred[:10]:
    print(probability)

In [None]:
# Turn the predicted probabilities into hard labels in place:
# values >= 0.5 become 1, everything else becomes 0.
y_pred[...] = y_pred >= 0.5

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the true labels against the thresholded predictions,
# with the classes ordered as [0, 1].
cf_mat = confusion_matrix(y_test, y_pred,labels=[0,1])
cf_mat