## In this practice session, we will understand sentiment analysis using logistic regression and Artificial Neural Nets

In [None]:
!python -m pip install pip --upgrade --user -q
!python -m pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn scikit-image nltk tensorflow keras wordcloud --user -q

In [None]:
# Shut the kernel down right after the installation cell so freshly installed packages are
# picked up by the imports below (the `True` argument presumably requests a restart — confirm
# against the ipykernel documentation).
# NOTE(review): this interrupts a "Run All" pass; execution has to be resumed from the next cell.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import nltk
from nltk.stem.porter import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential
from keras import layers

%matplotlib inline

In [None]:
# Load the training tweets and the test tweets from CSV files located in the working directory.
train_data = pd.read_csv('train_tweets.csv') 
test_data = pd.read_csv('test_tweets.csv')

In [None]:
# Target vector: the 'label' column of the training set (split into train/validation parts later on).
y_train = train_data['label']

In [None]:
# Number of training rows; used later to slice the combined feature matrix back into train and test parts.
train_data_len = len(train_data)

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

# Removing Twitter Handles

In [None]:
# Concatenate the training set and the test set into one complete data set so that both
# receive exactly the same text preprocessing.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported equivalent and
# produces the same result (training rows first, then test rows, with a fresh 0..n-1 index).

comp_data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Helper that removes every match of a regular expression from a piece of text. It is used
# below to strip Twitter handles, which don't hold much significance for sentiment.

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The pattern is substituted directly. Re-using each matched string as a new
    regex is unsafe: a match containing a metacharacter (``(``, ``+``, ``.``)
    is misinterpreted, and a short match that is a prefix of a longer one
    (``@a`` vs ``@ab``) leaves part of the longer one behind.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression describing the spans to delete.

    Returns:
        The cleaned string.
    """
    return re.sub(pattern, '', input_txt)

In [None]:
# Store the handle-free text in a new column, `cleaned_tweets`, which all later cleaning steps
# refine to build the vocabulary. `np.vectorize` applies `remove_pattern` to every tweet.
# The pattern is a raw string: in a normal string literal `\w` is an invalid escape sequence,
# which newer Python versions warn about.

comp_data['cleaned_tweets'] = np.vectorize(remove_pattern)(comp_data['tweet'], r"@[\w]*")

In [None]:
# Preview the combined frame, now including the `cleaned_tweets` column.
comp_data.head()

In [None]:
# Replace digits and special characters in `cleaned_tweets` with a space; only letters and '#'
# (needed for the hashtag analysis further down) are kept.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal string by default, in which case this line would silently change nothing.

comp_data['cleaned_tweets'] = comp_data['cleaned_tweets'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [None]:
# Preview the frame after the special-character replacement.
comp_data.head()

## Removing words of length 3 or less

In [None]:
# Very short words such as 'we' or 'i' carry little information, so they are dropped from the
# vocabulary: only words longer than three characters are kept in each tweet.

comp_data['cleaned_tweets'] = comp_data['cleaned_tweets'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

In [None]:
# Preview the frame after the short words were removed.
comp_data.head()

## Tokenizing tweets

In [None]:
# Tokenise the cleaned tweets: every tweet is split on whitespace into its individual words,
# and the resulting list of words is stored at the tweet's row index.

tokenized_tweet = comp_data['cleaned_tweets'].apply(str.split)
tokenized_tweet.head()

In [None]:
# Stemming is the process of removing morphological affixes from words. The Porter stemmer
# created here is applied to every token in the next cell.

stemmer = PorterStemmer()

In [None]:
# Reduce every token to its stem. Different tweets may contain different forms of the same
# word - e.g. 'drags', 'dragging' and 'dragged' for 'drag' - which mean almost the same thing.
#
# Keeping every form would only inflate the corpus and therefore the vocabulary; stemming maps
# the forms onto one core form and so shrinks the number of distinct elements.

# Lemmatisation is the usual alternative to stemming.

tokenized_tweet = tokenized_tweet.apply(lambda tokens: list(map(stemmer.stem, tokens)))  # stemming
tokenized_tweet.head()

In [None]:
# Join the stemmed words of each tweet back into one space-separated string and write the
# result back to the `cleaned_tweets` column for the following steps.

tokenized_tweet = list(map(' '.join, tokenized_tweet))

comp_data['cleaned_tweets'] = tokenized_tweet

In [None]:
# Preview the stemmed, re-joined tweets.
comp_data['cleaned_tweets'].head()

In [None]:
# A word cloud gives a visual impression of the most frequently occurring words in the corpus.

from wordcloud import WordCloud

all_words = ' '.join(comp_data['cleaned_tweets'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the words that appear most often in the positive tweets (label == 0).

positive_words = ' '.join(comp_data.loc[comp_data['label'] == 0, 'cleaned_tweets'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(positive_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the words that appear most often in the negative tweets (label == 1).

negative_words = ' '.join(comp_data.loc[comp_data['label'] == 1, 'cleaned_tweets'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Helper that pulls the hashtags out of a collection of tweets.

def hashtag_extract(x):
    """Collect the hashtags of every tweet in ``x``.

    Args:
        x: Iterable of tweet strings.

    Returns:
        A list with one entry per tweet; each entry is the list of hashtags
        found in that tweet, without the leading '#' (empty if there are none).
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in x]

In [None]:
# extracting hashtags from positive tweets (label == 0)
HT_positive = hashtag_extract(comp_data['cleaned_tweets'][comp_data['label'] == 0])

# extracting hashtags from negative tweets (label == 1)
HT_negative = hashtag_extract(comp_data['cleaned_tweets'][comp_data['label'] == 1])

# Show only the first few entries: the list holds one (often empty) entry per tweet, so
# displaying it in full would flood the cell output.
HT_positive[:10]

In [None]:
# Un-nest the per-tweet hashtag lists into one flat list of hashtags per class.
# Guard against running this cell twice: on already-flattened data the comprehension below
# would split every hashtag into single characters.
assert all(isinstance(tags, list) for tags in HT_positive + HT_negative), \
    "hashtag lists are already flat - re-run the extraction cell first"

# A comprehension flattens in linear time; `sum(lists, [])` copies the growing accumulator on
# every addition and is therefore quadratic in the number of tweets.
HT_positive = [tag for tags in HT_positive for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

HT_positive[:10]

In [None]:
# FreqDist is short for frequency distribution. It counts how often each hashtag occurs in
# the positive tweets; the counts are stored in a DataFrame so they can be drawn as a bar chart.

ht_count_pos = nltk.FreqDist(HT_positive)
ht_df_pos = pd.DataFrame({
    'Hashtag': list(ht_count_pos),
    'Count': list(ht_count_pos.values()),
})
# keep the 10 most frequent hashtags
ht_df_pos = ht_df_pos.nlargest(10, "Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=ht_df_pos)
ax.set_ylabel('Count')
plt.show()

In [None]:
# Same chart for the negative tweets: count every hashtag and plot the ten most frequent ones.
ht_count_neg = nltk.FreqDist(HT_negative)
ht_df_neg = pd.DataFrame({
    'Hashtag': list(ht_count_neg),
    'Count': list(ht_count_neg.values()),
})
# keep the 10 most frequent hashtags
ht_df_neg = ht_df_neg.nlargest(10, "Count")
plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=ht_df_neg)
ax.set_ylabel('Count')
plt.show()

In [None]:
# CountVectorizer builds a bag-of-words representation: one column per vocabulary word, one
# row per tweet, each cell holding how often the word occurs in the tweet. With
# max_features=1000 the vocabulary is limited to the 1000 most frequent words of the corpus;
# English stop words are excluded.
# NOTE(review): the vocabulary is fitted on the training and test tweets together - confirm
# that this is intended.

bow_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
# `fit` returns the fitted vectorizer itself, so `bow_fit` is the same object as `bow_vectorizer`
bow_fit = bow_vectorizer.fit(comp_data['cleaned_tweets'])
# dense bag-of-words feature matrix
bow = bow_fit.transform(comp_data['cleaned_tweets']).toarray()
bow

In [None]:
# The first `train_data_len` rows of the combined matrix belong to the training set,
# the remaining rows to the test set.
train_bow = bow[:train_data_len, :]
test_bow = bow[train_data_len:, :]

# Split the training data into a training part (70%) and a validation part (30%).
# A fixed `random_state` makes the split - and every metric computed from it - reproducible
# across kernel restarts.
# NOTE(review): consider `stratify=y_train` if the classes turn out to be imbalanced.
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
    train_bow, y_train, test_size=0.3, random_state=42
)

# Logistic Regression Implementation

In [None]:
lreg = LogisticRegression()
lreg.fit(xtrain_bow, ytrain)  # training the model

prediction = lreg.predict_proba(xvalid_bow)  # class probabilities for the validation set
prediction_int = prediction[:, 1] >= 0.3  # class 1 if its probability is at least 0.3, else class 0
# `np.int` was removed in NumPy 1.24 (AttributeError); the builtin `int` is the drop-in replacement.
prediction_int = prediction_int.astype(int)

f1_score(yvalid, prediction_int)  # F1 score on the validation set

# ANN Implementation

In [None]:
# Feed-forward network: a hidden layer of 10 sigmoid units fed by the bag-of-words vector,
# followed by a single sigmoid output unit for the binary label.
input_dim = xtrain_bow.shape[1]  # number of bag-of-words features per tweet

model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy reported as metric;
# then print the layer/parameter overview.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 5 epochs in mini-batches of 10 samples; the validation split is evaluated after
# every epoch and the per-epoch metrics are kept in `history` for plotting.
history = model.fit(
    xtrain_bow,
    ytrain,
    batch_size=10,
    epochs=5,
    validation_data=(xvalid_bow, yvalid),
)

In [None]:
# Evaluate the trained network on the training split and on the validation split.
# With the single 'accuracy' metric configured at compile time, `evaluate` returns the loss
# followed by the accuracy.
loss, accuracy = model.evaluate(xtrain_bow, ytrain)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(xvalid_bow, yvalid)
print("Validation Accuracy: {:.4f}".format(accuracy))

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss, one panel each.

    Args:
        history: Object whose ``history`` attribute is a mapping holding the
            per-epoch series 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (in this notebook: the value returned by ``model.fit``).
    """
    record = history.history
    # Read all four series up front so that a missing key fails before anything is drawn.
    train_acc, valid_acc = record['accuracy'], record['val_accuracy']
    train_loss, valid_loss = record['loss'], record['val_loss']
    epochs = range(1, len(train_acc) + 1)

    panels = [
        (train_acc, 'Training acc', valid_acc, 'Validation acc', 'Training and validation accuracy'),
        (train_loss, 'Training loss', valid_loss, 'Validation loss', 'Training and validation loss'),
    ]

    plt.figure(figsize=(12, 5))
    for position, (train_series, train_label, valid_series, valid_label, title) in enumerate(panels, start=1):
        # left panel: accuracy, right panel: loss; training in blue, validation in red
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, valid_series, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [None]:
# Draw the accuracy and loss curves recorded during training.
plot_history(history)

In [None]:
# Score the test-set feature matrix with the trained network (its output layer is a single sigmoid unit).
result = model.predict(test_bow)

In [None]:
# Turn the network's sigmoid outputs into hard class labels.
# The network was trained with binary cross-entropy on `label`, so each output is the predicted
# probability of class 1: scores at or above the cut-off map to 1, all others to 0. (The
# earlier loop had this mapping inverted - a score above its cut-off produced 0.)
# The 0.3 cut-off matches the one applied to the logistic-regression probabilities; the
# previous value of 0.03 looks like a typo - NOTE(review): confirm the intended threshold.
final_result = (np.asarray(result).ravel() >= 0.3).astype(int).tolist()

In [None]:
# Attach the predicted labels to the test set as its 'label' column.
test_data['label'] = final_result

In [None]:
# Display the test set together with the predicted labels.
test_data