# 1. Import libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
import nltk
import re
import string
import seaborn as sns
import pandas as pd
import numpy as np
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# This notebook relies on pandas, numpy, seaborn and matplotlib (imported above).
# Apply seaborn's default theme to all plots.
sns.set()


warnings.filterwarnings("ignore")

# Graphics in retina format are more sharp and legible
%config InlineBackend.figure_format = 'retina'

# 2. Reading the datasets

In [None]:
train_data = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test_data = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
train_data.head()

In [None]:
train_data.describe()

# 3. EDA
### Missing Values treatment in the dataset

In [None]:
train_data.isnull().sum()

One row is missing both `text` and `selected_text`, so we need to either fill it in or drop it.

In [None]:
# Dropping missing values
train_data.dropna(inplace=True)

### Distribution of the Sentiment Column

In [None]:
train_data['sentiment'].value_counts()

In [None]:
train_data['sentiment'].value_counts(normalize=True)

In [None]:
sns.countplot(data=train_data, x='sentiment',
              order=train_data['sentiment'].value_counts().index)

Examples of each sentiment

In [None]:
# Show the first tweet of each sentiment class (positive, negative, neutral)
for sentiment_name, heading in (('positive', 'Positive Tweet example:'),
                                ('negative', 'Negative Tweet example:'),
                                ('neutral', 'Neutral Tweet example:')):
    matching_tweets = train_data[train_data['sentiment'] == sentiment_name]['text']
    print(heading, matching_tweets.values[0])

# 4. Text Data Preprocessing
We need to pre-process the data to get it all into a consistent format. We need to clean, tokenize and convert our data into a matrix. Let's create a function that performs the following tasks on the text columns:

- Tokenize
- Make text lowercase
- Remove hyperlinks
- Remove punctuation
- Remove numbers
- Remove useless words ("stopwords")
- Stem/lemmatize

In [None]:
# English stop-word list from NLTK; preprocess_data() drops every word found in it.
stop_words = stopwords.words('english')
# English Snowball stemmer; preprocess_data() applies it to each remaining word.
stemmer = nltk.SnowballStemmer("english")


def clean_text(text):
    '''
        Normalise a raw tweet into lowercase text without markup or punctuation.

        The input is coerced with str(), so non-string values are accepted.
        Steps, in order: lowercase; remove text in square brackets; remove
        whole URLs (http://, https:// or www. up to the next whitespace);
        remove <...> tags; remove ASCII punctuation; remove newlines (without
        inserting a space); remove every word that contains a digit.

        Whitespace is not collapsed, so removed spans can leave double spaces.
        Returns the cleaned string.
    '''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # `\S+` (not a single `\S`) so the entire URL is removed; matching only one
    # character left the rest of the link behind as a junk token.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation),
                  '', text)  # remove punctuation
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def preprocess_data(text):
    """
       Run clean_text() on the input, discard stop-words, stem every
       remaining word and return the stems joined by single spaces.
    """
    # Words that survive cleaning and are not in the stop-word list
    kept_words = [token for token in clean_text(text).split()
                  if token not in stop_words]
    # Stem each kept word and glue the sentence back together
    return ' '.join(map(stemmer.stem, kept_words))

In [None]:
train_data['clean_text'] = train_data['text'].apply(preprocess_data)
train_data.head()

In [None]:
# Encode the sentiment string as an integer class label:
# negative -> 0, positive -> 1, neutral -> 2.
# Series.map leaves NaN for any value that is not a key of the dict.
train_data['label'] = train_data.sentiment.map({'negative': 0,
                                                'positive': 1,
                                                'neutral': 2})
train_data.head()

# 5. Analyzing Text Statistics

We can now do some statistical analysis to explore the data like:

- Text length analysis
- Length of the whole sentence (in characters)
- Count of words in each sentence
- Word frequency analysis

### Text length analysis

In [None]:
# Length statistics are computed on the raw 'text' column, not on 'clean_text'.
train_data['text_n_chars'] = train_data.text.apply(
    len)  # number of characters in each tweet
train_data['text_n_words'] = train_data.text.apply(
    lambda sent: len(sent.split()))  # number of whitespace-separated words in each tweet
train_data.head()

In [None]:
# The distribution of number of words for each sentiment
sns.histplot(data=train_data, x='text_n_words',
             hue='sentiment', multiple='stack')

### Most frequent words in the whole text

In [None]:
words = [word for sent in train_data['clean_text'] for word in sent.split()]
words[:10]  # words without sorting

In [None]:
# Count every cleaned word, then rank the words from most to least frequent
freq_words = Counter(words)
freq_words_sorted = freq_words.most_common()
# Keep the 20 most frequent words as a two-column table
top_twenty = freq_words_sorted[:20]
freq_words_df = pd.DataFrame(top_twenty, columns=['word', 'counts'])
freq_words_df.head(10)

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(data=freq_words_df, x='counts', y='word')
plt.title('Top 20 words in whole text')
plt.show()

### Frequent words for each sentiment

In [None]:
def freq_sentiment_words(text, sentiment, num):
    '''
        Return the `num` most frequent words among the rows of one sentiment.

        text: DataFrame with 'sentiment' and 'clean_text' columns.
        sentiment: value of the 'sentiment' column to keep.
        num: how many of the top words to return.
        Returns a DataFrame with 'word' and 'counts' columns, most frequent first.
    '''
    selected_rows = text[text['sentiment'] == sentiment]
    tally = Counter()
    for sentence in selected_rows['clean_text']:
        tally.update(sentence.split())
    # most_common() with no argument ranks all words by descending count
    ranked = tally.most_common()
    return pd.DataFrame(ranked[:num], columns=['word', 'counts'])


def plot_freq(data, st):
    '''
        Draw a horizontal bar chart of term counts for one sentiment.

        data: DataFrame with 'word' and 'counts' columns, one row per bar.
        st: name of the sentiment, used only in the plot title.
    '''
    plt.figure(figsize=(12, 6))
    sns.barplot(data=data, x='counts', y='word')
    # Report the number of rows actually plotted instead of a hard-coded 20,
    # so the title stays correct when a different top-N is passed in.
    plt.title(f'Top {len(data)} words in {st} sentiment')
    plt.show()

In [None]:
# In Positive Sentiment
positive_words = freq_sentiment_words(train_data, 'positive', 20)
print(positive_words.head())
plot_freq(positive_words, 'positive')

In [None]:
# In Negative Sentiment
negative_words = freq_sentiment_words(train_data, 'negative', 20)
print(negative_words.head())
plot_freq(negative_words, 'negative')

In [None]:
# In Neutral Sentiment
neutral_words = freq_sentiment_words(train_data, 'neutral', 20)
print(neutral_words.head())
plot_freq(neutral_words, 'neutral')

### Distribution of top n-grams

In [None]:
def get_top_n_gram(corpus, sentiment,  n_gram, top_n=None):
    """
        Count the n-grams of one sentiment and return the most frequent ones.

        corpus: DataFrame with 'sentiment' and 'clean_text' columns.
        sentiment: value of the 'sentiment' column to keep.
        n_gram: number of consecutive words per n-gram (2 for bi-grams).
        top_n: how many n-grams to return; None returns all of them.
        Returns a DataFrame with 'word' (the n-gram joined by spaces) and
        'counts' columns, most frequent first.
    """
    sentences = corpus[corpus['sentiment'] == sentiment]['clean_text']

    gram_counts = Counter()
    for sentence in sentences:
        tokens = sentence.split()
        # Slide the window inside each tweet separately; building n-grams over
        # one flattened word list would join the last word of a tweet with the
        # first word of the next one and count n-grams nobody wrote.
        windows = zip(*(tokens[start:] for start in range(n_gram)))
        gram_counts.update(' '.join(window) for window in windows)

    # most_common() with no argument ranks all n-grams by descending count
    ranked = gram_counts.most_common()
    return pd.DataFrame(ranked[:top_n], columns=['word', 'counts'])

In [None]:
# Bi-Gram for positive sentiment
positive_gram = get_top_n_gram(train_data, 'positive', 2, 20)
print(positive_gram.head())
plot_freq(positive_gram, 'positive')

In [None]:
# Bi-Gram for negative sentiment
negative_gram = get_top_n_gram(train_data, 'negative', 2, 20)
print(negative_gram.head())
plot_freq(negative_gram, 'negative')

In [None]:
# Bi-Gram for neutral sentiment
netutral_gram = get_top_n_gram(train_data, 'neutral', 2, 20)
print(netutral_gram.head())
plot_freq(netutral_gram, 'neutral')

We can just as easily build tri-grams for each sentiment by calling `get_top_n_gram` with `n_gram=3`.

### Word Cloud

In [None]:
# getting list of positive words
positive_text_clean = train_data[train_data['sentiment']
                                 == 'positive']['clean_text']
positive_clean_words = [
    word for words in positive_text_clean for word in words.split()]
positive_clean_words[:10]

In [None]:
# getting list of negative words
negative_text_clean = train_data[train_data['sentiment']
                                 == 'negative']['clean_text']
negative_clean_words = [
    word for words in negative_text_clean for word in words.split()]
negative_clean_words[:10]

In [None]:
# getting list of neutral words
neutral_text_clean = train_data[train_data['sentiment']
                                == 'neutral']['clean_text']
neutral_clean_words = [
    word for words in neutral_text_clean for word in words.split()]
neutral_clean_words[:10]

In [None]:
# One figure with three side-by-side panels, one word cloud per sentiment.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=[30, 15])
# Left panel: word cloud built from the words of the positive tweets.
wordcloud1 = WordCloud(background_color='white',
                       width=600,
                       height=400).generate(" ".join(positive_clean_words))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Positive text', fontsize=40)

# Middle panel: word cloud built from the words of the negative tweets.
wordcloud2 = WordCloud(background_color='white',
                       width=600,
                       height=400).generate(" ".join(negative_clean_words))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Negative text', fontsize=40)

# Right panel: word cloud built from the words of the neutral tweets.
wordcloud3 = WordCloud(background_color='white',
                       width=600,
                       height=400).generate(" ".join(neutral_clean_words))
ax3.imshow(wordcloud3)
ax3.axis('off')
ax3.set_title('Neutral text', fontsize=40)

# 6. Naive model

In [None]:
def jaccard(str1, str2):
    """
        Jaccard similarity between the word sets of two sentences.

        Both strings are lowercased and split on whitespace; the score is
        |intersection| / |union| of the resulting word sets, a float in [0, 1].
        When neither string contains a word the two sets are identical, so
        1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both sentences are empty: avoid ZeroDivisionError.
        return 1.0
    return len(a & b) / union_size

In [None]:
train_data = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv").fillna('')
# Naive baseline: predict the whole tweet as the selected text.
train_data['naive'] = train_data['text']
# Score each row by column name rather than by position. Rows where both strings
# contain no words (the row whose missing values were filled with '') are skipped
# instead of hard-coding their index, because jaccard() would divide by zero there.
naive_scores = [
    jaccard(selected, predicted)
    for selected, predicted in zip(train_data['selected_text'], train_data['naive'])
    if selected.split() or predicted.split()
]
js = sum(naive_scores)
# Average over the rows that were actually scored.
print("final jaccard score for naive predictions:", js / len(naive_scores))

In [None]:
train_data['text'] = train_data['text'].apply(preprocess_data)
# Refresh the naive prediction from the preprocessed text. The 'naive' column is a
# separate column, so reassigning 'text' alone leaves it unchanged and the score
# below would simply repeat the score of the raw text.
train_data['naive'] = train_data['text']
# Rows where both strings contain no words are skipped instead of hard-coding
# their index, because jaccard() would divide by zero there.
preprocessed_scores = [
    jaccard(selected, predicted)
    for selected, predicted in zip(train_data['selected_text'], train_data['naive'])
    if selected.split() or predicted.split()
]
js = sum(preprocessed_scores)
# Average over the rows that were actually scored.
print("final jaccard score for naive predictions:", js / len(preprocessed_scores))

# Conclusion

As the naive model shows, deeper preprocessing did not improve the final score.
We will use the naive model's result as a baseline for future research.