In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Import the libraries used throughout this notebook: pandas, numpy, seaborn, matplotlib, NLTK and scikit-learn.

import numpy as np
import pandas as pd
import seaborn as sns
import string
import re
sns.set()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split




import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")

# Figures rendered in retina format are sharper and more legible
%config InlineBackend.figure_format = 'retina'

# 1- Reading the datasets

In [None]:
# Load the train and test CSV files of the tweet-sentiment-extraction dataset
# from the Kaggle input folder.
train_data = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")
test_data = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
# Preview the first rows of the training frame.
train_data.head()

In [None]:
train_data.describe()

# 2- EDA

### `Missing Values treatment in the dataset`

In [None]:
train_data.isnull().sum()

We found one row with a missing `text` and `selected_text`, so we need to either fill it in or drop it.

In [None]:
# Drop every row that has at least one missing value; train_data is modified in place.
train_data.dropna(inplace= True)

### `Distribution of the Sentiment Column`

In [None]:
train_data['sentiment'].value_counts()

In [None]:
train_data['sentiment'].value_counts(normalize= True)

In [None]:
# Bar chart of the number of tweets per sentiment, ordered as value_counts()
# returns them (most frequent class first).
sentiment_order = train_data['sentiment'].value_counts().index
sns.countplot(x='sentiment', data=train_data, order=sentiment_order);

### `Examples of each sentiment`

In [None]:
# Print the first tweet of each sentiment class as an illustrative example.
example_labels = [
    ('Positive Tweet example:', 'positive'),
    ('negative Tweet example:', 'negative'),
    ('Neutral Tweet example:', 'neutral'),
]
for example_label, sentiment_name in example_labels:
    is_sentiment = train_data['sentiment'] == sentiment_name
    print(example_label, train_data[is_sentiment]['text'].values[0])

# 3- Text Data Preprocessing
We need to pre-process the data to bring it into a consistent format: the text has to be cleaned, tokenized and, eventually, converted into a matrix. Let's create a function that performs the following tasks on the text columns:

* Tokenize the text
* Convert the text to lowercase
* Remove hyperlinks
* Remove punctuation
* Remove numbers
* Remove uninformative words ("stopwords")
* Stem each word (lemmatization is an alternative that is not used here)



In [None]:
# English stop-word list from the NLTK corpus.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available in the
# environment — confirm it is pre-installed or add a download step.
stop_words = stopwords.words('english')
# Snowball stemmer configured for English; used to reduce words to their stems.
stemmer    = nltk.SnowballStemmer("english")

In [None]:
def clean_text(text):
    '''
        Normalise a raw tweet: lowercase it and strip out the noise.

        Steps, in order: lowercase, remove text in square brackets, remove
        links, remove HTML-like tags, remove punctuation, turn newlines into
        spaces and remove words containing numbers.

        text:    the raw text; non-string values are converted with str().
        returns: the cleaned string. Whitespace is not collapsed, so removed
                 tokens can leave consecutive spaces behind (harmless for
                 callers that use str.split()).
    '''
    # Every pattern is a raw string so that regex escapes such as \S or \d
    # are not interpreted as (invalid) Python string escapes.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                                # text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)                  # remove urls
    text = re.sub(r'<.*?>+', '', text)                                 # HTML-like tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)    # remove punctuation
    # Replace newlines with a space rather than nothing, otherwise the last
    # word of one line is glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)                               # words containing digits
    return text

In [None]:
# Stop words in both their original and their cleaned form; built lazily on the
# first call so that defining this cell does not touch stop_words/clean_text.
_cleaned_stop_words = None


def preprocess_data(text):
    '''
        Clean a tweet, drop its stop words and stem the remaining words.

        text:    the raw tweet (anything accepted by clean_text).
        returns: a single string of stemmed tokens separated by one space.
    '''
    global _cleaned_stop_words
    if _cleaned_stop_words is None:
        # clean_text strips apostrophes, so "don't" reaches the filter as
        # "dont" and would never match the raw stop-word list. Comparing
        # against the cleaned forms too removes those contractions, and the
        # frozenset makes each membership test O(1) instead of a list scan.
        _cleaned_stop_words = frozenset(stop_words) | {clean_text(word) for word in stop_words}

    tokens = clean_text(text).split()                                           # Clean punctuation, urls, and so on
    kept = (token for token in tokens if token not in _cleaned_stop_words)      # Remove stopwords
    return ' '.join(stemmer.stem(token) for token in kept)                      # Stem all the words in the sentence

In [None]:
# Run the preprocessing pipeline on every tweet and keep the result in a new column.
cleaned_tweets = train_data['text'].apply(preprocess_data)
train_data['clean_text'] = cleaned_tweets
train_data.head()

In [None]:
# Encode each sentiment class as an integer label
sentiment_to_label = {'negative': 0, 'positive': 1, 'neutral': 2}
train_data['label'] = train_data['sentiment'].map(sentiment_to_label)
train_data.head()


# 4- Analyzing Text Statistics
We can now explore the data with some simple statistics:
* Text length analysis:
    * the number of characters in each tweet;
    * the number of words in each tweet.
* Word frequency analysis.



In [None]:
raw_tweets = train_data['text']
# Number of characters in each raw tweet
train_data['text_n_chars'] = raw_tweets.apply(len)
# Number of whitespace-separated words in each raw tweet
train_data['text_n_words'] = raw_tweets.apply(lambda tweet: len(tweet.split()))
train_data.head()

### `The distribution of number of words for each sentiment.`

In [None]:
sns.histplot(data= train_data, x= 'text_n_words', hue= 'sentiment', multiple= 'stack');


### `The distribution of number of characters for each sentiment`

In [None]:
sns.histplot(data= train_data, x= 'text_n_chars', hue= 'sentiment', multiple= 'stack');


### Most frequent words.

#### `In whole Text`

In [None]:
from collections import Counter

In [None]:
# Flatten all cleaned tweets into one list of tokens, in corpus order
words = []
for cleaned_tweet in train_data['clean_text']:
    words.extend(cleaned_tweet.split())
words[:10]  # first tokens, not yet sorted by frequency

In [None]:
# Count every token, rank the tokens by descending count and keep the top 20
freq_words = Counter(words)
freq_words_sorted = freq_words.most_common()
top_pairs = freq_words_sorted[:20]
freq_words_df = pd.DataFrame(top_pairs, columns=['word', 'counts'])

In [None]:
freq_words_df.head(10)

In [None]:
# Bar chart of the most frequent words across all cleaned tweets
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='counts', y='word', data=freq_words_df, ax=ax)
ax.set_title('Top 20 words in whole text')
plt.show()

### Frequent words for each sentiment

In [None]:
def freq_sentiment_words(text, sentiment, num):
    '''
        Rank the words used in the cleaned tweets of one sentiment.

        text:      DataFrame with a 'sentiment' and a 'clean_text' column.
        sentiment: value of the 'sentiment' column to keep.
        num:       number of top words to return (slice semantics, None = all).
        returns:   DataFrame with columns 'word' and 'counts', most frequent first.
    '''
    tweets = text.loc[text['sentiment'] == sentiment, 'clean_text']

    tally = Counter()
    for tweet in tweets:
        tally.update(tweet.split())

    # most_common() with no argument sorts by descending count and keeps
    # first-seen order among ties.
    ranked = tally.most_common()
    return pd.DataFrame(ranked[:num], columns=['word', 'counts'])

#### `In Positive Sentiment`

In [None]:
positive_words = freq_sentiment_words(train_data, 'positive', 20)
positive_words.head()

In [None]:
def plot_freq(data, st):
    '''
        Draw a bar chart of term frequencies for one sentiment.

        data: DataFrame with a 'word' column (one bar per row) and a
              'counts' column (bar length).
        st:   name of the sentiment; used only in the chart title.
    '''
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(data=data, x='counts', y='word', ax=ax)
    # Report the real number of rows plotted instead of a hard-coded "20",
    # so the title stays correct when a different number of terms is passed.
    ax.set_title(f'Top {len(data)} words in {st} sentiment')
    plt.show()

In [None]:
plot_freq(positive_words, 'positive')

#### `In Negative Sentiment`

In [None]:
negative_words = freq_sentiment_words(train_data, 'negative', 20)
negative_words.head()

In [None]:
plot_freq(negative_words, 'negative')

#### `In Neutral Sentiment`

In [None]:
neutral_words = freq_sentiment_words(train_data, 'neutral', 20)
neutral_words.head()

In [None]:
plot_freq(neutral_words, 'neutral')

### Distribution of top n-grams

In [None]:
def get_top_n_gram(corpus, sentiment,  n_gram, top_n=None):
    '''
        Rank the most frequent n-grams in the cleaned tweets of one sentiment.

        corpus:    DataFrame with a 'sentiment' and a 'clean_text' column.
        sentiment: value of the 'sentiment' column to keep.
        n_gram:    number of consecutive words per n-gram (2 = bi-gram, ...).
        top_n:     number of n-grams to return (slice semantics, None = all).
        returns:   DataFrame with columns 'word' (the n-gram, words joined by a
                   space) and 'counts', most frequent first.
    '''
    tweets = corpus.loc[corpus['sentiment'] == sentiment, 'clean_text']

    gram_counts = Counter()
    for tweet in tweets:
        tokens = tweet.split()
        # Slide a window of n_gram tokens over this tweet only. Building the
        # n-grams tweet by tweet keeps the last word of one tweet from being
        # paired with the first word of the next, which would count phrases
        # nobody wrote.
        windows = zip(*(tokens[start:] for start in range(n_gram)))
        gram_counts.update(' '.join(window) for window in windows)

    # Descending count; ties keep first-seen order.
    ranked = gram_counts.most_common()
    return pd.DataFrame(ranked[:top_n], columns=['word', 'counts'])

#### `Bi-Gram for positive sentiment`

In [None]:
positive_gram = get_top_n_gram(train_data, 'positive', 2, 20)
positive_gram.head()

In [None]:
plot_freq(positive_gram, 'positive')

#### `Bi-Gram for negative sentiment`

In [None]:
negative_gram = get_top_n_gram(train_data, 'negative', 2, 20)
negative_gram.head()

In [None]:
plot_freq(negative_gram, 'negative')

#### `Bi-Gram for neutral sentiment`

In [None]:
netutral_gram = get_top_n_gram(train_data, 'neutral', 2, 20)
netutral_gram.head()

In [None]:
plot_freq(netutral_gram, 'neutral')

**The same function, `get_top_n_gram`, also produces tri-grams for any sentiment: just pass `n_gram=3`.**

### Word Cloud

In [None]:
# Collect every token from the cleaned positive tweets
positive_text_clean = train_data.loc[train_data['sentiment'] == 'positive', 'clean_text']
positive_clean_words = []
for cleaned_tweet in positive_text_clean:
    positive_clean_words.extend(cleaned_tweet.split())
positive_clean_words[:10]

In [None]:
# Collect every token from the cleaned negative tweets
negative_text_clean = train_data.loc[train_data['sentiment'] == 'negative', 'clean_text']
negative_clean_words = []
for cleaned_tweet in negative_text_clean:
    negative_clean_words.extend(cleaned_tweet.split())
negative_clean_words[:10]

In [None]:
# Collect every token from the cleaned neutral tweets
neutral_text_clean = train_data.loc[train_data['sentiment'] == 'neutral', 'clean_text']
neutral_clean_words = []
for cleaned_tweet in neutral_text_clean:
    neutral_clean_words.extend(cleaned_tweet.split())
neutral_clean_words[:10]

In [None]:
from wordcloud import WordCloud

# One word cloud per sentiment, drawn side by side in a single figure.
cloud_panels = [
    ('Positive text', positive_clean_words),
    ('Negative text', negative_clean_words),
    ('Neutral text', neutral_clean_words),
]
fig, axes = plt.subplots(1, 3, figsize=[30, 15])
for ax, (panel_title, tokens) in zip(axes, cloud_panels):
    cloud = WordCloud(background_color='white',
                      width=600,
                      height=400).generate(" ".join(tokens))
    ax.imshow(cloud)
    ax.axis('off')  # hide the axes around the image
    ax.set_title(panel_title, fontsize=40)

Thanks for your time ^_^.