In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Understanding the Problem Statement
* Natural Language Processing (NLP) works by converting words (texts) into numbers.
* These numbers are then used to train an AI/ML model to make predictions.
* In this case, we will analyze thousands of Twitter tweets to predict people's sentiment

## 2. Importing the Libraries and Datasets

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the labelled training tweets into a DataFrame and preview the first rows
train_csv_path = '/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv'
tweets_df = pd.read_csv(train_csv_path)
tweets_df.head()

This dataset is labelled for hateful/negative tweets: "label: 1" indicates a negative tweet and "label: 0" indicates a positive one.

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) to spot missing values
tweets_df.info()

In [None]:
# Descriptive statistics for the frame's numeric columns
tweets_df.describe()

In [None]:
# Look at the raw text in the 'tweet' column
tweets_df['tweet']

Since we are analyzing the tweets and their labels, we don't require the "id" column. Hence, let's drop it.

In [None]:
# Drop the 'id' column by rebinding instead of mutating in place.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been removed no longer raises a KeyError.
tweets_df = tweets_df.drop(columns='id', errors='ignore')
tweets_df.head()

## 3. Exploring the Dataset

In [None]:
# Histogram (30 bins) of each numeric column in the frame
tweets_df.hist(bins=30, figsize=(15,5), color='b')

Notice that the labels take only the discrete values 0 and 1, as discussed earlier; hence this is a binary classification problem.

In [None]:
# Bar chart of the number of tweets per label.
# The column is passed through the `x=` keyword: current seaborn releases
# treat the first positional argument as `data`, so a positional Series is
# not interpreted as the variable to count.
sns.countplot(x=tweets_df['label'], label='count')

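Notice that this is a class-imbalance situation: one label has far more samples than the other. In the plot above the positive tweets (label: 0) should heavily outnumber the negative ones (label: 1), so plain accuracy alone will be a misleading metric.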

In [None]:
# Store the character count of each tweet in a separate 'length' column
tweet_lengths = tweets_df['tweet'].map(len)
tweets_df['length'] = tweet_lengths
tweets_df.head()

In [None]:
# Distribution of tweet lengths (character counts), using 50 bins
tweets_df['length'].plot.hist(bins=50)

It seems that the majority of the tweets are between 70 and 100 characters long.

Let's separate the positive and negative tweets into two different DataFrames.

In [None]:
# Rows whose label is 0 (positive tweets)
is_positive = tweets_df['label'] == 0
positive = tweets_df[is_positive]
positive

In [None]:
# Rows whose label is 1 (negative tweets)
is_negative = tweets_df['label'] == 1
negative = tweets_df[is_negative]
negative

## 4. Plot the WordCloud

In [None]:
# Collect every tweet into a plain Python list and preview the first ten
tweet_column = tweets_df['tweet']
sentences = tweet_column.tolist()
sentences[:10]

In [None]:
# Number of tweets collected in the list
len(sentences)

`sentences` is still a list of separate strings; let's join them so that they form a single corpus.

In [None]:
# Concatenate all tweets into one space-separated string and preview its start
separator = ' '
single_sentence = separator.join(sentences)
single_sentence[:500]

In [None]:
!pip install WordCloud
from wordcloud import WordCloud

In [None]:
# Render a word cloud built from the text of all tweets
all_tweets_cloud = WordCloud().generate(single_sentence)
plt.figure(figsize=(20,20))
plt.imshow(all_tweets_cloud)

**TODO:** add WordCloud details.

In [None]:
# Render a word cloud from the positive tweets only
positive_sentences = positive['tweet'].to_list()
single_positive = ' '.join(positive_sentences)
positive_cloud = WordCloud().generate(single_positive)
plt.figure(figsize=(20,20))
plt.imshow(positive_cloud)

In [None]:
# Render a word cloud from the negative tweets only
negative_sentences = negative['tweet'].to_list()
single_negative = ' '.join(negative_sentences)
negative_cloud = WordCloud().generate(single_negative)
plt.figure(figsize=(20,20))
plt.imshow(negative_cloud)

## 5. Data Cleaning: Remove Punctuation

In [None]:
import string
# Show the set of characters the standard library treats as punctuation
string.punctuation

In [None]:
# Sample sentence used to try out punctuation removal in the next cell
Test = 'Good morning beautiful people :)... I am having fun learning Machine learning and AI!!'

In [None]:
# Keep only the characters of the sample sentence that are not punctuation
test_punc_remove = ''.join(c for c in Test if c not in string.punctuation)
test_punc_remove

Great, we have removed all the punctuation now.

## 6. Data Cleaning: Remove Stopwords

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus used by the following cells
nltk.download('stopwords')

In [None]:
# Import the stopwords corpus and list the English stopwords
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# Build the stopword set once up front: calling stopwords.words('english')
# inside the comprehension would rebuild the whole list for every word, and
# set membership is faster than scanning a list.
english_stopwords = set(stopwords.words('english'))
test_punc_clean = [word for word in test_punc_remove.split() if word.lower() not in english_stopwords]
test_punc_clean

## Create a function for Step 5 and Step 6

In [None]:
def message_cleaning(message, stop_words=None):
    """Strip punctuation and stopwords from a piece of text.

    Parameters
    ----------
    message : str
        Raw text, e.g. a single tweet.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the NLTK English stopword
        list is used; it is loaded once and cached on the function, instead
        of being rebuilt for every word of every message.

    Returns
    -------
    list of str
        The whitespace-separated words of ``message`` (original casing kept)
        after removing every ``string.punctuation`` character and dropping
        each word whose lower-cased form is a stopword.
    """
    if stop_words is None:
        # Lazily load the default list on first use and reuse it afterwards.
        stop_words = getattr(message_cleaning, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            message_cleaning._english_stop_words = stop_words

    punc_removed_join = ''.join(char for char in message if char not in string.punctuation)
    return [word for word in punc_removed_join.split() if word.lower() not in stop_words]

In [None]:
# Run message_cleaning on every tweet; each entry becomes a list of kept words
tweets_df_clean = tweets_df['tweet'].apply(message_cleaning)

In [None]:
# Compare one tweet before and after cleaning
print(tweets_df_clean[5]) # cleaned-up version
print(tweets_df['tweet'][5]) # original version

## 7. Count Vectorization or Tokenization
In order to use textual data for predictive modeling, the text must first be split into individual words (tokens) – this process is called tokenization. These words then need to be encoded as integers or floating-point values for use as inputs to machine learning algorithms. This process is called feature extraction (or vectorization).

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# message_cleaning is passed as the analyzer, so it decides which tokens each
# tweet contributes; counts are stored as uint8.
vectorizer = CountVectorizer(analyzer=message_cleaning, dtype='uint8')
# Fit the vocabulary on all tweets and convert the result to a dense array.
# NOTE(review): a dense tweets x vocabulary matrix may be memory-heavy — confirm it fits.
tweets_countvectorizer = vectorizer.fit_transform(tweets_df['tweet']).toarray()

In [None]:
# Preview the first ten vocabulary entries learned by the vectorizer.
# get_feature_names() was deprecated and then removed from scikit-learn;
# get_feature_names_out() is its replacement.
vectorizer.get_feature_names_out()[:10]

In [None]:
# Display the count matrix produced by the vectorizer
tweets_countvectorizer

In [None]:
# Dimensions of the count matrix
tweets_countvectorizer.shape

## Now `tweets_countvectorizer` holds the features for our model

## 8. Naive Bayes
Naive Bayes is a classification technique based on Bayes' Theorem. Bayes’ theorem is based on conditional probability, which gives the likelihood of event “A” occurring given that another event “B” has already happened.
There are 3 types of Naïve Bayes:
* Gaussian ->The model assume that the data follows normal distribution and all our features are continuous.
* Bernoulli -> It assumes that all our features are binary such that they only take two values: 0s and 1s.
* Multinomial -> It assumes that the data has discrete values, such as word counts or ratings between 1 and 5.

More on Naive Bayes can be found in my article here: https://medium.com/analytics-vidhya/na%C3%AFve-bayes-classifiers-fafde4f0a411

In [None]:
# Features (X) for the model: the count matrix built by the vectorizer
X = tweets_countvectorizer
X

In [None]:
# Dimensions of the feature matrix
X.shape

In [None]:
# Labels (y) for the model: the 'label' column of the tweets frame
y = tweets_df['label']
y

In [None]:
# Dimensions of the label vector
y.shape

## 9. Train the Naive Bayes Classifier Model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing.
# random_state pins the shuffle so the reported metrics are reproducible on
# every run; stratify=y keeps the label proportions equal in both splits,
# which matters because the two classes are not equally represented.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create a multinomial Naive Bayes classifier and fit it on the training split
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

## 10. Check our Model Accuracy through the Confusion Matrix
A confusion matrix is a table that is often used to describe the performance of a classification model (or “classifier”) on a set of test data for which the true values are known.

More details can be found in my article here: https://medium.com/analytics-vidhya/clarity-in-confusion-matrix-17fb1da6dabf

In [None]:
np.set_printoptions(precision=3)
from sklearn.metrics import classification_report, confusion_matrix
# Predict on the held-out split and tabulate true vs. predicted labels
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# The matrix cells are integer counts, so annotate them with fmt='d' instead
# of '.2f', which appended two meaningless decimals to every count.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
# Print the text classification report for the test-set predictions
print(classification_report(y_test, y_predict_test))

Special thanks to Ryan Ahmed from Coursera https://www.coursera.org/projects/twitter-sentiment-analysis