In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## In this project, the sentiment of tweets is analyzed. In brief, the goal is to classify tweets as positive (containing words such as love, happy) or negative (containing words such as racist, hate). The tasks performed are:
1. Exploratory Data Analysis
2. Plot WordCloud
3. Data cleaning (Removing Punctuations)
4. Data cleaning (Removing Stopwords)
5. Count vectorization (Tokenization)
6. Create a pipeline to perform Tasks 3, 4, and 5
7. Train Naive Bayes Classifier
8. Analyze Model Performance

In [None]:
# Load the train and test CSV files of the dataset into DataFrames.
tweets_train_df = pd.read_csv(
    '/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv'
)
tweets_test_df = pd.read_csv(
    '/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv'
)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Preview the first rows of the training data.
tweets_train_df.head()

In [None]:
# Show column dtypes and non-null counts for the training data.
tweets_train_df.info()

In [None]:
# Display the raw tweet text column.
tweets_train_df['tweet']

In [None]:
# Drop the 'id' column.
# Re-assigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# a KeyError once 'id' has already been removed.
tweets_train_df = tweets_train_df.drop(columns=['id'], errors='ignore')

In [None]:
# Preview the training data again to confirm the 'id' column is gone.
tweets_train_df.head()

## Exploring Dataset

In [None]:
# Visualise missing values: the boolean null mask of the frame is drawn as a heatmap.
missing_mask = tweets_train_df.isnull()
sns.heatmap(missing_mask, cmap='Blues', cbar=False, yticklabels=False)

In [None]:
# Histogram (30 bins) of each numeric column of the training data.
tweets_train_df.hist(bins= 30, figsize = (12,5), color = 'b')

In [None]:
# Count of tweets per label; the plot shows that the classes are heavily imbalanced.
sns.countplot(data=tweets_train_df, x='label')

In [None]:
# Preview the first rows of the test data.
tweets_test_df.head()

In [None]:
# Store the character count of every tweet in a new 'lengths' column.
tweets_train_df['lengths'] = tweets_train_df['tweet'].map(len)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim. The assignment is
# idempotent, so it is harmless, but the cell can be deleted.
tweets_train_df['lengths'] = tweets_train_df['tweet'].apply(len)

In [None]:
# Preview the training data with the new 'lengths' column.
tweets_train_df.head()

In [None]:
# Distribution of tweet lengths (in characters), using 100 bins.
tweet_lengths = tweets_train_df['lengths']
tweet_lengths.plot.hist(bins=100)

In [None]:
# Summary statistics (count, mean, min, max, quartiles) of the numeric columns.
tweets_train_df.describe()

In [None]:
# Show the shortest tweet.
# The minimum length is computed rather than hard-coded (it was 11 for this
# dataset), so the cell keeps working if the data changes instead of failing
# with an IndexError on an empty selection.
min_length = tweets_train_df['lengths'].min()
tweets_train_df.loc[tweets_train_df['lengths'] == min_length, 'tweet'].iloc[0]

In [None]:
# View the tweets whose length equals the average length.
# The mean is computed and rounded to the nearest integer rather than
# hard-coded (it was 85 for this dataset).
mean_length = round(tweets_train_df['lengths'].mean())
tweets_train_df[tweets_train_df['lengths'] == mean_length]

In [None]:
# Separate the tweets by class; rows with label 0 are treated as the positive tweets.
is_positive = tweets_train_df['label'] == 0
positive = tweets_train_df[is_positive]
positive

In [None]:
# Rows with label 1 are treated as the negative tweets.
is_negative = tweets_train_df['label'] == 1
negative = tweets_train_df[is_negative]
negative

In [None]:
# Plot the word cloud
from wordcloud import WordCloud

In [None]:
# Collect every training tweet into a plain Python list of strings.
sentences = list(tweets_train_df['tweet'])

In [None]:
# Number of tweets collected in the list.
len(sentences)

In [None]:
# Concatenate all tweets into one space-separated string for the word cloud.
separator = " "
sentences_as_single_string = separator.join(sentences)

In [None]:
# Build a word cloud from all tweets and draw it on a large figure.
all_tweets_cloud = WordCloud().generate(sentences_as_single_string)
plt.figure(figsize=(20, 20))
plt.imshow(all_tweets_cloud)

The data is heavily imbalanced, with almost 30K positive and about 2.5K negative tweets. Hence, positive words dominate the word cloud built from this combined string.

In [None]:
# Prepare the text for a word cloud of the negative tweets only:
# collect them into a list, then merge them into one space-separated string.
negative_sentences = list(negative['tweet'])
word_separator = " "
negative_string = word_separator.join(negative_sentences)

In [None]:
# Build a word cloud from the negative tweets and draw it on a large figure.
negative_cloud = WordCloud().generate(negative_string)
plt.figure(figsize=(20, 20))
plt.imshow(negative_cloud)

## Data cleaning (Remove punctuations from text)

In [None]:
# string.punctuation is the set of ASCII punctuation characters that the
# cells below strip from the text.
import string
string.punctuation

In [None]:
# Example sentence containing punctuation, used by the removal demos below.
sample = 'Hi! everyone :) ; enjoy learning real world example of NLP !.....'

In [None]:
# First method: keep only the characters that are not punctuation.
# The result is a list of single characters.
sample_punc_removed = list(filter(lambda ch: ch not in string.punctuation, sample))

In [None]:
# Display the remaining characters (still a list, not yet a string).
sample_punc_removed

In [None]:
# Join the remaining characters back into a single string.
test_punc_removed_string = ''.join(sample_punc_removed)
test_punc_removed_string

In [None]:
# Second, more efficient method: build a translation table that deletes every
# punctuation character and apply it in a single pass.
punctuation_table = str.maketrans('', '', string.punctuation)
out = sample.translate(punctuation_table)
out

In [None]:
# Third, most basic method: an explicit loop that skips punctuation characters.
punc_removed = []
for char in sample:
    if char in string.punctuation:
        continue
    punc_removed.append(char)

punc_removed_join = ''.join(punc_removed)
punc_removed_join

## Data cleaning (Remove StopWords from text)

In [None]:
# What are stopwords? Download the stopword corpus with the Natural Language
# Toolkit (NLTK) so it can be inspected in the next cell.
import nltk # Natural Language Toolkit
nltk.download('stopwords')

In [None]:
# Import the stopword corpus and list its English entries. These are common
# words that do not convey any specific information.
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# Remove the common (stop) words and keep the remaining words. Words are not
# de-duplicated; only stopwords are dropped, compared case-insensitively.
# The stopword list is loaded once into a set instead of being re-read for
# every word of the sentence.
english_stopwords = set(stopwords.words('english'))
test_punc_removed_string_clean = [
    word
    for word in test_punc_removed_string.split()
    if word.lower() not in english_stopwords
]

In [None]:
# Display the words that remain after stopword removal.
test_punc_removed_string_clean

In [None]:
# Sample sentence used to demonstrate removing punctuation and stopwords
# together, as one pipeline.
test_sample = 'A sample to learn,; that how can we remove punctuations and stopwords in a pipeline fashion!!!'


In [None]:
# Pipeline demo: strip punctuation, then drop stopwords (case-insensitively).
# Each stage gets its own name instead of rebinding one variable to three
# different types, and the stopword list is loaded once into a set instead of
# once per word.
pipe_stop_words = set(stopwords.words('english'))
pipe_punc_removed = ''.join(
    char for char in test_sample if char not in string.punctuation
)
pipe_punc_removed_cleaned = [
    word for word in pipe_punc_removed.split() if word.lower() not in pipe_stop_words
]
pipe_punc_removed_cleaned

## Count Vectorization (Tokenization)

In [None]:
# CountVectorizer takes the unique words of a corpus as features and counts
# how many times each word occurs in each sentence.
from sklearn.feature_extraction.text import CountVectorizer
sample_new = ['This is first method.', 'This method is the second method.', 'This new one is the third one.' ]

In [None]:
# Learn the vocabulary of the three sample sentences and build their
# document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_new)

In [None]:
# Show the extracted feature names (the unique words of the corpus).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

In [None]:
# Display the repr of the matrix returned by fit_transform.
X

In [None]:
# Each row is a sentence and each column a feature (unique word).
# The first sentence contains only four of the features: the first three
# columns and the last one.
# In the second sentence of sample_new the word "method" occurs twice, so the
# corresponding column holds a 2.
print(X.toarray())

In [None]:
# This example shows that CountVectorizer (with its default settings) converts
# text to lower case before counting: "World" and "world" share one column.
# NOTE(review): this re-fits the same `vectorizer` object, replacing the
# vocabulary learned from sample_new above.
second_sample = ['Hello World.', 'Hello Hello World', 'Hello World world world']
XX = vectorizer.fit_transform(second_sample)
print(XX.toarray())

## Now we will perform all operations in a pipeline: (1) Remove punctuation, (2) Remove stopwords, (3) Tokenization

In [None]:
def text_cleaning(text, stop_words=None):
    """Lower-case a text, strip punctuation and drop stopwords.

    Args:
        text: The raw string to clean (e.g. a single tweet).
        stop_words: Optional container of lower-case words to discard.
            When omitted, the NLTK English stopword list is used. It is
            loaded once and cached on the function, instead of being
            re-read from the corpus for every single word.

    Returns:
        A list of the remaining lower-cased words, in their original order.
    """
    if stop_words is None:
        stop_words = getattr(text_cleaning, '_english_stopwords', None)
        if stop_words is None:
            # A frozenset gives constant-time membership tests, and caching it
            # avoids reloading the corpus on every call.
            stop_words = frozenset(stopwords.words('english'))
            text_cleaning._english_stopwords = stop_words
    # Delete every punctuation character in a single pass.
    remv_punc_join = text.lower().translate(str.maketrans('', '', string.punctuation))
    # The text is already lower-cased, so the words can be compared directly.
    return [word for word in remv_punc_join.split() if word not in stop_words]

In [None]:
# Try out the new function: clean every training tweet, then print the
# cleaned version of one of them.
tweets_df_clean = tweets_train_df['tweet'].map(text_cleaning)
print(tweets_df_clean[5])

In [None]:
# Original text of the same tweet, for comparison: the single user-defined
# function removed all punctuation and stopwords.
tweets_train_df.loc[5, 'tweet']

In [None]:
# Pass `text_cleaning` as the CountVectorizer `analyzer`, so every tweet is
# cleaned and split into words before the counts are computed.
# The named vectorizer is the one that gets fitted (previously it was created
# but never used, while a second anonymous instance was fitted and thrown
# away), so the learned vocabulary stays available, e.g. to transform unseen
# tweets with the same features.
vectorizer_analyzer = CountVectorizer(analyzer=text_cleaning, dtype='uint8')
# NOTE(review): .toarray() materialises a dense (n_tweets x vocabulary) array;
# MultinomialNB also accepts the sparse matrix if memory becomes a concern.
countvectorizer_tweets = vectorizer_analyzer.fit_transform(tweets_train_df['tweet']).toarray()

In [None]:
# Shape of the count matrix: (number of tweets, vocabulary size).
countvectorizer_tweets.shape

In [None]:
# Model inputs: the word-count matrix as features, the 'label' column as target.
X_features = countvectorizer_tweets
y_label = tweets_train_df['label']

## Now we will train a Naive Bayes Classifier Model

In [None]:
# Shape of the feature matrix.
X_features.shape

In [None]:
# Shape of the label vector; its length should match the number of feature rows.
y_label.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation, with a fixed seed for
# reproducibility. The classes are heavily imbalanced (see the count plot
# above), so the split is stratified on the label to keep the same class
# proportions in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=1, stratify=y_label
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained;
# the trailing expression displays the fitted model as before.
NaiveBclassifier = MultinomialNB().fit(X_train, y_train)
NaiveBclassifier

## Analyzing the model performance

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predict the labels of the held-out test split with the trained classifier.
y_pred_test = NaiveBclassifier.predict(X_test)

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_test)

# Per-class precision, recall and F1. With imbalanced classes these are more
# informative than raw counts; classification_report was imported but unused.
print(classification_report(y_test, y_pred_test))

# fmt='d' prints the integer counts in full instead of scientific notation.
sns.heatmap(cm, annot=True, fmt='d')