# Twitter Sentiment Analysis


In this Jupyter notebook, we first explore the data to extract patterns and gain better insight into the dataset.
Then we apply various models to get better results.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import preprocess_documents
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
set(stopwords.words('english'))
import regex as re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files from the Kaggle input directory into pandas DataFrames.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')

In [None]:
# Show pandas' summary statistics for the training frame as the cell output.
train.describe()

In [None]:
# Build a pandas-profiling report for the training frame.
# NOTE(review): minimal=True presumably switches off the more expensive analyses — confirm against the pandas_profiling docs.
profile = ProfileReport(train, minimal=True)

In [None]:
# Display the profiling report built in the previous cell as this cell's output.
profile

# 1- Exploratory Data Analysis


### 1-1- Comparing distribution of number of words

#### The kernel density plots show the overall distribution of word counts for each sentiment; the histograms that follow show the distribution in more detail

In [None]:
# Count the whitespace-separated words in each tweet and in its selected span.
# Missing values are replaced by an empty string first so they count as 0 words;
# wrapping them in str() would turn NaN into the literal 'nan' and count it as one word.
train['Number_of_words_text'] = train['text'].fillna('').astype(str).str.split().str.len()
train['Number_of_words_sel_text'] = train['selected_text'].fillna('').astype(str).str.split().str.len()


In [None]:
# Overlay the word-count densities of the full tweet (red) and of the selected
# span (blue) for neutral tweets.
plt.figure(figsize=(15,10))
plt.title('Kernel distribution of number of words for neutral sentiment')
text=sns.kdeplot(train.loc[(train['sentiment']=='neutral'),'Number_of_words_text'], shade=True, color="r", label="Text")
sel_text=sns.kdeplot(train.loc[(train['sentiment']=='neutral'),'Number_of_words_sel_text'], shade=True, color="b", label="Selected Text")
# Both curves share one axis: name the axis and label the curves explicitly so
# the figure can be read on its own.
plt.xlabel('Number of words')
plt.legend()

In [None]:
# Overlay the word-count densities of the full tweet (red) and of the selected
# span (blue) for positive tweets.
plt.figure(figsize=(15,10))
# Title worded like the neutral/negative plots ("number of words").
plt.title('Kernel distribution of number of words for positive sentiment')
text=sns.kdeplot(train.loc[(train['sentiment']=='positive'),'Number_of_words_text'], shade=True, color="r", label="Text")
sel_text=sns.kdeplot(train.loc[(train['sentiment']=='positive'),'Number_of_words_sel_text'], shade=True, color="b", label="Selected Text")
# Both curves share one axis: name the axis and label the curves explicitly so
# the figure can be read on its own.
plt.xlabel('Number of words')
plt.legend()

In [None]:
# Overlay the word-count densities of the full tweet (red) and of the selected
# span (blue) for negative tweets.
plt.figure(figsize=(15,10))
plt.title('Kernel distribution of number of words for negative sentiment')
text=sns.kdeplot(train.loc[(train['sentiment']=='negative'),'Number_of_words_text'], shade=True, color="r", label="Text")
sel_text=sns.kdeplot(train.loc[(train['sentiment']=='negative'),'Number_of_words_sel_text'], shade=True, color="b", label="Selected Text")
# Both curves share one axis: name the axis and label the curves explicitly so
# the figure can be read on its own.
plt.xlabel('Number of words')
plt.legend()

In [None]:
 
# Method 1: on the same Axis
# Word-count distribution of the full text vs. the selected text for neutral tweets.
sns.distplot(train.loc[(train['sentiment']=='neutral'),'Number_of_words_text'], color="skyblue", label="Text")
sns.distplot(train.loc[(train['sentiment']=='neutral'),'Number_of_words_sel_text'], color="red", label="Selected Text")
# The title names the sentiment that is actually plotted (it previously said "negative").
plt.title('Distribution plot for text and selected text of neutral sentiments')
plt.legend()
 


In [None]:
 
# Method 1: on the same Axis
# Word-count distribution of the full text vs. the selected text for positive tweets.
sns.distplot(train.loc[(train['sentiment']=='positive'),'Number_of_words_text'], color="skyblue", label="Text")
sns.distplot(train.loc[(train['sentiment']=='positive'),'Number_of_words_sel_text'], color="red", label="Selected Text")
# The title names the sentiment that is actually plotted (it previously said "negative").
plt.title('Distribution plot for text and selected text of positive sentiments')
plt.legend()
 


In [None]:
 
# Method 1: on the same Axis
# Word-count distribution of the full text vs. the selected text for negative tweets.
sns.distplot(train.loc[(train['sentiment']=='negative'),'Number_of_words_text'], color="skyblue", label="Text")
sns.distplot(train.loc[(train['sentiment']=='negative'),'Number_of_words_sel_text'], color="red", label="Selected Text")
plt.title('Distribution plot for text and selected text of negative sentiments')
plt.legend()
 


### 1-2- Calculating similarity between text and selected_text

In [None]:
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of two token collections.

    Each argument may be a string, which is split on whitespace, or an
    iterable of tokens (for example a list), which is used as given. The
    result is ``len(A & B) / len(A | B)`` over the two resulting token sets.

    Returns:
        A float in [0, 1]. When neither input yields any token (empty strings,
        whitespace-only strings, or empty lists) the inputs are considered
        identical and 1.0 is returned.
    """
    # Handle each argument on its own so a string can be compared with a
    # token list, and so any iterable of tokens (not only `list`) is accepted.
    a = set(str1.split()) if isinstance(str1, str) else set(str1)
    b = set(str2.split()) if isinstance(str2, str) else set(str2)

    union_size = len(a | b)
    # Guard on the token sets rather than on the raw lengths: a string such as
    # "  " is non-empty but produces no tokens, which used to divide by zero.
    if union_size == 0:
        return 1.0
    return len(a & b) / union_size

In [None]:
# Drop every row that has a missing value in any column, modifying `train` in place.
train.dropna(inplace=True)

In [None]:
# Jaccard similarity between each tweet and its selected span.
# The column is created in a single assignment (no placeholder value needed), and
# the two text columns are zipped directly instead of building a row object per tweet.
train['jaccard_sim'] = [
    get_jaccard_sim(tweet, span)
    for tweet, span in zip(train['text'], train['selected_text'])
]

In [None]:
# Number of rows per sentiment label.
train['sentiment'].value_counts()

In [None]:
# Mean text/selected_text Jaccard similarity, printed in the order
# neutral, positive, negative.
for label in ('neutral', 'positive', 'negative'):
    print(train.loc[train['sentiment']==label,'jaccard_sim'].mean())

In [None]:
# Histogram of the text/selected_text Jaccard similarity for neutral tweets.
plt.hist(train.loc[train['sentiment']=='neutral','jaccard_sim'])
# Title and axis labels let the figure stand alone; plt.show() keeps the raw
# (counts, bins, patches) tuple out of the cell output.
plt.title('Jaccard similarity between text and selected text (neutral)')
plt.xlabel('Jaccard similarity')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Histogram of the text/selected_text Jaccard similarity for positive tweets.
plt.hist(train.loc[train['sentiment']=='positive','jaccard_sim'])
# Title and axis labels let the figure stand alone; plt.show() keeps the raw
# (counts, bins, patches) tuple out of the cell output.
plt.title('Jaccard similarity between text and selected text (positive)')
plt.xlabel('Jaccard similarity')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Histogram of the text/selected_text Jaccard similarity for negative tweets.
plt.hist(train.loc[train['sentiment']=='negative','jaccard_sim'])
# Title and axis labels let the figure stand alone; plt.show() keeps the raw
# (counts, bins, patches) tuple out of the cell output.
plt.title('Jaccard similarity between text and selected text (negative)')
plt.xlabel('Jaccard similarity')
plt.ylabel('Number of tweets')
plt.show()

In [None]:
# Rows whose Jaccard similarity is exactly 0, i.e. text and selected_text share no token.
train.loc[train['jaccard_sim']==0]

In [None]:
# Rows whose Jaccard similarity is exactly 1, i.e. text and selected_text have the same token set.
train.loc[train['jaccard_sim']==1]

In [None]:
# Mean tweet length (in words) for rows with similarity exactly 1, then exactly 0.
for similarity in (1, 0):
    print(train.loc[train['jaccard_sim']==similarity, 'Number_of_words_text'].mean())

In [None]:
# Mean tweet length (in words) for three similarity bands, printed in the
# order: below 0.5, above 0.5, above 0.4.
similarity = train['jaccard_sim']
for band in (similarity<0.5, similarity>0.5, similarity>0.4):
    print(train.loc[band, 'Number_of_words_text'].mean())

In [None]:
# Mean selected-text length (in words) for three similarity bands, printed in
# the order: below 0.5, above 0.5, above 0.4.
similarity = train['jaccard_sim']
for band in (similarity<0.5, similarity>0.5, similarity>0.4):
    print(train.loc[band, 'Number_of_words_sel_text'].mean())

#### Observation:

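Average Jaccard similarity between text and selected_text for neutral tweets -  0.97
Average Jaccard similarity between text and selected_text for positive tweets -  0.31
Average Jaccard similarity between text and selected_text for negative tweets -  0.33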

Difference in number of words between text and selected text increases when jaccard similarity drops below 0.4

Distribution of sentiment - 40% neutral; 31% positive and 28% negative

### 1-3- Checking stop word distribution in text and selected text

In [None]:
# NLTK's English stop-word list, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Tokenise each tweet / selected span and keep only the tokens that are NOT
# English stop words (despite the `stop_words_*` column names).
# NLTK's stop-word list is lower-case, so each token is lower-cased for the
# membership test; otherwise capitalised stop words such as "The" slip through.
# Series.apply is used so every cell reliably holds one token list.
train['stop_words_text'] = train['text'].apply(
    lambda doc: [tok for tok in word_tokenize(doc) if tok.lower() not in stop_words])
train['stop_words_sel_text'] = train['selected_text'].apply(
    lambda doc: [tok for tok in word_tokenize(doc) if tok.lower() not in stop_words])

In [None]:
# Count of missing values in each column of `train`.
train.isna().sum()

In [None]:
# Jaccard similarity between the stop-word-filtered token lists of each tweet
# and of its selected span.
# The column is created in a single assignment (no placeholder value needed), and
# the two token columns are zipped directly instead of building a row object per tweet.
train['jaccard_sim_stopwords'] = [
    get_jaccard_sim(tweet_tokens, span_tokens)
    for tweet_tokens, span_tokens in zip(train['stop_words_text'], train['stop_words_sel_text'])
]

In [None]:
  # Histogram of the stop-word-filtered Jaccard similarity for neutral tweets.
  plt.hist(train.loc[train['sentiment']=='neutral','jaccard_sim_stopwords'])
  # Title and axis labels let the figure stand alone; plt.show() keeps the raw
  # (counts, bins, patches) tuple out of the cell output.
  plt.title('Jaccard similarity with stop words filtered out (neutral)')
  plt.xlabel('Jaccard similarity')
  plt.ylabel('Number of tweets')
  plt.show()

In [None]:
  # Histogram of the stop-word-filtered Jaccard similarity for positive tweets.
  plt.hist(train.loc[train['sentiment']=='positive','jaccard_sim_stopwords'])
  # Title and axis labels let the figure stand alone; plt.show() keeps the raw
  # (counts, bins, patches) tuple out of the cell output.
  plt.title('Jaccard similarity with stop words filtered out (positive)')
  plt.xlabel('Jaccard similarity')
  plt.ylabel('Number of tweets')
  plt.show()

In [None]:
  # Histogram of the stop-word-filtered Jaccard similarity for negative tweets.
  plt.hist(train.loc[train['sentiment']=='negative','jaccard_sim_stopwords'])
  # Title and axis labels let the figure stand alone; plt.show() keeps the raw
  # (counts, bins, patches) tuple out of the cell output.
  plt.title('Jaccard similarity with stop words filtered out (negative)')
  plt.xlabel('Jaccard similarity')
  plt.ylabel('Number of tweets')
  plt.show()

In [None]:
# Rows where the stop-word-filtered similarity differs from the similarity computed on all tokens.
train.loc[train['jaccard_sim_stopwords']!=train['jaccard_sim']]

### Observation 
Both text and selected text have stopwords, and the distribution of Jaccard similarity with stopwords filtered out looks the same as the similarity distribution computed on all words - hence, possibly, separating stopwords won't help. I also tried the usual text-cleaning steps such as removing punctuation, repeated characters and emoji, and normalizing the text - but that didn't seem to help, as selected_text contains them too.

### 1-4- WordCloud

In [None]:
def text_preprocessing(data, index):
    """Lower-case, stop-word-strip and gensim-preprocess one column of `data`.

    `data` is indexed as a 2-D array (one row per document) and `index`
    selects the column holding the text. The return value is whatever
    gensim's ``preprocess_documents`` produces for the cleaned documents.
    """
    lowered = (row[index].lower() for row in data)
    without_stopwords = [remove_stopwords(document) for document in lowered]
    return preprocess_documents(without_stopwords)

In [None]:
def make_word_cloud(df, preprocess, column_names):
    """Draw positive, negative and neutral word clouds for one text column.

    Args:
        df: DataFrame holding a 'sentiment' column and the text column to plot.
        preprocess: if true, the text is passed through ``text_preprocessing``
            and each resulting token sequence is re-joined with spaces;
            otherwise the text is only lower-cased.
        column_names: name of the (single) text column to visualise.

    The input frame is not modified. Writing the processed text back into
    `df` would make a later call with ``preprocess=False`` run on text that
    an earlier ``preprocess=True`` call had already transformed.
    """
    # Missing entries become empty strings so that .lower() works on every row.
    texts = df[column_names].fillna('')

    if preprocess:
        # text_preprocessing indexes a 2-D array as data[i, index], so hand it
        # a one-column array and select column 0.
        token_lists = text_preprocessing(texts.to_frame().values, 0)
        documents = [" ".join(tokens) for tokens in token_lists]
    else:
        documents = [text.lower() for text in texts]

    sentiments = list(df['sentiment'])
    # (title prefix, sentiment label) for each of the three stacked panels.
    panels = [('Positive', 'positive'), ('Negative', 'negative'), ('Neutral', 'neutral')]

    fig = plt.figure(figsize=(10, 10))
    for position, (label, sentiment) in enumerate(panels, start=1):
        # All documents of this sentiment, concatenated into one corpus string.
        corpus = " ".join(doc for doc, tag in zip(documents, sentiments) if tag == sentiment)
        fig.add_subplot(3, 1, position)
        wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(corpus)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title('{} Word Cloud for {} column with Preprocess {} \n'.format(label, column_names, preprocess), fontsize=15)

    plt.show()

In [None]:
# Word clouds for the 'selected_text' column with preprocess=True.
make_word_cloud(train, True, 'selected_text')

In [None]:
# Word clouds for the 'text' column with preprocess=True.
make_word_cloud(train, True, 'text')

In [None]:
# Word clouds for the 'selected_text' column with preprocess=False.
make_word_cloud(train, False, 'selected_text')

In [None]:
# Word clouds for the 'text' column with preprocess=False.
make_word_cloud(train, False, 'text')