In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Index 
<a id='index'></a>
[Import libraries](#import_libraries) <br>
[Import functions](#import_functions) <br>
[Import dataset](#import_dataset) <br>
[Add polarity and subjectivity](#add_polarity_subj) <br>
[Look at pair plots](#pair_plot) <br>
[Word Cloud](#word_cloud) <br>
[Word Cloud for depressed](#word_cloud_dep) <br>
[Word Cloud for non-depressed](#word_cloud_non_dep) <br>
[Topic modelling on depressed](#lsa_depr) <br>
[Topic modelling on non-depressed](#lsa_nondepr) <br>

### Import libraries <a id='import_libraries'></a>

[Back to Index](#index)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
#Text manipulation
import nltk
from textblob import TextBlob
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from gensim import matutils, models
import scipy.sparse

#Text visualisation 
from wordcloud import WordCloud
from sklearn.feature_extraction import text 

## visualisation
#import pyLDAvis.gensim
#from gensim.corpora import Dictionary
#from gensim.models.coherencemodel import CoherenceModel

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
# Load the small English spaCy pipeline and keep a handle to it: lemmatization()
# looks up the module-level name `nlp`, so the result must not be discarded.
nlp = spacy.load("en_core_web_sm")

# NLTK data used by the helper functions in this notebook: the stop-word list,
# the tokenizer models (word_tokenize), the POS tagger (pos_tag) and WordNet
# (WordNetLemmatizer).
# NOTE(review): resource names differ between NLTK releases (newer ones may ask for
# 'punkt_tab' / 'averaged_perceptron_tagger_eng') - confirm against the installed version.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

# deactivate deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Import functions for tweets <a id='import_functions'></a>

[Back to Index](#index)

In [None]:
###### Function to prepare the tweets
def clean_text_1(text):
    """Normalise a raw tweet and drop English stop words.

    Steps, in order: lowercase; remove any ``[...]`` bracketed span; strip ASCII
    punctuation; remove tokens that contain a digit; strip curly quotes and the
    ellipsis character; turn newlines into spaces; tokenize with NLTK and keep
    only tokens longer than one character that are not English stop words.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The kept tokens, each preceded by a single space (so a non-empty result
        starts with a space; an input with no surviving tokens gives ``""``).
    """
    text = text.lower()
    # Raw-string patterns avoid Python's invalid-escape-sequence warnings for \[ and \w
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace('\n', ' ')

    # A set gives constant-time membership checks in the filter below.
    # NOTE(review): punctuation is stripped before this check, so contractions lose
    # their apostrophe first and may no longer match the stop list - confirm intended.
    english_stops = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(text) if w not in english_stops and len(w) > 1]

    # Same output format as before: every kept token is prefixed with one space
    return "".join(" " + w for w in kept)

##### Function to integrate polarity and subjectivity in the tweets
def pol(x):
    """Return the TextBlob sentiment polarity of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob sentiment subjectivity of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity

### Function to lemmatize the text

def lemmatize_tag(text):
    """Lemmatize every token of ``text``, using its POS tag as a hint.

    The text is tokenized and POS-tagged with NLTK. Tokens whose tag starts with
    J, N or V (case-insensitive) are lemmatized with the POS code 'a', 'n' or 'v'
    respectively; every other token is lemmatized with the lemmatizer's default.

    Returns the lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # First letter of the tagger's tag (lower-cased) -> POS code handed to the lemmatizer
    tag_to_pos = {'j': 'a', 'n': 'n', 'v': 'v'}
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        wn_pos = tag_to_pos.get(tag[0].lower())
        if wn_pos is None:
            lemmas.append(lemmatizer.lemmatize(token))
        else:
            lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return ' '.join(lemmas)


### Function to extract nouns
def nouns(text):
    '''Tokenize ``text`` and return only the tokens tagged as nouns, space-joined.'''
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for word, tag in tagged_tokens:
        # Keep tokens whose POS tag begins with 'NN'
        if tag.startswith('NN'):
            kept.append(word)
    return ' '.join(kept)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document with gensim's ``simple_preprocess`` and drop stop words.

    Reads the module-level ``stop_words`` collection at call time, so that name
    must be defined before this function is called.

    Returns a list with one list of kept tokens per input document.
    """
    cleaned_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned_docs.append([tok for tok in tokens if tok not in stop_words])
    return cleaned_docs

def make_bigrams(texts):
    """Index the module-level ``bigram_mod`` with each document and collect the results.

    NOTE(review): ``bigram_mod`` is not defined in the visible part of this
    notebook; it must be created before this function is called.
    """
    transformed = []
    for doc in texts:
        transformed.append(bigram_mod[doc])
    return transformed

def make_trigrams(texts):
    """Index ``bigram_mod`` and then ``trigram_mod`` with each document and collect the results.

    NOTE(review): neither ``bigram_mod`` nor ``trigram_mod`` is defined in the
    visible part of this notebook; both must exist before this function is called.
    """
    transformed = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        transformed.append(trigram_mod[with_bigrams])
    return transformed

## lemmatization with noun, adjective, verbs, adverb
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS.

    Each document (an iterable of token strings) is joined with spaces and passed
    to the module-level ``nlp`` callable; the ``lemma_`` of every resulting token
    whose ``pos_`` is in ``allowed_postags`` is kept.
    POS labels: https://spacy.io/api/annotation

    ``nlp`` must be bound at module level (e.g. to the object returned by
    ``spacy.load``) before this function is called. The default list is only
    read, never mutated.

    Returns a list with one list of lemmas per input document.
    """
    def _allowed_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_allowed_lemmas(sent) for sent in texts]


### Import dataset <a id='import_dataset'></a>

[Back to Index](#index)

In [None]:

# Load the tweets CSV from the Kaggle input directory; later cells read the
# 'message to examine' and 'label (depression result)' columns from this frame.
df = pd.read_csv("/kaggle/input/sentimental-analysis-for-tweets/sentiment_tweets3.csv")

In [None]:
# Show the loaded frame as the cell output
df

In [None]:
# Print the frame summary produced by DataFrame.info()
df.info()

### Add columns for polarity and subjectivity<a id='add_polarity_subj'></a>

[Back to Index](#index)

In [None]:
# Score every tweet with the TextBlob-based helpers and store the results as new columns
messages = df['message to examine']
df['polarity'] = messages.apply(pol)
df['subjectivity'] = messages.apply(sub)
df

The data is unbalanced: there are fewer depressed tweets than non-depressed ones.

In [None]:

# Histogram of the label column, to compare how many tweets fall in each class
sns.histplot(df['label (depression result)'])

### Pair plots <a id='pair_plot'></a>

Polarity alone does not separate depressed from non-depressed tweets.

[Back to Index](#index)

In [None]:
# Pairwise plots of polarity and subjectivity, coloured by the depression label
sns.pairplot(df[['polarity','subjectivity','label (depression result)']],hue='label (depression result)')

### Word Cloud <a id='word_cloud'></a>


[Back to Index](#index)

In [None]:
# Clean text: normalise each tweet first, then lemmatize the *cleaned* text.
# (Previously lemmatize_tag was applied to the raw column and its result overwrote
# the cleaned frame, so the cleaning step had no effect.)
df_clean = pd.DataFrame(
    df['message to examine'].apply(clean_text_1).apply(lemmatize_tag)
)

In [None]:
# NLTK English stop words. remove_stopwords() reads this module-level name,
# and the next cell builds an extended copy of it.
stop_words = stopwords.words('english')

In [None]:
# Extended stop-word list: a copy of the NLTK list plus URL fragments,
# contractions, mis-encoded characters and a few filler words
stop_words2 = list(stop_words)
stop_words2 += ['http://t.',"I'm",'http',"can't",'Å','Ā','like','t','åā','www','com','https']

In [None]:
# Word-cloud generator configured with the extended stop-word list and a fixed random state
wc = WordCloud(
    collocations=False,
    stopwords=stop_words2,
    background_color='white',
    colormap='Dark2',
    max_font_size=150,
    random_state=42,
)

In [None]:
# Concatenate all tweets into one string (each tweet preceded by a single space)
text_to_analyze = ''.join(' ' + tweet for tweet in df['message to examine'])

In [None]:
# Build the word cloud from the concatenated tweets and rebind `wc` to the result
wc = wc.generate(text_to_analyze)

'depression' appears as the largest word.

In [None]:
# Render the word cloud for all tweets on a 10x10 figure with the axes hidden
plt.rcParams.update({'figure.figsize': [10, 10]})
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Wordcloud for Tweets')
plt.show()

### Word Cloud for depressed <a id='word_cloud_dep'></a>

The words 'depression' and 'anxiety' appear as the most relevant.

[Back to Index](#index)

In [None]:
# Keep only the cleaned tweets whose label equals 1 (the depressed class)
is_depressed = df['label (depression result)'] == 1
df_depressed = df_clean.loc[is_depressed]

In [None]:
# Concatenate the depressed tweets (each preceded by a single space)
text_to_analyze = ''.join(' ' + tweet for tweet in df_depressed['message to examine'])

# Fresh generator with the same settings as before, built directly from the text
wc = WordCloud(
    collocations=False,
    stopwords=stop_words2,
    background_color='white',
    colormap='Dark2',
    max_font_size=150,
    random_state=42,
).generate(text_to_analyze)

# Wordcloud plot
plt.rcParams.update({'figure.figsize': [10, 10]})
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Wordcloud for depressed Tweets')
plt.show()

### Word Cloud for non-depressed <a id='word_cloud_non_dep'></a>

Words like 'good', 'thank' and 'love' are more relevant

[Back to Index](#index)

In [None]:
# Keep only the cleaned tweets whose label equals 0 (the non-depressed class)
is_nondepressed = df['label (depression result)'] == 0
df_nondepressed = df_clean.loc[is_nondepressed]

In [None]:
# Also hide 'get' from here on. The list is still updated in place because later
# cells reuse stop_words2, but the guard keeps the cell idempotent: re-running it
# no longer appends a duplicate entry each time.
if 'get' not in stop_words2:
    stop_words2.append('get')

# Concatenate the non-depressed tweets (each preceded by a single space);
# str.join avoids rebuilding the whole string once per tweet.
text_to_analyze = ''.join(' ' + tweet for tweet in df_nondepressed['message to examine'])

wc = WordCloud(collocations=False,stopwords=stop_words2, background_color='white', colormap='Dark2',
               max_font_size=150, random_state=42)
wc = wc.generate(text_to_analyze)

# Wordcloud plot
plt.rcParams['figure.figsize'] = [10, 10]
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title('Wordcloud for non-depressed Tweets')
plt.show()

### Topic modelling on depressed <a id='lsa_depr'></a>

first topic: 'depression', 'anxiety'

[Back to Index](#index)

In [None]:
# Bag-of-words vectorizer with default settings; the trailing comment keeps an
# earlier, stricter configuration for reference. Reused (re-fitted) by both topic sections.
cv = CountVectorizer()#CountVectorizer(min_df=.2, max_df=.8,stop_words=stop_words2)

In [None]:
# Collect the depressed tweets into a plain Python list for the vectorizers
alltweets = list(df_depressed['message to examine'])

In [None]:
# Document-term count matrix for the depressed tweets
data_cv = cv.fit_transform(alltweets)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
cv_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# One row per tweet (same index as df_depressed), one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv_terms)
data_dtm.index = df_depressed.index
data_dtm

In [None]:
# TF-IDF matrix for the depressed tweets, ignoring the extended stop-word list
vectorizer = TfidfVectorizer(stop_words=stop_words2)
tfidf = vectorizer.fit_transform(alltweets)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
tfidf_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# One row per tweet (same index as df_depressed), one column per vocabulary term
data_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_terms)
data_tfidf.index = df_depressed.index
data_tfidf

In [None]:

# SVD represents documents and terms as vectors in a low-dimensional topic space.
# random_state is fixed so the extracted topics are the same on every run.
svd_model = TruncatedSVD(n_components=5, random_state=42) #try with 10 topics
svd_model.fit(data_tfidf)
print(svd_model.components_.shape)
print(svd_model.singular_values_)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Print out the topics: the 7 highest-weighted terms of each SVD component
for i, comp in enumerate(svd_model.components_):
    top_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    print("".join("%.2f*%s " % (weight, term) for term, weight in top_terms))

### Topic modelling on non-depressed <a id='lsa_nondepr'></a>

first topic: 'good', 'day', 'love', 'today'

[Back to Index](#index)

In [None]:
# Collect the non-depressed tweets into a plain Python list for the vectorizers
alltweets = list(df_nondepressed['message to examine'])

In [None]:
# Document-term count matrix for the non-depressed tweets (re-fits the shared CountVectorizer)
data_cv = cv.fit_transform(alltweets)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
cv_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# One row per tweet (same index as df_nondepressed), one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv_terms)
data_dtm.index = df_nondepressed.index

In [None]:
# TF-IDF matrix for the non-depressed tweets, ignoring the extended stop-word list
vectorizer = TfidfVectorizer(stop_words=stop_words2)
tfidf = vectorizer.fit_transform(alltweets)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
tfidf_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# One row per tweet (same index as df_nondepressed), one column per vocabulary term
data_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_terms)
data_tfidf.index = df_nondepressed.index

In [None]:

# SVD represents documents and terms as vectors in a low-dimensional topic space.
# random_state is fixed so the extracted topics are the same on every run.
svd_model = TruncatedSVD(n_components=5, random_state=42) #try with 10 topics
svd_model.fit(data_tfidf)
print(svd_model.components_.shape)
print(svd_model.singular_values_)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Print out the topics: the 7 highest-weighted terms of each SVD component
for i, comp in enumerate(svd_model.components_):
    top_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    print("".join("%.2f*%s " % (weight, term) for term, weight in top_terms))