In [178]:
import os
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
import pymongo
from dotenv import load_dotenv
from nltk.corpus import stopwords
from sklearn import datasets, svm
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from helpers import connect_to_db, decontracted, display_topics

## Connect to database - Load data to pandas

In [2]:
client = connect_to_db()
db = client.accounts # connect to my database

posts = db['posts'] # collection of posts
authors = db['authors'] # collection of authors

In [9]:
cursor = posts.find()
entries = list(cursor)

df = pd.DataFrame(entries)

In [176]:
df.head()

Unnamed: 0,_id,account,post_id,likes,comments,date,content,hashtags,number_hashtags,img_text,number_emojis,mentions,emoji_terms,pre_cleaned_text,url_email
0,5f34706c8091264adc7b3c0f,mindfulmft,ierq6xRnBL,55,1,2013-12-28 15:53:14,#therapy #positivequotes #quotes #marriage #re...,guidance challenge mentalhealth quotes positiv...,14,true humility is staying teachabl,0.0,,,,
1,5f34706d8091264adc7b3c11,mindfulmft,ietVO_RnD_,33,0,2013-12-28 16:07:45,#motivation #love #power #encourage #journey #...,journey power forward relationship encourage a...,14,YOU ARE FAR Coo. pnart- TO BE THE ONLY THING S...,0.0,,,,
2,5f34706d8091264adc7b3c12,mindfulmft,ietrkpRnEl,65,1,2013-12-28 16:10:48,#storms #accomplishments #roots #strength #liv...,storms wisdom forward relationship encourageme...,15,Storms make trees take deeper root ss - Dolly ...,0.0,,,,
3,5f34706d8091264adc7b3c13,mindfulmft,ieum0hxnF-,39,1,2013-12-28 16:18:53,#words #self #life #MINDFULMFT #mindfulness #c...,wisdom control encouragement lessons motivatio...,13,- Let anyone determine your self-worth. = Spea...,0.0,,,,
4,5f34706d8091264adc7b3c14,mindfulmft,jH1aVfxnDG,61,1,2014-01-13 15:27:13,#truth #wisdom #wise #think #act #motivation #...,think wisdom wise family counseling encourage ...,16,Most of the problems in life are because of tw...,0.0,,,,


## CountVectorizer

In [11]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df['pre_cleaned_text'])
doc_word = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

### LSA (Latent Semantic Analysis)

In [12]:
components = 10
lsa = TruncatedSVD(components)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.04197534, 0.02159446, 0.01473892, 0.01198958, 0.01063135,
       0.00969137, 0.00926591, 0.00768712, 0.00726978, 0.00710787])

In [13]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = [f"component_{c + 1}" for c in range(components)],
             columns = cv.get_feature_names())
topic_word

Unnamed: 0,00,000,001,0012,0013,01,03,04,05,09,...,zombies,zone,zoned,zones,zoning,zoo,zoom,zoomed,zozi,zumba
component_1,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0
component_2,-0.0,-0.001,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,...,-0.0,-0.001,-0.0,-0.001,-0.0,-0.0,-0.0,0.0,-0.0,-0.0
component_4,-0.0,0.001,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
component_5,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.001,0.0,0.0,0.0,0.0,-0.0,-0.0
component_6,-0.0,-0.002,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,...,-0.0,-0.003,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
component_7,0.0,0.002,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,...,0.0,-0.001,0.0,-0.001,0.0,0.0,-0.001,-0.0,-0.0,-0.0
component_8,-0.001,-0.004,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,...,0.0,-0.0,-0.0,0.0,-0.001,0.0,-0.001,-0.0,-0.0,-0.0
component_9,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.001,-0.0,0.0,0.0
component_10,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0,0.001,-0.0,0.0,-0.001,0.0,-0.001,0.0,-0.0,0.0


In [19]:
display_topics(lsa, cv.get_feature_names(), 5)


Topic  1
love, people, feel, just, need

Topic  2
love, loving, loved, great, let

Topic  3
people, want, just, know, let

Topic  4
people, self, ego, love, care

Topic  5
trauma, feel, body, love, like

Topic  6
relationship, relationships, partner, person, boundaries

Topic  7
trauma, work, relationship, healing, relationships

Topic  8
need, let, trauma, pain, hurt

Topic  9
need, trauma, know, want, like

Topic  10
person, just, self, want, like


### NMF (Non-negative Matrix Factorization)

In [15]:
components = 10
nmf_model = NMF(components)
doc_topic = nmf_model.fit_transform(doc_word)

In [16]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = [f"component_{c + 1}" for c in range(components)],
             columns = cv.get_feature_names())
topic_word

Unnamed: 0,00,000,001,0012,0013,01,03,04,05,09,...,zombies,zone,zoned,zones,zoning,zoo,zoom,zoomed,zozi,zumba
component_1,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0
component_2,-0.0,-0.001,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,...,-0.0,-0.001,-0.0,-0.001,-0.0,-0.0,-0.0,0.0,-0.0,-0.0
component_4,-0.0,0.001,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,...,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
component_5,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.001,0.0,0.0,0.0,0.0,-0.0,-0.0
component_6,-0.0,-0.002,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,...,-0.0,-0.003,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
component_7,0.0,0.002,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,...,0.0,-0.001,0.0,-0.001,0.0,0.0,-0.001,-0.0,-0.0,-0.0
component_8,-0.001,-0.004,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,...,0.0,-0.0,-0.0,0.0,-0.001,0.0,-0.001,-0.0,-0.0,-0.0
component_9,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,0.001,-0.0,0.0,0.0
component_10,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0,0.001,-0.0,0.0,-0.001,0.0,-0.001,0.0,-0.0,0.0


In [21]:
display_topics(nmf_model, cv.get_feature_names(), 10)


Topic  1
let, does, just, way, pain

Topic  2
love, loving, loved, great, fear

Topic  3
self, ego, care, child, inner

Topic  4
people, boundaries, life, person, think

Topic  5
trauma, body, healing, nervous, response

Topic  6
relationship, relationships, partner, person, work

Topic  7
feel, like, feeling, emotions, feelings

Topic  8
need, needs, boundaries, ask, space

Topic  9
want, know, like, just, things

Topic  10
time, life, work, new, day


## Using lemmatizers

In [22]:
tokenizer_ = nltk.stem.snowball.EnglishStemmer()

In [72]:
raw =  df[~df['pre_cleaned_text'].replace('', np.nan).isna()]['pre_cleaned_text'] #.apply(lambda post: ' '.join([w for w in post.split() if (w not in stopwords.words('english'))]))

In [73]:
raw

8                                   Explore your passions.
13       You do not know anyone else is story. It does ...
15       A great message. It hurt because it mattered a...
16       Do not let fears or self doubt get in the way....
18       With every obstacle you have a choice: to let ...
                               ...                        
14861    i am so excited to share my collaboration with...
14862                             speak it out loud loves!
14863        here is to a weekend with some time for you x
14864    Because self-care is community care. Community...
14865    During this time of collective healing, I have...
Name: pre_cleaned_text, Length: 14671, dtype: object

In [74]:
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), token_pattern='[a-z]{3,}', max_features=20000, min_df=100, max_df=14671)
X = cv.fit_transform(raw_documents=raw)
doc_word = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
doc_word

Unnamed: 0,abandon,abandoned,abandonment,ability,able,absence,absolutely,abuse,abusive,accept,...,write,writing,written,wrong,year,years,yes,yesterday,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14668,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
lemma = nltk.wordnet.WordNetLemmatizer()
stem_1 = nltk.stem.lancaster.LancasterStemmer()
stem_2 = nltk.stem.porter.PorterStemmer()
stem_3 = nltk.stem.snowball.EnglishStemmer()

In [138]:
def preprocessing(stem_):
    f = lambda text: ' '.join([stem_.stem(lemma.lemmatize(w)) for w in text.split()])
    
    token_stopwords = [f(w) for w in stopwords.words('english')]

    cv = CountVectorizer(preprocessor=lambda text: f(text), stop_words=token_stopwords, ngram_range=(1, 1), min_df=1000, max_df=14671)
    X = cv.fit_transform(raw_documents=raw)
    doc_word = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())
    
    return doc_word

In [146]:
s3 = preprocessing(stem_3)

In [174]:
f = lambda text: ' '.join([stem_3.stem(lemma.lemmatize(w)) for w in text.split()])
    
token_stopwords = [f(w) for w in stopwords.words('english')]

cv = CountVectorizer(preprocessor=lambda text: f(text), stop_words=token_stopwords, ngram_range=(1, 1))
X = cv.fit_transform(raw_documents=raw)
doc_word = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

In [170]:
components = 10
lsa = TruncatedSVD(components)
doc_topic = lsa.fit_transform(s3)
lsa.explained_variance_ratio_

array([0.1108285 , 0.04915207, 0.03160552, 0.02561724, 0.02361751])

In [171]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = [f"component_{c + 1}" for c in range(components)],
             columns = cv.get_feature_names())
topic_word

Unnamed: 0,abl,accept,actual,allow,also,alway,anoth,around,as,ask,...,use,want,way,well,without,word,work,world,would,year
component_1,0.035,0.035,0.036,0.058,0.062,0.063,0.033,0.053,0.03,0.071,...,0.05,0.156,0.168,0.031,0.042,0.04,0.155,0.045,0.087,0.029
component_2,-0.005,0.005,-0.006,-0.009,-0.027,-0.013,0.003,-0.01,-0.012,-0.017,...,-0.037,-0.041,-0.021,-0.012,-0.005,0.005,-0.053,-0.01,-0.038,-0.009
component_3,0.008,-0.024,-0.016,0.008,0.006,-0.036,-0.006,-0.01,0.014,-0.028,...,0.012,-0.148,-0.024,-0.008,-0.001,0.014,-0.103,-0.009,-0.037,-0.025
component_4,0.0,0.012,-0.012,-0.011,0.022,-0.005,0.018,-0.007,0.036,-0.038,...,-0.003,-0.086,-0.022,0.008,0.009,0.012,0.184,0.003,0.0,0.003
component_5,-0.007,-0.003,-0.01,-0.022,-0.022,0.008,0.008,-0.028,-0.029,0.045,...,-0.02,0.141,-0.031,-0.02,-0.005,-0.001,-0.119,-0.026,-0.034,-0.036


In [172]:
display_topics(lsa, cv.get_feature_names(), 5)


Topic  1
love, feel, need, peopl, relationship

Topic  2
love, partner, relationship, fear, give

Topic  3
feel, love, emot, like, safe

Topic  4
relationship, self, work, trauma, heal

Topic  5
relationship, partner, want, person, feel


# TD-IDF

In [None]:
tfidf1 = TfidfVectorizer(stop_words='english')
X_train_tfidf1 = tfidf1.fit_transform(X_train)
X_test_tfidf1  = tfidf1.transform(X_test)