## Applying stop-words

In [1]:

import pandas as pd
import numpy as np
import pickle
import os

import seaborn as sns
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names. Prefer the new name when the installed
# matplotlib provides it and fall back to the legacy name otherwise.
plt.style.use(
    "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
)

from gensim import matutils, models
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
import scipy.sparse

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

In [3]:
# Load the cleaned English tweet table; the relative path is resolved against
# the current working directory.
en_df = pd.read_csv('english_clean_df.csv')

In [4]:
# Show the loaded frame to check which columns are available.
en_df

Unnamed: 0,twitter_id,content,removed_links,removed_users,removed_hashtags,removed_av,tokens,lemmatized,lang
0,aquitainexox,@Banterm42404691 @MrHarryCole Absolutely one s...,@Banterm42404691 @MrHarryCole Absolutely one s...,Absolutely one self indulgent ego trip!! #COP26,Absolutely one self indulgent ego trip!!,Absolutely one self indulgent ego trip!!,absolutely self indulgent ego trip,absolutely self indulgent ego trip,en
1,GEC_JCM_Info,"Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a side event ""Implementing ...","Please join us at a side event ""Implementing ...",join event implement joint credit mechanism jc...,join event implement joint credit mechanism jc...,en
2,zpnine,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,"’s huge solar uptake has boosted goals, says ...","’s huge solar uptake has boosted goals, says ...",huge solar uptake boost goals say minister,huge solar uptake boost goals say minister,en
3,PGDynes,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,concrete need accommodate billion people citie...,concrete need accommodate billion people citie...,en
4,Xplorit,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,Senator &amp; Ennersystams fraud . gets a pro...,Senator &amp; Ennersystams fraud . gets a pro...,senator amp ennersystams fraud get proper welc...,senator amp ennersystams fraud get proper welc...,en
...,...,...,...,...,...,...,...,...,...
5949,AqsaYounasRana,"ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...",climatechange challenge crisis soon catastroph...,climatechange challenge crisis soon catastroph...,en
5950,pete4peace,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,confront coal baron morning fuck,confront coal baron morning fuck,en
5951,mmill_landscape,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,feel simply disgust countries sign decrease st...,feel simply disgust countries sign decrease st...,en
5952,KarynCaplan,🚨 Join us in calling for @Google and #Facebook...,🚨 Join us in calling for @Google and #Facebook...,🚨 Join us in calling for and #Facebook () to ...,🚨 Join us in calling for and () to stop prom...,🚨 Join us in calling for and () to stop prom...,join call stop promote fund climate denial pub...,join call stop promote fund climate denial pub...,en


In [5]:
# NLTK stop words
import nltk
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words plus 'amp', the residue of the HTML entity "&amp;" that
# is still present in the lemmatized tweets.
stop_words = [*stopwords.words('english'), 'amp']


# Define functions for stopwords
def remove_stopwords(texts):
    """Tokenize every document and drop the words listed in ``stop_words``.

    Each document is passed through gensim's ``simple_preprocess`` and the
    resulting tokens are filtered against the module-level ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents to process (e.g. a pandas Series of strings). Non-string
        values are converted with ``str``; missing values (``None`` or a
        float NaN) are treated as empty documents.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    # Build the lookup once per call: membership tests on a set are O(1),
    # whereas scanning the stop-word list for every token is linear.
    stop_set = set(stop_words)

    def _tokens(doc):
        # A missing value would otherwise be stringified to 'None' / 'nan'
        # and survive as a bogus token. `doc != doc` is true only for NaN.
        if doc is None or (isinstance(doc, float) and doc != doc):
            return []
        return [word for word in simple_preprocess(str(doc))
                if word not in stop_set]

    return [_tokens(doc) for doc in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amirreza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Look at the lemmatized text column that the stop-word removal takes as input.
print(en_df['lemmatized'])

0                      absolutely self indulgent ego trip
1       join event implement joint credit mechanism jc...
2              huge solar uptake boost goals say minister
3       concrete need accommodate billion people citie...
4       senator amp ennersystams fraud get proper welc...
                              ...                        
5949    climatechange challenge crisis soon catastroph...
5950                     confront coal baron morning fuck
5951    feel simply disgust countries sign decrease st...
5952    join call stop promote fund climate denial pub...
5953    doubt science clear warm degrees cut global em...
Name: lemmatized, Length: 5954, dtype: object


In [7]:
# Tokenize each lemmatized tweet and drop stop words; the resulting token
# lists are stored in a new 'stop_word' column (en_df is modified in place).
en_df['stop_word'] = remove_stopwords(en_df['lemmatized'])

In [8]:
# Spot-check the new 'stop_word' column on the first rows.
en_df.head()

Unnamed: 0,twitter_id,content,removed_links,removed_users,removed_hashtags,removed_av,tokens,lemmatized,lang,stop_word
0,aquitainexox,@Banterm42404691 @MrHarryCole Absolutely one s...,@Banterm42404691 @MrHarryCole Absolutely one s...,Absolutely one self indulgent ego trip!! #COP26,Absolutely one self indulgent ego trip!!,Absolutely one self indulgent ego trip!!,absolutely self indulgent ego trip,absolutely self indulgent ego trip,en,"[absolutely, self, indulgent, ego, trip]"
1,GEC_JCM_Info,"Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a side event ""Implementing ...","Please join us at a side event ""Implementing ...",join event implement joint credit mechanism jc...,join event implement joint credit mechanism jc...,en,"[join, event, implement, joint, credit, mechan..."
2,zpnine,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,"’s huge solar uptake has boosted goals, says ...","’s huge solar uptake has boosted goals, says ...",huge solar uptake boost goals say minister,huge solar uptake boost goals say minister,en,"[huge, solar, uptake, boost, goals, say, minis..."
3,PGDynes,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,concrete need accommodate billion people citie...,concrete need accommodate billion people citie...,en,"[concrete, need, accommodate, billion, people,..."
4,Xplorit,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,Senator &amp; Ennersystams fraud . gets a pro...,Senator &amp; Ennersystams fraud . gets a pro...,senator amp ennersystams fraud get proper welc...,senator amp ennersystams fraud get proper welc...,en,"[senator, ennersystams, fraud, get, proper, we..."


In [9]:
#print(remove_stopwords(en_df['lemmatized']))

## Stemming 

In [10]:
# NLTK's Snowball stemmer configured for English; reused for every token below.
sno = nltk.stem.SnowballStemmer('english')

In [11]:
# Quick sanity check of the stemmer on a single word.
sno.stem('absolutely')

'absolut'

In [12]:
# Stem every token of every tweet with the Snowball stemmer, producing one
# list of stems per row of en_df['stop_word'].
# Iterating over the column's values (instead of looking rows up with
# en_df['stop_word'][i]) keeps this correct even when en_df's index is not the
# default 0..n-1 range, e.g. after rows have been filtered out.
stop_w_all = [[sno.stem(token) for token in tokens]
              for tokens in en_df['stop_word']]

In [13]:
# Inspect the stemmed tokens.
# NOTE(review): this echoes the whole nested list (one entry per tweet);
# consider a slice such as stop_w_all[:5] to keep the saved output small.
stop_w_all

[['absolut', 'self', 'indulg', 'ego', 'trip'],
 ['join',
  'event',
  'implement',
  'joint',
  'credit',
  'mechan',
  'jcm',
  'creat',
  'benefit',
  'stakehold',
  'nov',
  'jst',
  'ishihara',
  'gec',
  'discuss',
  'jcm',
  'sdgs',
  'event',
  'stream',
  'onlin'],
 ['huge', 'solar', 'uptak', 'boost', 'goal', 'say', 'minist'],
 ['concret',
  'need',
  'accommod',
  'billion',
  'peopl',
  'citi',
  'year',
  'blow',
  'carbon',
  'budget'],
 ['senat',
  'ennersystam',
  'fraud',
  'get',
  'proper',
  'welcom',
  'world',
  'famous',
  'obstruct',
  'rube',
  'manchin',
  'greed',
  'threaten',
  'life',
  'earth',
  'investig'],
 ['today',
  'energi',
  'day',
  'end',
  'coal',
  'singl',
  'biggest',
  'contributor',
  'sight',
  'thank',
  'secur',
  'strong',
  'coalit',
  'countri',
  'organis',
  'today',
  'read'],
 ['way', 'futur', 'generat', 'emb', 'work', 'import'],
 ['work',
  'quito',
  'school',
  'produc',
  'excit',
  'new',
  'websit',
  'launch',
  'soon',
  '

In [14]:
# Attach the per-tweet stem lists as a new 'root' column (en_df is modified
# in place).
en_df['root'] = stop_w_all

In [15]:
# Show the frame to compare the 'stop_word' tokens with their stems in 'root'.
en_df

Unnamed: 0,twitter_id,content,removed_links,removed_users,removed_hashtags,removed_av,tokens,lemmatized,lang,stop_word,root
0,aquitainexox,@Banterm42404691 @MrHarryCole Absolutely one s...,@Banterm42404691 @MrHarryCole Absolutely one s...,Absolutely one self indulgent ego trip!! #COP26,Absolutely one self indulgent ego trip!!,Absolutely one self indulgent ego trip!!,absolutely self indulgent ego trip,absolutely self indulgent ego trip,en,"[absolutely, self, indulgent, ego, trip]","[absolut, self, indulg, ego, trip]"
1,GEC_JCM_Info,"Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a #COP26 side event ""Impleme...","Please join us at a side event ""Implementing ...","Please join us at a side event ""Implementing ...",join event implement joint credit mechanism jc...,join event implement joint credit mechanism jc...,en,"[join, event, implement, joint, credit, mechan...","[join, event, implement, joint, credit, mechan..."
2,zpnine,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,#India’s huge solar uptake has boosted #climat...,"’s huge solar uptake has boosted goals, says ...","’s huge solar uptake has boosted goals, says ...",huge solar uptake boost goals say minister,huge solar uptake boost goals say minister,en,"[huge, solar, uptake, boost, goals, say, minis...","[huge, solar, uptak, boost, goal, say, minist]"
3,PGDynes,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,The concrete needed to accommodate 2 billion m...,concrete need accommodate billion people citie...,concrete need accommodate billion people citie...,en,"[concrete, need, accommodate, billion, people,...","[concret, need, accommod, billion, peopl, citi..."
4,Xplorit,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,#WestVirginia Senator &amp; Ennersystams fraud...,Senator &amp; Ennersystams fraud . gets a pro...,Senator &amp; Ennersystams fraud . gets a pro...,senator amp ennersystams fraud get proper welc...,senator amp ennersystams fraud get proper welc...,en,"[senator, ennersystams, fraud, get, proper, we...","[senat, ennersystam, fraud, get, proper, welco..."
...,...,...,...,...,...,...,...,...,...,...,...
5949,AqsaYounasRana,"ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...","ClimateChange is no more only a challenge, it’...",climatechange challenge crisis soon catastroph...,climatechange challenge crisis soon catastroph...,en,"[climatechange, challenge, crisis, soon, catas...","[climatechang, challeng, crisi, soon, catastro..."
5950,pete4peace,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,We confronted the coal baron this morning! Fuc...,confront coal baron morning fuck,confront coal baron morning fuck,en,"[confront, coal, baron, morning, fuck]","[confront, coal, baron, morn, fuck]"
5951,mmill_landscape,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,Feeling simply disgusted by countries such as ...,feel simply disgust countries sign decrease st...,feel simply disgust countries sign decrease st...,en,"[feel, simply, disgust, countries, sign, decre...","[feel, simpli, disgust, countri, sign, decreas..."
5952,KarynCaplan,🚨 Join us in calling for @Google and #Facebook...,🚨 Join us in calling for @Google and #Facebook...,🚨 Join us in calling for and #Facebook () to ...,🚨 Join us in calling for and () to stop prom...,🚨 Join us in calling for and () to stop prom...,join call stop promote fund climate denial pub...,join call stop promote fund climate denial pub...,en,"[join, call, stop, promote, fund, climate, den...","[join, call, stop, promot, fund, climat, denia..."


### Stems (roots) joined into a single string for each tweet

In [16]:
# Join each tweet's stems back into one space-separated string.
# Iterating over the column's values avoids the label-based en_df['root'][i]
# lookup, which only works while en_df keeps the default 0..n-1 index.
root_join_all = [" ".join(stems) for stems in en_df['root']]

In [17]:
# Store the joined stems as a new 'root_string' column (en_df is modified in
# place).
en_df['root_string'] = root_join_all

In [18]:
# Persist the enriched frame; index=False leaves the row index out of the file.
# NOTE(review): 'stop_word' and 'root' hold Python lists, which a CSV can only
# store as text — presumably 'root_string' is the column meant for downstream
# use; confirm before relying on the list columns after reloading.
en_df.to_csv("root_string_df.csv", index=False)