The process in this notebook, as well as many of the functions, were taken/adapted from Derek Jedamski's course on LinkedIn Learning found here: https://www.linkedin.com/learning/nlp-with-python-for-machine-learning-essential-training

In [1]:
import os
import nltk
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
wn = nltk.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
pd.set_option('display.max_colwidth', 100)
idx = pd.IndexSlice

In [3]:
df = pd.read_pickle("../data/compiled/full_dataset_cleaned.pkl")
tdf = df.loc[idx[:,],['full_text']]
tdf = tdf.reset_index().drop(columns='uid').set_index('tid')
tdf.head()

Unnamed: 0_level_0,full_text
tid,Unnamed: 1_level_1
1333476068192366593,"Teen pregnancy is high, HIV infection rate is growing fastest among teens, and teens are experim..."
1364161232270487553,Even though it was a charity. Stevens thought that speaking to DHSS rather than those suffering...
1364161201291153414,SCORA\nStanding Committee on Sexual &amp; Reproductive Health and Rights including HIV&amp;AIDS\...
1364161184505737217,"many females are HIV+, we wish you well, blessings, we can't heal you totally, but providing rel..."
1363439109948149760,@TheRustler83 Yep. Imagine if the government were demanding HIV tests weekly in every high school.


In [4]:
def remove_things(list_to_clean, to_remove):
    new_list = [thing for thing in list_to_clean if thing not in to_remove]
    return new_list

In [21]:
def clean_text(text):
    """code adpated from LinkedIn Learning class NLP with Python for Machine Learning Essential Training by Derek Jedamski"""
    
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]   
    return text


In [None]:
def clean_text_ngrams(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return text

In [21]:
# tdf['cleaned'] = tdf['full_text'].apply(lambda x: clean_text(x.lower()))

In [6]:
cv = CountVectorizer(analyzer=clean_text)

In [7]:
X = cv.fit_transform(tdf['full_text'])

In [8]:
X.shape

(7662, 32606)

In [9]:
# cv.get_feature_names()

In [11]:
X_df = pd.DataFrame(X.toarray())
X_df.columns = cv.get_feature_names()

In [12]:
X_df.head()

Unnamed: 0,Unnamed: 1,0,000,0008,001tea,008WORLD,00s,010,0121,01482,...,デュエマ,养皋ｄ,ｐｒ,ﾉ,𝐐𝐔𝐄𝐄𝐍ᵇᵍº,𝑱𝒐𝒌𝒆𝒔,𝗡𝗢𝗪,𝗥𝗘𝗚𝗜𝗦𝗧𝗘𝗥,𝘽𝙍𝙀𝘼𝙆𝙄𝙉𝙂,𝙊𝙏𝘾
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(data['cleaned_text'])

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
tdf['cleaned'] = tdf['full_text'].apply(lambda x: clean_text(x.lower()))

In [23]:
tdf.head()

Unnamed: 0_level_0,full_text,cleaned
tid,Unnamed: 1_level_1,Unnamed: 2_level_1
1333476068192366593,"Teen pregnancy is high, HIV infection rate is growing fastest among teens, and teens are experim...","[teen, pregnancy, high, hiv, infection, rate, growing, fastest, among, teen, teen, experimenting..."
1364161232270487553,Even though it was a charity. Stevens thought that speaking to DHSS rather than those suffering...,"[even, though, charity, stevens, thought, speaking, dhss, rather, suffering, 1980, 1990, hiv, ha..."
1364161201291153414,SCORA\nStanding Committee on Sexual &amp; Reproductive Health and Rights including HIV&amp;AIDS\...,"[scora, standing, committee, sexual, amp, reproductive, health, right, including, hivampaids, ci..."
1364161184505737217,"many females are HIV+, we wish you well, blessings, we can't heal you totally, but providing rel...","[many, female, hiv, wish, well, blessing, cant, heal, totally, providing, relief, medicine, salv..."
1363439109948149760,@TheRustler83 Yep. Imagine if the government were demanding HIV tests weekly in every high school.,"[therustler83, yep, imagine, government, demanding, hiv, test, weekly, every, high, school]"


In [24]:

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(tdf['full_text'])

In [25]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names()
X_tfidf_df.head()

Unnamed: 0,Unnamed: 1,0,000,0008,001tea,008WORLD,00s,010,0121,01482,...,デュエマ,养皋ｄ,ｐｒ,ﾉ,𝐐𝐔𝐄𝐄𝐍ᵇᵍº,𝑱𝒐𝒌𝒆𝒔,𝗡𝗢𝗪,𝗥𝗘𝗚𝗜𝗦𝗧𝗘𝗥,𝘽𝙍𝙀𝘼𝙆𝙄𝙉𝙂,𝙊𝙏𝘾
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
