In [1]:
import pandas as pd
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string

# Shorthand for pandas' multi-index slicer.
idx = pd.IndexSlice
# Let displayed frames show up to 100 characters per column before truncating.
pd.set_option('display.max_colwidth', 100)

In [16]:
import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including deprecation warnings raised by
# the code in this notebook. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# NLTK's English stopword list extended with project-specific tokens.
# 'amp' is presumably the residue of the HTML entity "&amp;" once punctuation
# has been stripped from the tweet text.
custom_stopwords = ['amp']
stopwords = [*nltk.corpus.stopwords.words('english'), *custom_stopwords]

In [3]:
# Data directory one level above the working directory. Built with os.path so
# the separator is correct on every OS; a literal "..\data_files" only
# resolves on Windows.
data_fol = os.path.join(os.pardir, "data_files")

In [68]:
# Load the source dataset.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(os.path.join(data_fol, "dataset.pkl"))

In [69]:
# Keep only the tweet text, turn the index levels into ordinary columns and
# discard the `uid` column that results.
text_df = df[['text']].reset_index().drop(columns='uid')
text_df.head()

Unnamed: 0,tid,text
0,1333476068192366593,"Teen pregnancy is high, HIV infection rate is growing fastest among teens, and teens are experim..."
1,1364161232270487553,Even though it was a charity. Stevens thought that speaking to DHSS rather than those suffering...
2,1364161201291153414,SCORA\nStanding Committee on Sexual &amp; Reproductive Health and Rights including HIV&amp;AIDS\...
3,1364161184505737217,"many females are HIV+, we wish you well, blessings, we can't heal you totally, but providing rel..."
4,1363439109948149760,@TheRustler83 Yep. Imagine if the government were demanding HIV tests weekly in every high school.


In [70]:
# WordNet lemmatizer instance used by clean_tweet below.
wn = nltk.WordNetLemmatizer()

In [71]:
def clean_tweet(tweet):
    """Normalise a tweet into a list of lemmatised, stopword-free tokens.

    Code adapted from the LinkedIn Learning class "NLP with Python for Machine
    Learning Essential Training" by Derek Jedamski.

    Steps: delete every ``string.punctuation`` character and lower-case the
    rest, split on runs of non-word characters, discard stopwords and empty
    tokens, then lemmatise what remains with the module-level ``wn``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    # Punctuation is deleted rather than replaced by a space, so "can't"
    # becomes "cant" and a URL collapses into a single token.
    no_punct = "".join(char.lower() for char in tweet if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' when the text starts or ends with a separator; skip
    # those so no empty tokens reach the result.
    return [wn.lemmatize(token) for token in tokens if token and token not in stopwords]

In [72]:
# Tokenise, de-stopword and lemmatise every tweet.
text_df['clean'] = text_df['text'].apply(clean_tweet)

In [73]:
def _extract_tags(tweet, pattern):
    """Return every capture of `pattern` found in the lower-cased, stripped tweet."""
    return re.findall(pattern, tweet.lower().strip())


# Hashtags and @-mentions are stored lower-cased, without their leading symbol.
text_df['hashtags'] = text_df['text'].apply(_extract_tags, args=(r"#(\w+)",))
text_df['hashtags_count'] = text_df['hashtags'].apply(len)
text_df['mentions'] = text_df['text'].apply(_extract_tags, args=(r"@(\w+)",))


In [74]:
# Preview the cleaned tokens next to the extracted hashtags and mentions.
text_df.head()

Unnamed: 0,tid,text,clean,hashtags,hashtags_count,mentions
0,1333476068192366593,"Teen pregnancy is high, HIV infection rate is growing fastest among teens, and teens are experim...","[teen, pregnancy, high, hiv, infection, rate, growing, fastest, among, teen, teen, experimenting...",[],0,[]
1,1364161232270487553,Even though it was a charity. Stevens thought that speaking to DHSS rather than those suffering...,"[even, though, charity, stevens, thought, speaking, dhss, rather, suffering, 1980, 1990, hiv, ha...","[hiv, haemophilia]",2,[bloodinquiry]
2,1364161201291153414,SCORA\nStanding Committee on Sexual &amp; Reproductive Health and Rights including HIV&amp;AIDS\...,"[scora, standing, committee, sexual, reproductive, health, right, including, hivampaids, cimsa, ...",[],0,[]
3,1364161184505737217,"many females are HIV+, we wish you well, blessings, we can't heal you totally, but providing rel...","[many, female, hiv, wish, well, blessing, cant, heal, totally, providing, relief, medicine, salv...",[],0,[]
4,1363439109948149760,@TheRustler83 Yep. Imagine if the government were demanding HIV tests weekly in every high school.,"[therustler83, yep, imagine, government, demanding, hiv, test, weekly, every, high, school]",[],0,[therustler83]


In [80]:
def _normalised_tags(tags):
    """Return the set of tokens that the given hashtags/mentions reduce to under clean_tweet."""
    return {token for tag in tags for token in clean_tweet(tag) if token}


# Build {tid: cleaned text with hashtag and mention tokens removed}.
new_tweets = {}
for tid, tokens, hashtags, mentions in zip(
    text_df['tid'], text_df['clean'], text_df['hashtags'], text_df['mentions']
):
    # Tags go through clean_tweet so they are compared in the same normalised
    # form (punctuation stripped, lemmatised) as the tokens in `clean`.
    blocked = _normalised_tags(hashtags) | _normalised_tags(mentions)
    # Exact token membership, not a substring search against the joined tags:
    # a substring test also drops any token that merely occurs inside a tag
    # (e.g. "hiv" when the hashtag is "hivawareness").
    # `if word` discards empty tokens, which a substring test removes implicitly.
    new_tweets[tid] = ' '.join(word for word in tokens if word and word not in blocked)


In [83]:
# Attach the stripped text by tweet id. assign + map overwrites the column when
# this cell is re-run, whereas repeating a merge would add suffixed
# `clean_no_@#_x` / `clean_no_@#_y` duplicates.
text_df = text_df.assign(**{'clean_no_@#': text_df['tid'].map(new_tweets)})

In [84]:
# Preview the frame with the new `clean_no_@#` column.
text_df.head()

Unnamed: 0,tid,text,clean,hashtags,hashtags_count,mentions,clean_no_@#
0,1333476068192366593,"Teen pregnancy is high, HIV infection rate is growing fastest among teens, and teens are experim...","[teen, pregnancy, high, hiv, infection, rate, growing, fastest, among, teen, teen, experimenting...",[],0,[],teen pregnancy high hiv infection rate growing fastest among teen teen experimenting sex need srh
1,1364161232270487553,Even though it was a charity. Stevens thought that speaking to DHSS rather than those suffering...,"[even, though, charity, stevens, thought, speaking, dhss, rather, suffering, 1980, 1990, hiv, ha...","[hiv, haemophilia]",2,[bloodinquiry],even though charity stevens thought speaking dhss rather suffering 1980 1990 httpstcohzrl1mm0rx
2,1364161201291153414,SCORA\nStanding Committee on Sexual &amp; Reproductive Health and Rights including HIV&amp;AIDS\...,"[scora, standing, committee, sexual, reproductive, health, right, including, hivampaids, cimsa, ...",[],0,[],scora standing committee sexual reproductive health right including hivampaids cimsa empowering ...
3,1364161184505737217,"many females are HIV+, we wish you well, blessings, we can't heal you totally, but providing rel...","[many, female, hiv, wish, well, blessing, cant, heal, totally, providing, relief, medicine, salv...",[],0,[],many female hiv wish well blessing cant heal totally providing relief medicine salve specialty m...
4,1363439109948149760,@TheRustler83 Yep. Imagine if the government were demanding HIV tests weekly in every high school.,"[therustler83, yep, imagine, government, demanding, hiv, test, weekly, every, high, school]",[],0,[therustler83],yep imagine government demanding hiv test weekly every high school


In [85]:
# Persist the text-only frame; it is read back from this path further down.
text_df.to_pickle(os.path.join(data_fol, 'dataset_text_only.pkl'))

In [None]:
# Reload the saved text frame and the `dataset_no_outliers_engagement.pkl` frame.
text_df, clean_df = (
    pd.read_pickle(os.path.join(data_fol, fname))
    for fname in ('dataset_text_only.pkl', 'dataset_no_outliers_engagement.pkl')
)

In [None]:
# Keep only the tweets whose tid also appears in the `tid` level of clean_df's index.
keepers = list(clean_df.index.get_level_values('tid'))
is_keeper = text_df['tid'].isin(keepers)
new_text_df = text_df[is_keeper]

In [None]:
# Persist the filtered text frame.
new_text_df.to_pickle(os.path.join(data_fol, 'dataset_text_only_no_outliers.pkl'))