In [45]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus; the call reports "already up-to-date" when
# the package is present locally.
nltk.download('stopwords')
# Keep the English stop words in a set so membership checks during text
# cleaning are cheap.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:

# Load the Coachella 2015 tweet export, decoding the file as Latin-1.
# NOTE(review): the preview of the 'text' column shows garbled non-ASCII
# sequences, which may mean the file is not really Latin-1 -- confirm the
# source encoding.
df = pd.read_csv('Coachella-2015-2-DFE.csv', encoding='latin1')
# Show the first ten rows as plain text.
print(df.head(10))



  coachella_sentiment coachella_yn             name  retweet_count  \
0            positive          yes         kokombil              0   
1            positive          yes    MisssTaraaa10              2   
2            positive          yes    NMcCracken805              0   
3            positive          yes           wxpnfm              1   
4            positive          yes         Caesears              0   
5            positive          yes       donbosco_6              0   
6            positive          yes        __Mia630_              0   
7            positive          yes      OlgaKhmylev              0   
8            positive          yes  Ben_BigB_Davies              0   
9            positive          yes          brevamo             83   

                                                text tweet_coord  \
0  #Coachella2015 tickets selling out in less tha...  [0.0, 0.0]   
1  RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...         NaN   
2  #Coachella2015 #VIP pa

In [47]:
def extract_hashtags(text):
    """Return the hashtags found in ``text``, without the leading ``#``.

    Args:
        text: Value to scan. Only ``str`` values are searched.

    Returns:
        A list with the word characters following each ``#``, in order of
        appearance. Non-string input (for example a missing value) gives
        an empty list.
    """
    if not isinstance(text, str):
        return []
    hashtag_pattern = r"#(\w+)"
    return re.findall(hashtag_pattern, text)



In [48]:
def extract_emails(text):
    """Return the e-mail addresses found in ``text``.

    The pattern requires a local part, an ``@`` and a dotted domain ending in
    an alphabetic top-level label. Compared with a bare ``\\S+@\\S+`` match this
    keeps surrounding punctuation out of the result ("a@b.com," -> "a@b.com")
    and does not report dot-prefixed mentions such as ".@user" as addresses.

    Args:
        text: Value to scan. Only ``str`` values are searched.

    Returns:
        A list of matched addresses in order of appearance. Non-string input
        (for example a missing value) gives an empty list.
    """
    if not isinstance(text, str):
        return []
    email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
    return re.findall(email_pattern, text)



In [49]:
# Punctuation characters removed by clean_text (note that '^' is not included).
_PUNCT_CHARS = ".,!@?#$%&*()+=-_{}[];:'\"/\\|<>`~"


def clean_text(text, stopword_set=None):
    """Normalise a tweet into lower-case, space-separated content words.

    Steps, in order:
      1. drop hashtags and any whitespace-delimited token containing ``@``
         with text on both sides (e-mail-like tokens);
      2. drop remaining ``@mention`` handles;
      3. drop tokens starting with ``http`` (this also covers ``https``);
      4. drop non-ASCII characters and digits;
      5. lower-case, split on whitespace, and for every token remove
         punctuation and discard stop words.

    Stop words are checked twice per token: once after trimming punctuation
    from the token's edges only, so contractions that are stop words
    (e.g. "don't") are recognised while their apostrophe is still present,
    and once more after all punctuation has been removed.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.
        stopword_set: Optional collection of lower-case stop words. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single space-joined string (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    stops = stop_words if stopword_set is None else stopword_set

    text = re.sub(r"#(\w+)|\S+@\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = ''.join(c for c in text if ord(c) < 128)
    text = re.sub(r"[0-9]", "", text)

    # Character class built from the same characters used for edge trimming,
    # so both operations always agree on what counts as punctuation.
    punct_pattern = "[" + re.escape(_PUNCT_CHARS) + "]"

    tokens = []
    for raw in text.lower().split():
        word = raw.strip(_PUNCT_CHARS)
        if word in stops:
            # Catches stop words with inner punctuation, such as "don't".
            continue
        word = re.sub(punct_pattern, "", word)
        # Tokens made only of punctuation collapse to "" and are skipped.
        if word and word not in stops:
            tokens.append(word)
    return " ".join(tokens)



In [50]:
# Four independent, initially empty result lists.
hashtags_list, emails_list, cleaned_text_list, final_text_list = [], [], [], []


In [51]:
# Build every result list from the 'text' column. Each name is bound to a
# fresh list (rather than appended to), so re-running this cell cannot
# duplicate entries and the lists always hold exactly one item per row of df.
hashtags_list = [extract_hashtags(text) for text in df['text']]
emails_list = [extract_emails(text) for text in df['text']]
cleaned_text_list = [clean_text(text) for text in df['text']]
# The final text is the cleaned text; keep it as its own list object.
final_text_list = list(cleaned_text_list)




In [52]:
# Attach the per-row results to df as new columns. Each list must contain
# exactly one entry per row of df, in row order, or the assignment fails.
df['hashtags'] = hashtags_list
df['emails'] = emails_list
df['clean_text'] = cleaned_text_list
# final_text holds the same cleaned strings as clean_text.
df['final_text'] = final_text_list



In [53]:
# Write only the final_text column to CSV, without the index, encoded as UTF-8.
df[['final_text']].to_csv("coachella_final_text.csv", index=False, encoding="utf-8")

# Confirmation message once the export has finished.
print(" Cleaning complete. Saved to 'coachella_final_text.csv'.")

 Cleaning complete. Saved to 'coachella_final_text.csv'.
