# Coachella Tweets Cleaning and Extraction

This notebook cleans tweets related to the Coachella music festival by:
- Extracting hashtags and emails
- Removing usernames, links, non-ASCII characters, digits, stopwords, and special characters
- Producing a cleaned version of tweet text


In [23]:
import numpy as np
import pandas as pd
import nltk
import re
import string


In [24]:
from nltk.corpus import stopwords

# Download stopwords the first time you run this
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruzgh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
df = pd.read_csv('Coachella-2015-2-DFE.csv', encoding='latin-1')

df


Unnamed: 0,coachella_sentiment,coachella_yn,name,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,positive,yes,kokombil,0,#Coachella2015 tickets selling out in less tha...,"[0.0, 0.0]",1/7/15 15:02,5.529630e+17,,Quito
1,positive,yes,MisssTaraaa10,2,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,,1/7/15 15:02,5.529630e+17,united states,
2,positive,yes,NMcCracken805,0,#Coachella2015 #VIP passes secured! See you th...,,1/7/15 15:01,5.529630e+17,"Costa Mesa, CA",
3,positive,yes,wxpnfm,1,PhillyÛªs @warondrugsjams will play #Coachell...,,1/7/15 15:01,5.529630e+17,"Philadelphia, PA and Worldwide",Quito
4,positive,yes,Caesears,0,If briana and her mom out to #Coachella2015 i...,,1/7/15 15:00,5.529630e+17,,
...,...,...,...,...,...,...,...,...,...,...
3841,cant tell,yes,MissXOverdose,0,Excuse me while I go cry now. _Ù÷¢ #Coachella2...,,1/6/15 10:32,5.525330e+17,atx,Pacific Time (US & Canada)
3842,cant tell,yes,NedRaggett,1,RT @touchofallright: Ride and Steely Dan. It w...,,1/6/15 10:28,5.525320e+17,"Costa Mesa, CA",Pacific Time (US & Canada)
3843,cant tell,yes,nicolejackieee,0,I've been callin that since day one #Coachella...,,1/6/15 10:26,5.525320e+17,,
3844,cant tell,yes,NiqueWobbitz,2,Is this for real?!?! #Coachella2015 http://t.c...,,1/6/15 10:21,5.525300e+17,Huntington Beach X Long Beach,Pacific Time (US & Canada)


In [26]:
# Replace NaN values in tweet_text with empty strings
df['text'] = df['text'].fillna("")

In [27]:
# Extract hashtags from tweets
df['hashtags'] = df['text'].apply(lambda x: re.findall(r"#\w+", str(x)))
df[['text', 'hashtags']].head()

Unnamed: 0,text,hashtags
0,#Coachella2015 tickets selling out in less tha...,[#Coachella2015]
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,[#Coachella2015]
2,#Coachella2015 #VIP passes secured! See you th...,"[#Coachella2015, #VIP]"
3,PhillyÛªs @warondrugsjams will play #Coachell...,"[#Coachella2015, #GovBall2015]"
4,If briana and her mom out to #Coachella2015 i...,[#Coachella2015]


In [28]:
# Extract emails from tweets
df['emails'] = df['text'].apply(lambda x: re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", str(x)))
df[['text', 'emails']].head()

Unnamed: 0,text,emails
0,#Coachella2015 tickets selling out in less tha...,[]
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,[]
2,#Coachella2015 #VIP passes secured! See you th...,[]
3,PhillyÛªs @warondrugsjams will play #Coachell...,[]
4,If briana and her mom out to #Coachella2015 i...,[]


In [29]:
# Remove usernames like @coachella
def remove_usernames(text):
    return re.sub(r"@\w+", "", str(text))

# Remove links starting with http or https
def remove_links(text):
    return re.sub(r"http\S+|www\S+", "", str(text))

# Remove non-ASCII characters like emojis
def remove_non_ascii_symbols(text):
    return text.encode("ascii", "ignore").decode()

# Convert text to lowercase
def to_lower(text):
    return str(text).lower()

# Remove stop words
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

# Remove digits
def remove_digits(text):
    return re.sub(r"\d+", "", str(text))

# Remove punctuation and special characters
def remove_special_characters(text):
    return re.sub(r"[^\w\s]", "", str(text))

In [30]:
df['cleaned_tweet'] = (
    df['text']
    .apply(remove_usernames)
    .apply(remove_links)
    .apply(remove_non_ascii_symbols)
    .apply(to_lower)
    .apply(remove_digits)
    .apply(remove_special_characters)
    .apply(remove_stop_words)
)

df[['text', 'cleaned_tweet']].head()

Unnamed: 0,text,cleaned_tweet
0,#Coachella2015 tickets selling out in less tha...,coachella tickets selling less minutes _______...
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,rt wait absolute fire ___ coachella
2,#Coachella2015 #VIP passes secured! See you th...,coachella vip passes secured see bitchesssss
3,PhillyÛªs @warondrugsjams will play #Coachell...,phillys play coachella amp govball watch jimmy...
4,If briana and her mom out to #Coachella2015 i...,briana mom coachella im ____


In [31]:
# Compare raw vs cleaned text for first 5 rows
comparison = df[['text', 'cleaned_tweet']]
comparison.head(10)

Unnamed: 0,text,cleaned_tweet
0,#Coachella2015 tickets selling out in less tha...,coachella tickets selling less minutes _______...
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,rt wait absolute fire ___ coachella
2,#Coachella2015 #VIP passes secured! See you th...,coachella vip passes secured see bitchesssss
3,PhillyÛªs @warondrugsjams will play #Coachell...,phillys play coachella amp govball watch jimmy...
4,If briana and her mom out to #Coachella2015 i...,briana mom coachella im ____
5,West side is the best side!\n#west #coas #Coac...,west side best side west coas coachella
6,Coachella tickets are now sold out _Ù÷_ &amp; ...,coachella tickets sold __ amp opportunity boug...
7,#Coachella2015 I absolutely can NOT wait. This...,coachella absolutely wait weekend exceed epic ...
8,If someone got me to Coachella if be your frie...,someone got coachella friend life truth despra...
9,RT @brownjenjen:  Õ http://t.co/mxCREvIlGP 71...,rt coachella coachella makes space rockers rav...


In [32]:
# Save to a new CSV file
df.to_csv("coachella_tweets_cleaned.csv", index=False)

In [33]:
# Summary of extracted hashtags and emails
print("Top 10 hashtags:")
print(df['hashtags'].explode().value_counts().head(10))

print("\nTotal tweets containing emails:", df['emails'].apply(len).gt(0).sum())

Top 10 hashtags:
hashtags
#Coachella2015      3458
#coachella2015       346
#Coachella           194
#coachella            93
#coachellalineup      61
#ACDC                 29
#Drake                28
#music                25
#COACHELLA2015        24
#lineup               18
Name: count, dtype: int64

Total tweets containing emails: 0
