# **Coachella Tweets Data Cleaning and Extraction**

**1. Upload CSV and create DataFrame**

In [1]:
from google.colab import files

uploaded = files.upload()

import pandas as pd

df = pd.read_csv("Coachella-2015-2-DFE.csv", encoding="latin1")

print(df.head())

Saving Coachella-2015-2-DFE.csv to Coachella-2015-2-DFE.csv
  coachella_sentiment coachella_yn           name  retweet_count  \
0            positive          yes       kokombil              0   
1            positive          yes  MisssTaraaa10              2   
2            positive          yes  NMcCracken805              0   
3            positive          yes         wxpnfm              1   
4            positive          yes       Caesears              0   

                                                text tweet_coord  \
0  #Coachella2015 tickets selling out in less tha...  [0.0, 0.0]   
1  RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...         NaN   
2  #Coachella2015 #VIP passes secured! See you th...         NaN   
3  PhillyÛªs @warondrugsjams will play #Coachell...         NaN   
4  If briana and her mom out to #Coachella2015  i...         NaN   

  tweet_created      tweet_id                  tweet_location user_timezone  
0  1/7/15 15:02  5.529630e+17               

**2. Extract hashtags from tweets**

In [9]:
import re

hashtags_pattern = r"#\w+"

df['hashtags'] = df['text'].apply(lambda x: re.findall(hashtags_pattern, x))
df[['text', 'hashtags']].head()

Unnamed: 0,text,hashtags
0,#Coachella2015 tickets selling out in less tha...,[#Coachella2015]
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,[#Coachella2015]
2,#Coachella2015 #VIP passes secured! See you th...,"[#Coachella2015, #VIP]"
3,PhillyÛªs @warondrugsjams will play #Coachell...,"[#Coachella2015, #GovBall2015]"
4,If briana and her mom out to #Coachella2015 i...,[#Coachella2015]


**3. Extract emails from tweets**

In [10]:
emails_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

df['emails'] = df['text'].apply(lambda x: re.findall(emails_pattern, x))
df[['text', 'emails']].head()

Unnamed: 0,text,emails
0,#Coachella2015 tickets selling out in less tha...,[]
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,[]
2,#Coachella2015 #VIP passes secured! See you th...,[]
3,PhillyÛªs @warondrugsjams will play #Coachell...,[]
4,If briana and her mom out to #Coachella2015 i...,[]


**4. Remove usernames (@…)**

In [11]:
def remove_usernames(text):
    return re.sub(r'@\w+', '', text)

df['cleaned_tweet_text'] = df['text'].apply(remove_usernames)

print(df['cleaned_tweet_text'])


0       #Coachella2015 tickets selling out in less tha...
1       RT : WAIT THIS IS ABSOLUTE FIRE _ÙÓ´_ÙÓ´_ÙÓ´ #...
2       #Coachella2015 #VIP passes secured! See you th...
3       PhillyÛªs  will play #Coachella2015 &amp; #Go...
4       If briana and her mom out to #Coachella2015  i...
                              ...                        
3841    Excuse me while I go cry now. _Ù÷¢ #Coachella2...
3842    RT : Ride and Steely Dan. It would only be bet...
3843    I've been callin that since day one #Coachella...
3844    Is this for real?!?! #Coachella2015 http://t.c...
3845    RT : C'monnnnnn lineup! #coachella2015 #Impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


**5. Remove links**

In [12]:
def remove_links(text):
    return re.sub(r"(http|https)://\S+", '', text)

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_links)

print(df['cleaned_tweet_text'])

0       #Coachella2015 tickets selling out in less tha...
1       RT : WAIT THIS IS ABSOLUTE FIRE _ÙÓ´_ÙÓ´_ÙÓ´ #...
2       #Coachella2015 #VIP passes secured! See you th...
3       PhillyÛªs  will play #Coachella2015 &amp; #Go...
4       If briana and her mom out to #Coachella2015  i...
                              ...                        
3841    Excuse me while I go cry now. _Ù÷¢ #Coachella2...
3842    RT : Ride and Steely Dan. It would only be bet...
3843    I've been callin that since day one #Coachella...
3844                 Is this for real?!?! #Coachella2015 
3845    RT : C'monnnnnn lineup! #coachella2015 #Impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


**6. Remove non-ASCII symbols**

In [13]:
def remove_non_ascii_symbols(text):
    return text.encode('ascii', errors='ignore').decode()

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_non_ascii_symbols)

print(df['cleaned_tweet_text'])

0       #Coachella2015 tickets selling out in less tha...
1       RT : WAIT THIS IS ABSOLUTE FIRE ___ #Coachella...
2       #Coachella2015 #VIP passes secured! See you th...
3       Phillys  will play #Coachella2015 &amp; #GovBa...
4       If briana and her mom out to #Coachella2015  i...
                              ...                        
3841      Excuse me while I go cry now. _ #Coachella2015 
3842    RT : Ride and Steely Dan. It would only be bet...
3843    I've been callin that since day one #Coachella...
3844                 Is this for real?!?! #Coachella2015 
3845    RT : C'monnnnnn lineup! #coachella2015 #Impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


**7. Convert to lower case**

In [14]:
def to_lower(text):
  return text.lower()

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(to_lower)

print(df['cleaned_tweet_text'])

0       #coachella2015 tickets selling out in less tha...
1       rt : wait this is absolute fire ___ #coachella...
2       #coachella2015 #vip passes secured! see you th...
3       phillys  will play #coachella2015 &amp; #govba...
4       if briana and her mom out to #coachella2015  i...
                              ...                        
3841      excuse me while i go cry now. _ #coachella2015 
3842    rt : ride and steely dan. it would only be bet...
3843    i've been callin that since day one #coachella...
3844                 is this for real?!?! #coachella2015 
3845    rt : c'monnnnnn lineup! #coachella2015 #impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


**8. Remove stop words**

In [15]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return " ".join(filtered_words)

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_stop_words)

print(df['cleaned_tweet_text'])

0       #coachella2015 tickets selling less 40 minutes...
1              rt : wait absolute fire ___ #coachella2015
2       #coachella2015 #vip passes secured! see bitche...
3       phillys play #coachella2015 &amp; #govball2015...
4                   briana mom #coachella2015 im !!! ____
                              ...                        
3841                  excuse go cry now. _ #coachella2015
3842    rt : ride steely dan. would better supergroup....
3843                  callin since day one #coachella2015
3844                              real?!?! #coachella2015
3845    rt : c'monnnnnn lineup! #coachella2015 #impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**9. Remove digits**

In [16]:
def remove_digits(text):
  return re.sub(r'\d+', '', text)

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_digits)

print(df['cleaned_tweet_text'])

0       #coachella tickets selling less  minutes _____...
1                  rt : wait absolute fire ___ #coachella
2         #coachella #vip passes secured! see bitchesssss
3       phillys play #coachella &amp; #govball! watch ...
4                       briana mom #coachella im !!! ____
                              ...                        
3841                      excuse go cry now. _ #coachella
3842    rt : ride steely dan. would better supergroup....
3843                      callin since day one #coachella
3844                                  real?!?! #coachella
3845        rt : c'monnnnnn lineup! #coachella #impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


**10. Remove special characters**

In [17]:
def remove_special_characters(text):
  return re.sub(r'[^\w\s]', '', text)

df['cleaned_tweet_text'] = df['cleaned_tweet_text'].apply(remove_special_characters)

print(df['cleaned_tweet_text'])

0       coachella tickets selling less  minutes ______...
1                    rt  wait absolute fire ___ coachella
2            coachella vip passes secured see bitchesssss
3       phillys play coachella amp govball watch jimmy...
4                           briana mom coachella im  ____
                              ...                        
3841                        excuse go cry now _ coachella
3842    rt  ride steely dan would better supergroup co...
3843                       callin since day one coachella
3844                                       real coachella
3845             rt  cmonnnnnn lineup coachella impatient
Name: cleaned_tweet_text, Length: 3846, dtype: object


In [18]:
df[['text', 'cleaned_tweet_text']].head(10)

Unnamed: 0,text,cleaned_tweet_text
0,#Coachella2015 tickets selling out in less tha...,coachella tickets selling less minutes ______...
1,RT @sudsybuddy: WAIT THIS IS ABSOLUTE FIRE _ÙÓ...,rt wait absolute fire ___ coachella
2,#Coachella2015 #VIP passes secured! See you th...,coachella vip passes secured see bitchesssss
3,PhillyÛªs @warondrugsjams will play #Coachell...,phillys play coachella amp govball watch jimmy...
4,If briana and her mom out to #Coachella2015 i...,briana mom coachella im ____
5,West side is the best side!\n#west #coas #Coac...,west side best side west coas coachella
6,Coachella tickets are now sold out _Ù÷_ &amp; ...,coachella tickets sold __ amp opportunity boug...
7,#Coachella2015 I absolutely can NOT wait. This...,coachella absolutely wait weekend exceed epic ...
8,If someone got me to Coachella if be your frie...,someone got coachella friend life truth despra...
9,RT @brownjenjen:  Õ http://t.co/mxCREvIlGP 71...,rt coachella coachella makes space rockers...


In [19]:
df['cleaned_tweet_text'] = df['text'] \
    .apply(remove_usernames) \
    .apply(remove_links) \
    .apply(remove_non_ascii_symbols) \
    .apply(to_lower) \
    .apply(remove_stop_words) \
    .apply(remove_digits) \
    .apply(remove_special_characters)

**11. Save cleaned CSV**

In [21]:
df[['cleaned_tweet_text', 'hashtags', 'emails']].to_csv('cleaned_coachella_tweets.csv', index=False)