In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../Data/test_tweets_anuFYb8.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [5]:
df.sample(5)

Unnamed: 0,id,tweet
3278,35241,abiball 2016 ðð»ðð #abiball #abitur...
14778,46741,looking forward to the @user conference &amp; ...
13371,45334,mister @user should we #makeitawkward for and...
9806,41769,guess it is to much to stop the man/father bas...
7509,39472,i am so to watch #mitb (money in the bank) #...


In [6]:
tweets_df = df[['tweet']].copy()

In [7]:
tweets_df.head(5)

Unnamed: 0,tweet
0,#studiolife #aislife #requires #passion #dedic...
1,@user #white #supremacists want everyone to s...
2,safe ways to heal your #acne!! #altwaystohe...
3,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew..."


In [8]:
import re

def clean_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)         # Remove URLs
    text = re.sub(r'@\w+', '', text)            # Remove @mentions
    text = re.sub(r'#', '', text)               # Remove hashtag symbol
    text = re.sub(r'[^\w\s]', '', text)         # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize whitespace
    return text


In [9]:
tweets_df['tweet'] = tweets_df['tweet'].apply(clean_tweet)

In [10]:
tweets_df['tweet'][17193]

'feeling like a mermaid ð hairflip neverready formal wedding gown dresses mermaid â'

### Normalize Unicode (fix weird characters like ð, â)

In [11]:
import unicodedata

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

In [12]:
tweets_df['tweet'] = tweets_df['tweet'].apply(normalize_unicode)

In [13]:
tweets_df.sample(10)

Unnamed: 0,tweet
10832,will b getting my license yayayayayayay 18
5953,best essentialoils for anxiety healthy peace a...
7283,isle of wight is creeping up on us schooljourn...
1002,find a workout routine that gets you this amp ...
7728,i know goner is a a song but its actually my f...
9217,use the power of your mind to heal your body a...
4136,the winery in jasper georgia goodhealth goodti...
15065,this mornings happy socks its the only way to ...
8164,july 23 i see mr luke bryan lovehimforreal luk...
6216,image via we hea it restinpeace rip triste


### Correcting space problem

In [27]:
%pip install wordninja

Note: you may need to restart the kernel to use updated packages.


In [28]:
import wordninja

In [29]:
def fix_spacing(text):
    return ' '.join(wordninja.split(text))

In [30]:
tweets_df['tweet'] = tweets_df['tweet'].apply(fix_spacing)

### Removing stop words

In [31]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [33]:
tweets_df['tweet'] = tweets_df['tweet'].apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word not in stop_words]))

In [35]:
tweets_df.sample(5)

Unnamed: 0,tweet
8847,love saturday feeling weekend love play guitar
15655,happy wednesday hump day wednesday midweek day...
3515,petal polar bear climb racing angry polar bear...
16201,late ff game dev died ev indie game dev squad
13888,offered unconditional offer fda business manag...


### Expanding slangs

In [36]:
slang_dict = {
    "u": "you",
    "ur": "your",
    "bc": "because",
    "b4": "before",
    "idk": "i don't know",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "btw": "by the way",
    "gr8": "great",
    "pls": "please",
    "thx": "thanks",
    "im": "i am",
    "dont": "do not",
    "wanna": "want to",
    "gonna": "going to"
}

In [39]:
def expand_slang(text):
    words = text.split()
    expanded_words = []

    for word in words:
        if word in slang_dict:
            expanded_words.append(slang_dict[word])
        else:
            expanded_words.append(word)
    
    return ' '.join(expanded_words)

In [40]:
tweets_df['tweet'].apply(expand_slang)

0        studio life life requires passion dedication w...
1        white supremacists want everyone see new birds...
2        safe ways heal acne alt ways heal healthy healing
3        hp cursed child book reservations already yes ...
4        3 rd bih day amazing hilarious nephew eli ah m...
                               ...                        
17192    thought factory left right polarisation trump ...
17193    feeling like mermaid hair flip never ready for...
17194    hillary campaigned today ohio oh my god amp us...
17195    happy work conference right mindset leads cult...
17196    song glad free download shoe gaze new music ne...
Name: tweet, Length: 17197, dtype: object