In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../Data/test_tweets_anuFYb8.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      484 non-null    int64 
 1   tweet   484 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.7+ KB


In [4]:
df.sample(5)

Unnamed: 0,id,tweet
363,32326,@user @user oh.my.word! i don't think she's a...
412,32375,i am thankful for love. #thankful #positive
106,32069,l o v e #sky #photoofday #moments #cool #ins...
169,32132,about to watch these movies alone
236,32199,we are nearly ready for the off!!! #nohampto...


In [5]:
tweets_df = df[['tweet']].copy()

In [6]:
tweets_df.head(5)

Unnamed: 0,tweet
0,#studiolife #aislife #requires #passion #dedic...
1,@user #white #supremacists want everyone to s...
2,safe ways to heal your #acne!! #altwaystohe...
3,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew..."


In [7]:
import re

def clean_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)         # Remove URLs
    text = re.sub(r'@\w+', '', text)            # Remove @mentions
    text = re.sub(r'#', '', text)               # Remove hashtag symbol
    text = re.sub(r'[^\w\s]', '', text)         # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize whitespace
    return text


In [8]:
tweets_df['tweet'] = tweets_df['tweet'].apply(clean_tweet)

In [9]:
# Access the tweet at position 0 (first row) as an example
tweets_df['tweet'].iloc[0]

'studiolife aislife requires passion dedication willpower to find newmaterialsâ'

### Normalize Unicode (fix weird characters like ð, â)

In [10]:
import unicodedata

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

In [11]:
tweets_df['tweet'] = tweets_df['tweet'].apply(normalize_unicode)

In [12]:
tweets_df.sample(10)

Unnamed: 0,tweet
322,you are sleeping and i crying
383,think happy stay happy
47,morning sunshine smile sexy bigboobs
6,something inside me dies a eyes ness smokeyeye...
91,life right now is amazing successful positive
458,few things worse than putting words into the m...
411,model i love u take with u all the time in ur
368,the last formteachers lesson byebye8year
303,character customization confirmed pokemonsunmo...
261,it could be worse embarrassed unfounate trauma...


### Correcting space problem

In [13]:
import wordninja

In [14]:
def fix_spacing(text):
    return ' '.join(wordninja.split(text))

In [15]:
tweets_df['tweet'] = tweets_df['tweet'].apply(fix_spacing)

### Expanding slangs

In [16]:
slang_dict = {
    "u": "you",
    "ur": "your",
    "bc": "because",
    "b4": "before",
    "idk": "i don't know",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "btw": "by the way",
    "gr8": "great",
    "pls": "please",
    "thx": "thanks",
    "im": "i am",
    "dont": "do not",
    "wanna": "want to",
    "gonna": "going to",
    "bt": "but",
    "smh": "shaking my head",
    "tbh": "to be honest",
    "ikr": "i know right",
    "lmao": "laughing my ass off",
    "rofl": "rolling on the floor laughing",
    "omw": "on my way",
    "afaik": "as far as i know",
    "brb": "be right back",
    "bff": "best friends forever",
    "fyi": "for your information",
    "ftw": "for the win",
    "imo": "in my opinion",
    "dm": "direct message",
    "irl": "in real life",
    "nsfw": "not safe for work",
    "jk": "just kidding",
    "np": "no problem",
    "rn": "right now",
    "tho": "though",
    "ya": "yeah",
    "yolo": "you only live once"
}

In [17]:
def expand_slang(text):
    words = text.split()
    expanded_words = []

    for word in words:
        if word in slang_dict:
            expanded_words.append(slang_dict[word])
        else:
            expanded_words.append(word)
    
    return ' '.join(expanded_words)

In [18]:
tweets_df['tweet'].apply(expand_slang)

0      studio life a is life requires passion dedicat...
1      white supremacists want everyone to see the ne...
2      safe ways to heal your acne alt ways to heal h...
3      is the hp and the cursed child book up for res...
4      3 rd bih day to my amazing hilarious nephew el...
                             ...                        
479      it is amazing what a day has surprises movement
480    caffeine fix a quick cup of coffee positiv it ...
481          green james mca voy atonement mca voy f lim
482    pass last exam made new lashes thanks to studi...
483                                   easter my new hero
Name: tweet, Length: 484, dtype: object

In [19]:
# missing values
tweets_df.isnull().sum()

tweet    0
dtype: int64

In [20]:

# Check for duplicate values
tweets_df.duplicated().sum()

9

In [21]:
tweets_df = tweets_df.drop_duplicates( keep='first')

In [22]:
tweets_df.sample(5)

Unnamed: 0,tweet
137,oh there is tons of stuff its like they wont d...
335,its so dark
29,tried that but nothing will try again know you...
161,phil spencer criticizes sony of course what el...
70,lip o light helped shape her and it can help s...


## Text preprocessing

In [23]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [24]:
# For stemming
from nltk.stem import PorterStemmer,WordNetLemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [25]:
def transform_text(text):
    text = text.lower()  # Lowercase
    tokens = nltk.word_tokenize(text)  # Tokenization

    y = []
    for word in tokens:
        if word.isalnum():  # Keep only alphanumeric tokens
            y.append(word)

    filtered = []
    for word in y:
        if word not in stopwords.words('english') and word not in string.punctuation:
            filtered.append(word)

    processed = []
    for word in filtered:
        lemma = lemmatizer.lemmatize(word)     # Lemmatization
        stemmed = ps.stem(lemma)               # Stemming
        processed.append(stemmed)

    return processed  # Return final list

In [26]:

tweets_df['tweet'] = tweets_df['tweet'].apply(transform_text)

In [27]:
tweets_df.sample(10)

Unnamed: 0,tweet
158,"[day, best]"
422,"[easter, meanwhil, white, hous]"
145,"[4, u, non, hockey, peopl, hockey, babe, ruth,..."
211,"[monday, insta, gram, insta, grammer, motor, p..."
256,"[rest, peac, christina, rip, rip, christina, c..."
200,"[draft, 1, b, 4, pick, uni, liverpool, recognit]"
174,"[sterl, attack, bull, chase, leav, lot, despit..."
441,"[chatter, girl, guid, cho, cie, alicia, marco,..."
263,"[happi, father, day, father, father, day, sund..."
1,"[white, supremacist, want, everyon, see, new, ..."


In [28]:
%pip install textblob
from textblob import TextBlob

Note: you may need to restart the kernel to use updated packages.


In [29]:
def correct_tokens_textblob(token_list):
    sentence = ' '.join(token_list)
    corrected = str(TextBlob(sentence).correct())
    return corrected.split()

In [30]:
tweets_df['tweet'] = tweets_df['tweet'].apply(correct_tokens_textblob)           

In [31]:
tweets_df.sample(10)

Unnamed: 0,tweet
303,"[character, custom, confirm, poleon, sun, moon..."
206,"[london, self, in, smile, love, photo, day, pi..."
452,"[photo, last, year]"
341,"[shock, amp, hear, christian, grim, in, past, ..."
479,"[may, day, surprise, movement]"
366,"[great, screw, job, 2016, catch, break, impact..."
424,"[weekend, she, may, time, friend, family, summ..."
480,"[caffein, fix, quick, cup, coffee, positive, a..."
209,"[think, go, best, match, give, ever, watch, we..."
349,"[never, previous, secret, screen, star, move, ..."


### Text is good enough for further tasks like (Indexing, Emebedding)

In [39]:
tweets_df.to_csv("../data/tweets.csv", index=False)