In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../Data/test_tweets_anuFYb8.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


In [4]:
df.sample(5)

Unnamed: 0,id,tweet
12453,44416,second day last
5274,37237,are you locked brother? @user woah spooky got ...
1041,33004,they still aren't seeing my messages...
12266,44229,not ashamed to admit there were tears #happy ...
5783,37746,â #us: retail sales steady in may â nomur...


In [5]:
tweets_df = df[['tweet']].copy()

In [6]:
tweets_df.head(5)

Unnamed: 0,tweet
0,#studiolife #aislife #requires #passion #dedic...
1,@user #white #supremacists want everyone to s...
2,safe ways to heal your #acne!! #altwaystohe...
3,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew..."


In [7]:
import re

def clean_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)         # Remove URLs
    text = re.sub(r'@\w+', '', text)            # Remove @mentions
    text = re.sub(r'#', '', text)               # Remove hashtag symbol
    text = re.sub(r'[^\w\s]', '', text)         # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize whitespace
    return text


In [8]:
tweets_df['tweet'] = tweets_df['tweet'].apply(clean_tweet)

In [9]:
tweets_df['tweet'][17193]

'feeling like a mermaid ð hairflip neverready formal wedding gown dresses mermaid â'

### Normalize Unicode (fix weird characters like ð, â)

In [10]:
import unicodedata

def normalize_unicode(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

In [11]:
tweets_df['tweet'] = tweets_df['tweet'].apply(normalize_unicode)

In [12]:
tweets_df.sample(10)

Unnamed: 0,tweet
4870,we salute the brave police officers who suffer...
8302,always be platyctenea
7915,looking forward to helping a new client get or...
1023,beautifulsunday a
16668,these comments from the failed republican cand...
6493,buttrump amp his followers will swear it was h...
3243,sex blonde best way of having sex
12129,no greater test of a psychologists abilities t...
193,there wasnt so much violence at the last euro ...
3167,cloudchaser bull hill climb you have to reach ...


### Correcting space problem

In [13]:
%pip install wordninja

Note: you may need to restart the kernel to use updated packages.


In [14]:
import wordninja

In [15]:
def fix_spacing(text):
    return ' '.join(wordninja.split(text))

In [16]:
tweets_df['tweet'] = tweets_df['tweet'].apply(fix_spacing)

### Expanding slangs

In [17]:
slang_dict = {
    "u": "you",
    "ur": "your",
    "bc": "because",
    "b4": "before",
    "idk": "i don't know",
    "lol": "laughing out loud",
    "omg": "oh my god",
    "btw": "by the way",
    "gr8": "great",
    "pls": "please",
    "thx": "thanks",
    "im": "i am",
    "dont": "do not",
    "wanna": "want to",
    "gonna": "going to",
    "bt" : "but"
}

In [18]:
def expand_slang(text):
    words = text.split()
    expanded_words = []

    for word in words:
        if word in slang_dict:
            expanded_words.append(slang_dict[word])
        else:
            expanded_words.append(word)
    
    return ' '.join(expanded_words)

In [19]:
tweets_df['tweet'].apply(expand_slang)

0        studio life a is life requires passion dedicat...
1        white supremacists want everyone to see the ne...
2        safe ways to heal your acne alt ways to heal h...
3        is the hp and the cursed child book up for res...
4        3 rd bih day to my amazing hilarious nephew el...
                               ...                        
17192    thought factory left right polarisation trump ...
17193    feeling like a mermaid hair flip never ready f...
17194    hillary campaigned today in ohio oh my god amp...
17195    happy at work conference right mindset leads t...
17196    my song so glad free download shoe gaze new mu...
Name: tweet, Length: 17197, dtype: object

In [20]:
# missing values
tweets_df.isnull().sum()

tweet    0
dtype: int64

In [21]:

# Check for duplicate values
tweets_df.duplicated().sum()

1426

In [22]:
tweets_df = tweets_df.drop_duplicates( keep='first')

In [23]:
tweets_df.sample(5)

Unnamed: 0,tweet
14764,happy kidd ooo w aaa treats minion
4869,love my little princess loads love
10573,did anyone else think the girls choreography a...
6036,personalised d gbp 2500 get here shop cool hom...
14032,my live cam is back on at come visit me sexy n...


## Text preprocessing

In [24]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [25]:
# For stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [29]:

def transform_text(text):
    text = text.lower()  # Lowercasing
    text = nltk.word_tokenize(text)  # Tokenization

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)  # Remove special characters

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)  # Remove stopwords

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))  # Stemming

    return y  # Return list of processed tokens



In [30]:

tweets_df['tweet'].apply(transform_text)

0        [studio, life, life, requir, passion, dedic, w...
1        [white, supremacist, want, everyon, see, new, ...
2        [safe, way, heal, acn, alt, way, heal, healthi...
3        [hp, curs, child, book, reserv, alreadi, ye, h...
4        [3, rd, bih, day, amaz, hilari, nephew, eli, a...
                               ...                        
17191    [2, damn, tuff, ruff, muff, techno, citi, ng, ...
17192    [thought, factori, left, right, polaris, trump...
17193    [feel, like, mermaid, hair, flip, never, readi...
17194    [hillari, campaign, today, ohio, omg, amp, use...
17196    [song, glad, free, download, shoe, gaze, new, ...
Name: tweet, Length: 15771, dtype: object