In [14]:
import pandas as pd
import numpy as np
import re
import string
import calendar
import nltk 

In [2]:
ori_twit_sexism_dataset = '/Users/churnika/Desktop/Projects/Crime_classification/dataset/twitter_sexism_parsed_dataset.csv'
ori_twit_sexism = pd.read_csv(ori_twit_sexism_dataset)
ori_twit_sexism = ori_twit_sexism.dropna()
ori_twit_sexism.columns

Index(['index', 'id', 'Text', 'Annotation', 'oh_label'], dtype='object')

In [3]:
ori_twit_sexism = ori_twit_sexism.drop(columns=['index','id','Annotation'])
print(ori_twit_sexism.columns)

Index(['Text', 'oh_label'], dtype='object')


In [4]:
msg_exp = ori_twit_sexism['Text'].str.lower()

In [5]:
# removing the urls that are present.

text_without_urls = []

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text_without_urls = url_pattern.sub('',text)
    return text_without_urls

for i in range(len(msg_exp)):
    try:
        msg_exp[i] = remove_urls(msg_exp[i])
    except KeyError as e:
        # print(f"Error at index {i}: {e}. Skipping...")
        continue

In [12]:
# removing punctuations

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

for i in range(len(msg_exp)):
    try:
        msg_exp[i] = remove_punctuation(msg_exp[i])

    except KeyError as e:
        # print(f"Error at index {i}: {e}. Skipping...")
        continue

In [38]:
# TOKENIZATION
# REMOVING URLS
# REMOVING NUMBERS
# REMOVING STOP WORDS

from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

months_days = [calendar.month_name[i].lower() for i in range(1, 13)] + [calendar.day_name[i].lower() for i in range(7)]
remove_words = ["vo","n","m","c","ra","xx","r","date","hii","hi","ye","pa","xxx","p","sir","mam","good","morning","time","ur","you","status","father"]

cleaned_tokens = []

for i in range(len(msg_exp)):
    try:
        tokens = nltk.word_tokenize(msg_exp[i])
        # Filter out URLs
        tokens = [token for token in tokens if not re.match(r'http[s]?://', token)]
        tokens = [token for token in tokens if token.isalpha()]
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [token for token in tokens if token not in months_days]
        tokens = [token for token in tokens if token not in remove_words]
        cleaned_tokens.append(tokens)
    except Exception as e:
        print(f"Error at index {i}: {e}. Skipping...")
        continue

[nltk_data] Downloading package punkt to /Users/churnika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/churnika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Error at index 9781: 9781. Skipping...
Error at index 9782: 9782. Skipping...
Error at index 9783: 9783. Skipping...


In [39]:
# LEMMATIZATION

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [[lemmatizer.lemmatize(token) for token in tokens]for tokens in cleaned_tokens]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/churnika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
# REMOVING RARE OR FREQUENTLY USED WORDS

from collections import Counter

freq = Counter([token for tokens in lemmatized_tokens for token in tokens])
freq_threshold = 10

lemmatized_tokens = [[token for token in tokens if freq[token] > freq_threshold] for tokens in lemmatized_tokens]

In [43]:
#REMOVING WHITESPACES

cleaned_lemmatized_tokens = [' '.join(tokens) for tokens in lemmatized_tokens]
cleaned_lemmatized_tokens = [text.split() for text in cleaned_lemmatized_tokens]

In [53]:
# Dictionary mapping abbreviations to their full forms
abbreviation_dict = {
    "don't": "do not",
    "can't": "cannot",
    "u": "you",
    "rt": "right",
    "ur":"your"
}

# Function to expand abbreviations
def expand_abbreviations(tokens):
    expanded_tokens = []
    for token in tokens:
        if token in abbreviation_dict:
            expanded_tokens.extend(abbreviation_dict[token].split())
        else:
            expanded_tokens.append(token)
    return expanded_tokens

# Expand abbreviations in cleaned_lemmatized_tokens
expanded_lemmatized_tokens = [expand_abbreviations(tokens) for tokens in cleaned_lemmatized_tokens]

In [54]:
#REMOVE ACCENTS AND DIACRITICS FROM EXPANDED_LEMMATIZED_TOKENS

from unidecode import unidecode

def remove_accents_diacritics(tokens):
    cleaned_tokens = []
    for token in tokens:
        cleaned_token = unidecode(token)
        cleaned_tokens.append(cleaned_token)
    return cleaned_tokens

expanded_lemmatized_tokens = [remove_accents_diacritics(tokens) for tokens in expanded_lemmatized_tokens]


In [62]:
# PART-OF-SPEECH

pos_tagged_tokens = [nltk.pos_tag(tokens) for tokens in expanded_lemmatized_tokens]

In [64]:
# VECTORIZATION

from sklearn.feature_extraction.text import TfidfVectorizer

joined_tokens = [' '.join(tokens) for tokens in expanded_lemmatized_tokens]

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(joined_tokens)

In [65]:
print(X)

  (0, 1600)	0.4455058076594877
  (0, 1402)	0.3363509472305524
  (0, 638)	0.27008483741701184
  (0, 330)	0.34025049444276384
  (0, 236)	0.29514886007889957
  (0, 1745)	0.23736679121237694
  (0, 1785)	0.3343447541789211
  (0, 613)	0.33842142273898995
  (0, 1384)	0.23711558434163998
  (0, 766)	0.22248264330112588
  (0, 1301)	0.15550804856259412
  (1, 1010)	0.16340759775146943
  (1, 1773)	0.42631334353764494
  (1, 1549)	0.4172371543125554
  (1, 1)	0.4993678815981324
  (1, 703)	0.40572634929254836
  (1, 1569)	0.4510941093585056
  (2, 1719)	0.45827374121284514
  (2, 1489)	0.46303796100502376
  (2, 1781)	0.45307176644103936
  (2, 504)	0.5351559779916485
  (2, 1010)	0.19152087503149923
  (2, 1301)	0.21733576071153088
  (3, 336)	0.42485577193191154
  (3, 1131)	0.3223788750904217
  :	:
  (14868, 1301)	0.12816489819033958
  (14869, 479)	0.43315229629066093
  (14869, 1096)	0.34557442254664233
  (14869, 1762)	0.40027399126151064
  (14869, 1416)	0.43672030002973034
  (14869, 1038)	0.3358503299497264

In [60]:
# SENTIMENT ANALYSIS

from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

tokens = expanded_lemmatized_tokens

# Function to get sentiment
def get_sentiment(token_list):
    text = ' '.join(token_list)
    sentiment = sia.polarity_scores(text)
    return sentiment

sentiments = [get_sentiment(token_list) for token_list in tokens]

In [58]:
print(pos_tagged_tokens)
# print(expanded_lemmatized_tokens)
# print(cleaned_lemmatized_tokens)

[[('right', 'JJ'), ('im', 'NN'), ('sexist', 'JJ'), ('fuck', 'JJ'), ('youre', 'NN'), ('woman', 'NN'), ('cant', 'NN'), ('cook', 'NN'), ('get', 'VBP'), ('shit', 'VBN'), ('together', 'RB')], [('there', 'RB'), ('hate', 'NN'), ('able', 'JJ'), ('team', 'NN'), ('year', 'NN'), ('mkr', 'NN')], [('right', 'RB'), ('everyone', 'NN'), ('you', 'PRP'), ('still', 'RB'), ('well', 'RB'), ('mkr', 'VB')], [('right', 'JJ'), ('mkr', 'NN'), ('actually', 'RB'), ('check', 'VB'), ('people', 'NNS'), ('could', 'MD'), ('cook', 'VB')], [('dont', 'NN'), ('thought', 'VBD'), ('really', 'RB'), ('funny', 'JJ'), ('joke', 'NN'), ('promise', 'NN'), ('im', 'JJ'), ('sexist', 'NN'), ('say', 'VBP')], [('right', 'RB'), ('might', 'MD'), ('like', 'VB')], [('right', 'JJ'), ('bet', 'NN'), ('camper', 'JJ'), ('vote', 'NN'), ('least', 'JJS'), ('kat', 'NNS'), ('say', 'VBP'), ('mkr', 'NN')], [('evvykube', 'RB'), ('absurd', 'RB'), ('much', 'JJ'), ('amazon', 'IN'), ('wish', 'JJ'), ('list', 'NN'), ('sock', 'NN')], [('right', 'JJ'), ('colin'

In [44]:
print(cleaned_tokens)



In [7]:
# print(text_without_urls[785])

In [35]:
print(len(cleaned_tokens[3]))


9
