In [2]:
import pandas as pd

In [81]:

# Load the sample of tweets; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/df_sample25.csv')

In [82]:
# One hand-written example tweet, expressed as a single record whose keys are
# the column names used for the appended row.
extra_tweet = {
    'TweetID': 1344796664837637121,
    'LangID': 1,
    'TopicID': 1,
    'HateLabel': 2,
    'TweetText': 'I hate all people in this word!, I am really angry kill kill kill, death!',
}
new_row = pd.DataFrame([extra_tweet])

In [83]:
# Append the hand-written row below the loaded rows and renumber the index 0..n-1.
# NOTE: this rebinds `df`, so running the cell again appends the row again.
df = pd.concat(objs=[df, new_row], axis=0, ignore_index=True)

In [84]:
# Number of rows per HateLabel value, most frequent first (class-balance check).
df.value_counts(subset='HateLabel')

HateLabel
0    23
1     2
2     1
dtype: int64

In [85]:
# Replace missing tweet texts with '' so every value is a string before text
# preprocessing. The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that form is a chained
# assignment, which warns on recent pandas and leaves `df` unchanged when
# Copy-on-Write is enabled.
df['TweetText'] = df['TweetText'].fillna('')

In [86]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Cache for the helpers shared by every preprocessing() call; filled on first use.
_PREPROCESSING_RESOURCES = {}


def _preprocessing_resources():
    """Build the punctuation table, stop-word set and lemmatizer once and return them."""
    if not _PREPROCESSING_RESOURCES:
        # Build everything first and publish in one step, so a failure while
        # loading the NLTK data cannot leave a half-filled cache behind.
        built = {
            'punctuation_table': str.maketrans('', '', string.punctuation),
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESSING_RESOURCES.update(built)
    return _PREPROCESSING_RESOURCES


def preprocessing(sentence):
    """Normalise one raw text into a space-separated string of lemmas.

    Steps, in order: strip surrounding whitespace, lowercase, drop every digit
    character, drop every character in ``string.punctuation``, tokenize with
    ``word_tokenize``, remove English stop words, then lemmatize each token
    first as a verb and then as a noun.

    Parameters
    ----------
    sentence : str or None
        Raw text. ``None`` and float NaN (the usual pandas missing value) are
        treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left
        or the input is missing.
    """
    # Missing values have no text to clean; `x != x` is true only for NaN.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ''

    resources = _preprocessing_resources()
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Basic cleaning: trim, lowercase, remove digits.
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Remove all punctuation characters in a single pass.
    sentence = sentence.translate(resources['punctuation_table'])

    # Tokenize and drop stop words.
    tokens = [token for token in word_tokenize(sentence) if token not in stop_words]

    # Verb lemmatization first, then noun lemmatization on its result.
    lemmas = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos="v"), pos="n")
        for token in tokens
    ]

    return ' '.join(lemmas)

In [87]:
# Run the text-cleaning function on every tweet and keep the result next to the raw text.
df['Cleaned_text'] = df['TweetText'].map(preprocessing)

In [88]:
# Target as a one-column DataFrame (list selector keeps it two-dimensional).
y = df.loc[:, ['HateLabel']]

In [89]:
# Display the one-column HateLabel frame as this cell's output.
y

Unnamed: 0,HateLabel
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,0
8,0
9,0


In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: one row per tweet, one column per vocabulary term.
# NOTE(review): the vectorizer is fitted on every row of df, including rows
# that a later cell places in the validation part - confirm this is intended.
vectorizer = TfidfVectorizer()

# Keyword arguments are evaluated in order, so fit_transform has already
# learned the vocabulary by the time get_feature_names_out is called.
vectorizer_text = pd.DataFrame(
    data=vectorizer.fit_transform(df['Cleaned_text']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [93]:
# Feature table plus the target column.
# Work on a copy: a plain `df_vectorized = vectorizer_text` only creates a
# second name for the same object, so adding the label would also add it to
# `vectorizer_text`, whose columns would then no longer match the vectorizer's
# feature names.
df_vectorized = vectorizer_text.copy()

# Assign the label as a Series (aligned on the row index) instead of a
# one-column DataFrame.
df_vectorized['HateLabel'] = y['HateLabel']


In [94]:
df_vectorized

Unnamed: 0,abide,actively,administration,afghanistan,agency,ali,alotta,alum,among,amp,...,whatever,whine,who,whole,win,without,woman,worcester,word,HateLabel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.330931,0.0,0.0,0.0,0.330931,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253387,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.224869,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.292513,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.423518,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,0.0,0.0,0.424246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.176949,0.0,0.0,0.0,0.0,0.0,0


In [104]:

split_ratio: float = 0.2
shuffle_seed: int = 42  # fixed seed so a fresh Restart & Run All yields the same row order
label_column: str = 'HateLabel'

# Create (X_train_processed, y_train, X_val_processed, y_val)
# The split is positional: the first (1 - split_ratio) share of rows is the
# training part and the remaining rows are the validation part.
# NOTE(review): rows are shuffled only *within* each part, so rows at the end
# of df_vectorized (such as a row appended after loading) always end up in the
# validation part - confirm this is intended; there is no stratification.
train_length = int(len(df_vectorized) * (1 - split_ratio))

train_part = df_vectorized.iloc[:train_length, :].sample(frac=1, random_state=shuffle_seed)
val_part = df_vectorized.iloc[train_length:, :].sample(frac=1, random_state=shuffle_seed)

# Full shuffled arrays (features and label together), kept under their original names.
data_processed_train = train_part.to_numpy()
data_processed_val = val_part.to_numpy()

# Select the label by name rather than by "last column", so the split does not
# depend on column order and y keeps the label column's own dtype instead of
# the dtype of the mixed feature/label array.
X_train_processed = train_part.drop(columns=label_column).to_numpy()
y_train = train_part[label_column].to_numpy()

X_val_processed = val_part.drop(columns=label_column).to_numpy()
y_val = val_part[label_column].to_numpy()