Import Libraries

In [18]:

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Load the cleaned data

In [19]:
df = pd.read_csv('C:\\group-1-main\\Data-Preprocessing\\cleaned_data.csv')

# the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())


First 5 rows of the dataset:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1          boy that coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  


Tokenization

In [20]:
# Convert text column to string
text_column = 'tweet'
df[text_column] = df[text_column].astype(str)


# Tokenize the text data
def tokenize_text(text):
    return word_tokenize(text)

def tokenize_data(df, text_column):
    df[text_column + '_tokens'] = df[text_column].apply(tokenize_text)
    return df

# Define the text column
text_column = 'tweet'

# Apply tokenization
df = tokenize_data(df, text_column)


# last few rows after tokenization
print("Last 5 rows after tokenization:")
print(df.tail())


Last 5 rows after tokenization:
       Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
24761       25291      3            0                   2        1      1   
24762       25292      3            0                   1        2      2   
24763       25294      3            0                   3        0      1   
24764       25295      6            0                   6        0      1   
24765       25296      3            0                   0        3      2   

                                                   tweet  \
24761  yous muthafin lie right tl trash mine bible sc...   
24762  youve gone broke wrong heart baby drove rednec...   
24763   young buck wanna eat nigguh like aint fuckin dis   
24764                     youu got wild bitch tellin lie   
24765  ruffled ntac eileen dahlia beautiful color com...   

                                            tweet_tokens  
24761  [yous, muthafin, lie, right, tl, trash, mine, ...  
24762  [youve, gone, broke

TF-IDF Encoding

In [21]:
# TF-IDF encoding
def tfidf_encoding(df, text_column):

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names

# Apply TF-IDF encoding
tfidf_matrix, feature_names = tfidf_encoding(df, text_column)



print("Shape of TF-IDF matrix:", tfidf_matrix.shape)
print("First 5 feature names:", feature_names[:5])


Shape of TF-IDF matrix: (24766, 18144)
First 5 feature names: ['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah']


Save TF-IDF Matrix and Original Data

In [22]:
import pickle

# Create a dictionary to save the TF-IDF matrix and the original dataframe
data_to_save = {
    'tfidf_matrix': tfidf_matrix,
    'feature_names': feature_names,
    'original_data': df[['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']]
}

# Save the dictionary to a Pickle file
with open('tfidf_encoded_data.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

print("TF-IDF encoded data saved to tfidf_encoded_data.pkl")


TF-IDF encoded data saved to tfidf_encoded_data.pkl


Load and Verify TF-IDF Encoded Data

In [23]:
# Load the Pickle file
with open('tfidf_encoded_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Extract the TF-IDF matrix, feature names, and original data from the loaded dictionary
tfidf_matrix_loaded = loaded_data['tfidf_matrix']
feature_names_loaded = loaded_data['feature_names']
original_data_loaded = loaded_data['original_data']

# Display the shape of the loaded TF-IDF matrix and the first 5 feature names
print(f"Shape of loaded TF-IDF matrix: {tfidf_matrix_loaded.shape}")
print(f"First 5 feature names: {feature_names_loaded[:5]}")

# Display the first few rows of the original data
print("First 5 rows of the original data:")
print(original_data_loaded.head())


Shape of loaded TF-IDF matrix: (24766, 18144)
First 5 feature names: ['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah']
First 5 rows of the original data:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1          boy that coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  


Sample Data Modeling


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:
X = tfidf_matrix
y = df['class'] #class is the target variable

#spliting into test and train
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)


In [26]:
#training the RFC

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [27]:
#model evaluvation

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Model accuracy:", accuracy)
print("Model precision:", precision)
print("Model recall:", recall)
print("Model F1-score:", f1)

Model accuracy: 0.8968510294711345
Model precision: 0.8836290592906999
Model recall: 0.8968510294711345
Model F1-score: 0.8788525858390563
