In [12]:

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fetch the Punkt tokenizer models used by NLTK's word_tokenize
# (nothing is re-downloaded when the package is already up to date).
nltk.download('punkt')

# Read the cleaned dataset into a DataFrame
df = pd.read_csv('cleaned_data.csv')

# Preview the top of the dataset
print("First 5 rows of the dataset:", df.head(), sep="\n")

# Preview the bottom of the dataset
print("\nLast 5 rows of the dataset:", df.tail(), sep="\n")


First 5 rows of the dataset:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1          boy that coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  

Last 5 rows of the dataset:
       Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
24761       25291      3            0                   2        1      1   
24762       25292  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Name of the column holding the tweet text. It must be assigned before its
# first use below, otherwise this cell raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
text_column = 'tweet'

# read_csv loads empty fields as NaN; blank them out before casting so that
# astype(str) does not turn missing tweets into the literal token 'nan'.
df[text_column] = df[text_column].fillna('').astype(str)
# Tokenize the text

def tokenize_text(text):
    """Split a single text string into word tokens with NLTK's word_tokenize."""
    tokens = word_tokenize(text)
    return tokens

def tokenize_data(df, text_column):
    """
    Tokenize one text column of a DataFrame.

    Runs tokenize_text over every value of ``df[text_column]`` and stores the
    results in a new column named ``<text_column>_tokens``. The DataFrame is
    modified in place, and that same object is returned.
    """
    token_column = text_column + '_tokens'
    token_lists = df[text_column].apply(tokenize_text)
    df[token_column] = token_lists
    return df

# Column that holds the tweet text
text_column = 'tweet'

# Add the '<text_column>_tokens' column produced by tokenize_data
df = tokenize_data(df, text_column)

# Preview the top of the tokenized data
print("First 5 rows after tokenization:", df.head(), sep="\n")

# Preview the bottom of the tokenized data
print("\nLast 5 rows after tokenization:", df.tail(), sep="\n")


First 5 rows after tokenization:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  \
0  woman shouldnt complain cleaning house man alw...   
1          boy that coldtyga bad cuffin hoe st place   
2       dawg ever fuck bitch start cry confused shit   
3                                   look like tranny   
4        shit hear might true might faker bitch told   

                                        tweet_tokens  
0  [woman, shouldnt, complain, cleaning, house, m...  
1  [boy, that, coldtyga, bad, cuffin, hoe, st, pl...  
2  [dawg, ever, fuck, bitc

In [14]:
# TF-IDF encoding
def tfidf_encoding(df, text_column):
    """
    Fit a TF-IDF vectorizer on one text column of a DataFrame.

    A fresh TfidfVectorizer with default settings is fitted on
    ``df[text_column]``.

    Returns a tuple ``(tfidf_matrix, feature_names)``: the result of
    ``fit_transform`` on the column, and the fitted vectorizer's
    ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer()
    encoded = vectorizer.fit_transform(df[text_column])
    return encoded, vectorizer.get_feature_names_out()

# Encode the text column as TF-IDF features
tfidf_matrix, feature_names = tfidf_encoding(df, text_column)

# Report the dimensions of the TF-IDF matrix
print(f"Shape of TF-IDF matrix: {tfidf_matrix.shape}")

# Show a small sample of the feature names
print(f"First 5 feature names: {feature_names[:5]}")


Shape of TF-IDF matrix: (24766, 18144)
First 5 feature names: ['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah']


In [15]:
# Bundle the TF-IDF matrix, its feature names and the original columns
# (leaving out the derived token column) into a single dictionary
data_to_save = dict(
    tfidf_matrix=tfidf_matrix,
    feature_names=feature_names,
    original_data=df[[
        'Unnamed: 0', 'count', 'hate_speech',
        'offensive_language', 'neither', 'class', 'tweet',
    ]],
)

# Serialize the dictionary to a pickle file
with open('tfidf_encoded_data.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

print("TF-IDF encoded data saved to tfidf_encoded_data.pkl")


TF-IDF encoded data saved to tfidf_encoded_data.pkl


In [16]:
# Read the pickled dictionary back from disk.
# NOTE: pickle.load can execute arbitrary code, so only use it on files you
# produced yourself (this one is written earlier in the notebook).
with open('tfidf_encoded_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Unpack the three saved entries from the loaded dictionary
tfidf_matrix_loaded, feature_names_loaded, original_data_loaded = (
    loaded_data[key] for key in ('tfidf_matrix', 'feature_names', 'original_data')
)

# Confirm the round trip: matrix shape and a sample of the feature names
print(f"Shape of loaded TF-IDF matrix: {tfidf_matrix_loaded.shape}")
print(f"First 5 feature names: {feature_names_loaded[:5]}")

# Preview the restored original data
print("First 5 rows of the original data:", original_data_loaded.head(), sep="\n")


Shape of loaded TF-IDF matrix: (24766, 18144)
First 5 feature names: ['aa' 'aaaaaaaaand' 'aaahhhhh' 'aahahah' 'aaliyah']
First 5 rows of the original data:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman shouldnt complain cleaning house man alw...  
1          boy that coldtyga bad cuffin hoe st place  
2       dawg ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  
