Import Libraries

In [5]:

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Tokenizer models needed by word_tokenize. Recent NLTK releases look up
# 'punkt_tab' instead of the older pickled 'punkt' package, so both are fetched
# to keep the notebook runnable on old and new NLTK versions alike.
# nltk.download() reports failure via its return value instead of raising, so an
# unknown package name on an older NLTK does not stop the notebook.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Load the Cleaned Data

In [6]:
# Read the cleaned dataset into a dataframe.
# The path is written as a raw string so the Windows backslashes stay literal:
# in a normal string literal, sequences such as "\g", "\D" and "\c" are invalid
# escape sequences (a warning today, an error in a future Python version).
# NOTE(review): this is an absolute, machine-specific path - consider a path
# relative to the project so the notebook runs on other machines.
df = pd.read_csv(r'C:\group-1-main\Data-Preprocessing\cleaned_data.csv')

# Preview the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'Data-Preprocessing\\cleaned_data.csv'

Tokenization

In [None]:
# Name of the column that holds the tweet text
text_column = 'tweet'

# Convert the text column to string.
# Missing values are replaced with '' first: astype(str) on its own turns NaN
# into the literal string 'nan', which would then be tokenized and vectorized
# as if it were a real word.
df[text_column] = df[text_column].fillna('').astype(str)


# Tokenize the text data
def tokenize_text(text):
    """Split a single text string into tokens.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    The result of ``word_tokenize(text)``.
    """
    tokens = word_tokenize(text)
    return tokens

def tokenize_data(df, text_column):
    """Add a ``<text_column>_tokens`` column holding the tokens of each row.

    The new column is written onto ``df`` itself (the passed dataframe is
    modified), and the same dataframe is returned.

    Parameters
    ----------
    df : dataframe containing ``text_column``.
    text_column : name of the column whose values are tokenized.

    Returns
    -------
    The dataframe that was passed in, now with the extra tokens column.
    """
    tokens_column = text_column + '_tokens'
    tokenized = df[text_column].apply(tokenize_text)
    df[tokens_column] = tokenized
    return df

# Column that holds the text to tokenize
text_column = 'tweet'

# Tokenize every row; the result is stored in the 'tweet_tokens' column
df = tokenize_data(df, text_column)

# Show the last few rows so the new tokens column can be inspected
print("Last 5 rows after tokenization:", df.tail(), sep="\n")


TF-IDF Encoding

In [None]:
# TF-IDF encoding
def tfidf_encoding(df, text_column):
    """Fit a default ``TfidfVectorizer`` on one dataframe column.

    Parameters
    ----------
    df : dataframe containing ``text_column``.
    text_column : name of the column whose values are vectorized.

    Returns
    -------
    tuple
        ``(tfidf_matrix, feature_names)`` - the output of ``fit_transform`` on
        the column, and the vectorizer's ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer()
    documents = df[text_column]
    matrix = vectorizer.fit_transform(documents)
    # Feature names are only available once the vectorizer has been fitted
    return matrix, vectorizer.get_feature_names_out()

# Encode the text column with TF-IDF
tfidf_matrix, feature_names = tfidf_encoding(df, text_column)

# Quick sanity check: matrix dimensions and a sample of the feature names
matrix_shape = tfidf_matrix.shape
leading_features = feature_names[:5]
print("Shape of TF-IDF matrix:", matrix_shape)
print("First 5 feature names:", leading_features)


Save TF-IDF Matrix and Original Data

In [None]:
import pickle

# Columns of the dataframe that are stored next to the TF-IDF features
original_columns = [
    'Unnamed: 0', 'count', 'hate_speech', 'offensive_language',
    'neither', 'class', 'tweet',
]

# Bundle the TF-IDF matrix, its feature names and the selected dataframe
# columns into a single dictionary
data_to_save = dict(
    tfidf_matrix=tfidf_matrix,
    feature_names=feature_names,
    original_data=df[original_columns],
)

# Write the dictionary to a Pickle file
with open('tfidf_encoded_data.pkl', 'wb') as file:
    pickle.dump(data_to_save, file)

print("TF-IDF encoded data saved to tfidf_encoded_data.pkl")


Load and Verify TF-IDF Encoded Data

In [None]:
# Read back the Pickle file written by the save step
with open('tfidf_encoded_data.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

# Pull the three stored objects out of the loaded dictionary
tfidf_matrix_loaded, feature_names_loaded, original_data_loaded = (
    loaded_data[key]
    for key in ('tfidf_matrix', 'feature_names', 'original_data')
)

# Report the matrix shape and a sample of the feature names
print(f"Shape of loaded TF-IDF matrix: {tfidf_matrix_loaded.shape}")
print(f"First 5 feature names: {feature_names_loaded[:5]}")

# Show the first few rows of the stored dataframe columns
print("First 5 rows of the original data:", original_data_loaded.head(), sep="\n")


Sample Data Modeling


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Full TF-IDF matrix and labels, kept under their original names for any
# later cell that refers to them
X = tfidf_matrix
y = df['class']  # class is the target variable

# Split the raw text (not the pre-computed TF-IDF matrix) into train and test
# sets. The matrix above was fitted on every tweet, so slicing it would let
# vocabulary and IDF statistics from the test tweets leak into the training
# features.
text_train, text_test, y_train, y_test = train_test_split(
    df[text_column], y, test_size=0.2, random_state=42
)

# Learn the vocabulary and IDF weights from the training tweets only, then
# apply that fitted vectorizer unchanged to the test tweets
split_vectorizer = TfidfVectorizer()
X_train = split_vectorizer.fit_transform(text_train)
X_test = split_vectorizer.transform(text_test)


In [None]:
# Train the random forest classifier

# Hyperparameters: 100 trees, fixed seed so repeated runs give the same model
rf_params = {'n_estimators': 100, 'random_state': 42}

clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

In [None]:
# Model evaluation

# Predict labels for the held-out test set
y_pred = clf.predict(X_test)

# Accuracy takes no averaging argument; the other three metrics are averaged
# across classes with average='weighted'
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Model accuracy:", accuracy)
print("Model precision:", precision)
print("Model recall:", recall)
print("Model F1-score:", f1)