# Import Necessary Libraries and  loaad the dataset

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from imblearn.over_sampling import SMOTE
import joblib
# Load the dataset
df = pd.read_csv('final.csv')
# Ensure all entries in 'tweet' column are strings and handle NaNs
df['tweet'] = df['tweet'].astype(str).fillna('')


# Initialize and Apply TF-IDF Vectorizer

In [14]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

# Fit and transform the tweets to TF-IDF features
X = vectorizer.fit_transform(df['tweet']).toarray()
y = df['class'].values


# Save TF-IDF Matrix and Original Data

In [15]:
# Save the TF-IDF matrix and original data
joblib.dump(X, 'tfidf_matrix.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
df.to_csv('original_data.csv', index=False)


# Load and Verify TF-IDF Encoded Data

In [16]:
# Load the saved TF-IDF matrix and vectorizer
X_loaded = joblib.load('tfidf_matrix.pkl')
vectorizer_loaded = joblib.load('tfidf_vectorizer.pkl')

# Load the original data
df_loaded = pd.read_csv('original_data.csv')

# Verify the loaded data
print(X_loaded.shape)
print(df_loaded.head())


(24783, 5000)
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  woman complain cleaning house man always take ...  
1              boy coldtyga bad cuffin hoe 1st place  
2     friend ever fuck bitch start cry confused shit  
3                                   look like tranny  
4        shit hear might true might faker bitch told  


# Split the Data and Handle Imbalanced Data with SMOTE

In [17]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_loaded, y, test_size=0.2, random_state=42)
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Model evaluation()

In [18]:
# Define the model
model = Sequential()
model.add(Dense(64, input_dim=X_train_resampled.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # Adjust the output layer as per the number of classes
# Compile the model
model.compile(optimizer=Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()
# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_split=0.2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 4ms/step - accuracy: 0.7008 - loss: 0.7534 - val_accuracy: 0.9444 - val_loss: 0.4024
Epoch 2/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9009 - loss: 0.3033 - val_accuracy: 0.9325 - val_loss: 0.2859
Epoch 3/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9306 - loss: 0.2201 - val_accuracy: 0.9314 - val_loss: 0.2323
Epoch 4/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9475 - loss: 0.1752 - val_accuracy: 0.9634 - val_loss: 0.1699
Epoch 5/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9558 - loss: 0.1507 - val_accuracy: 0.9332 - val_loss: 0.1766
Epoch 6/10
[1m1152/1152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9654 - loss: 0.1188 - val_accuracy: 0.9368 - val_loss: 0.1541
Epoch 7/10
[1m

# Evaluate the Model and Calculate Accuracy and Precision

In [19]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
# Calculate accuracy and precision
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
# Detailed classification report
print(classification_report(y_test, y_pred_classes, target_names=['Normal', 'Offensive', 'Hate']))


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.8491022796045996
Precision: 0.8535050427329369
              precision    recall  f1-score   support

      Normal       0.24      0.28      0.26       290
   Offensive       0.91      0.91      0.91      3832
        Hate       0.80      0.75      0.77       835

    accuracy                           0.85      4957
   macro avg       0.65      0.65      0.65      4957
weighted avg       0.85      0.85      0.85      4957

