In [16]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score

# Load the CSV file into a DataFrame
df = pd.read_csv('models/hate_speech_model.csv')

# Split the data into feature and target variables
x = df['text']
y = df['is_toxic']

toxicity = 0

# Convert the text data into numerical vectors using a CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Train a logistic regression model on the training data
model = LogisticRegression()
model.fit(x_train, y_train)

# Evaluate the model on the testing data
y_pred = model.predict(x_test)

print("Logistic Regression")

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Logistic Regression
              precision    recall  f1-score   support

   Not Toxic       0.82      0.77      0.79       104
       Toxic       0.76      0.81      0.79        96

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

Accuracy: 0.79


In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Create a Naive Bayes classifier
naive_bayes = MultinomialNB()

# Train the model
naive_bayes.fit(x_train, y_train)

# Make predictions
naive_bayes_predictions = naive_bayes.predict(x_test)

print("Naive Bayes")

# Generate classification report for Naive Bayes
naive_bayes_report = classification_report(y_test, naive_bayes_predictions)
print("Naive Bayes Classification Report:")
print(naive_bayes_report)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, naive_bayes_predictions)
print("Accuracy:", accuracy)


Naive Bayes
Naive Bayes Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.89      0.84      0.86       104
       Toxic       0.83      0.89      0.86        96

    accuracy                           0.86       200
   macro avg       0.86      0.86      0.86       200
weighted avg       0.86      0.86      0.86       200

Accuracy: 0.86


In [18]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Create an SVM classifier
svm = SVC()

# Train the model
svm.fit(x_train, y_train)

# Make predictions
svm_predictions = svm.predict(x_test)

print("SVM")

# Generate classification report for SVM
svm_report = classification_report(y_test, svm_predictions)
print("SVM Classification Report:")
print(svm_report)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, svm_predictions)
print("Accuracy:", accuracy)

SVM
SVM Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.76      0.75      0.76       104
       Toxic       0.73      0.75      0.74        96

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200

Accuracy: 0.75


In [20]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Create the models
logistic_regression = LogisticRegression()
naive_bayes = MultinomialNB()
svm = SVC()

# Perform cross-validation and get accuracy scores
logistic_regression_scores = cross_val_score(logistic_regression, x, y, cv=5)
naive_bayes_scores = cross_val_score(naive_bayes, x, y, cv=5)
svm_scores = cross_val_score(svm, x, y, cv=5)

# Print the mean accuracy scores for each model
print("Logistic Regression Accuracy:", logistic_regression_scores.mean())
print("Naive Bayes Accuracy:", naive_bayes_scores.mean())
print("SVM Accuracy:", svm_scores.mean())

Logistic Regression Accuracy: 0.8019999999999999
Naive Bayes Accuracy: 0.8479999999999999
SVM Accuracy: 0.7360000000000001


In [22]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the CSV file into a DataFrame
df = pd.read_csv('models/hate_speech_model.csv')

# Data preprocessing and statistics
print("Data Preprocessing and Statistics")
print("-----------------------------------------")
# Check the number of samples
num_samples = len(df)
print("Number of samples:", num_samples)

# Check the distribution of classes
class_counts = df['is_toxic'].value_counts()
print("Class Distribution:")
print(class_counts)

# Split the data into feature and target variables
x = df['text']
y = df['is_toxic']

# Convert the text data into numerical vectors using a CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Train the models
logistic_regression = LogisticRegression()
naive_bayes = MultinomialNB()
svm = SVC()

logistic_regression.fit(x_train, y_train)
naive_bayes.fit(x_train, y_train)
svm.fit(x_train, y_train)

# Evaluate the models on the testing data
logistic_regression_predictions = logistic_regression.predict(x_test)
naive_bayes_predictions = naive_bayes.predict(x_test)
svm_predictions = svm.predict(x_test)

# Calculate accuracy for each model
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predictions)
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

# Generate classification report for each model
logistic_regression_report = classification_report(y_test, logistic_regression_predictions)
naive_bayes_report = classification_report(y_test, naive_bayes_predictions)
svm_report = classification_report(y_test, svm_predictions)

# Print the statistical report
print("\nStatistical Report")
print("-----------------------------------------")
print("Logistic Regression Accuracy: ", logistic_regression_accuracy)
print("Logistic Regression Classification Report:")
print(logistic_regression_report)
print("-----------------------------------------")
print("Naive Bayes Accuracy: ", naive_bayes_accuracy)
print("Naive Bayes Classification Report:")
print(naive_bayes_report)
print("-----------------------------------------")
print("SVM Accuracy: ", svm_accuracy)
print("SVM Classification Report:")
print(svm_report)


Data Preprocessing and Statistics
-----------------------------------------
Number of samples: 1000
Class Distribution:
is_toxic
Toxic        501
Not Toxic    499
Name: count, dtype: int64

Statistical Report
-----------------------------------------
Logistic Regression Accuracy:  0.79
Logistic Regression Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.82      0.77      0.79       104
       Toxic       0.76      0.81      0.79        96

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

-----------------------------------------
Naive Bayes Accuracy:  0.86
Naive Bayes Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.89      0.84      0.86       104
       Toxic       0.83      0.89      0.86        96

    accuracy                           0.86       200
   macro avg       0

In [24]:
import pickle
# Export the best model as a pickle file
best_model = naive_bayes  # Change this to the best model
pickle.dump(best_model, open('toxicity_analysis.pkl', 'wb'))

In [25]:
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the CSV file into a DataFrame
df = pd.read_csv('models/hate_speech_model.csv')

# Data preprocessing and statistics
print("Data Preprocessing and Statistics")
print("-----------------------------------------")
# Check the number of samples
num_samples = len(df)
print("Number of samples:", num_samples)

# Check the distribution of classes
class_counts = df['is_toxic'].value_counts()
print("Class Distribution:")
print(class_counts)

# Split the data into feature and target variables
x = df['text']
y = df['is_toxic']

# Convert the text data into numerical vectors using a CountVectorizer
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Train the models
logistic_regression = LogisticRegression()
naive_bayes = MultinomialNB()
svm = SVC()

logistic_regression.fit(x_train, y_train)
naive_bayes.fit(x_train, y_train)
svm.fit(x_train, y_train)

# Save the trained models
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

pickle.dump(logistic_regression, open(os.path.join(model_dir, 'logistic_regression.pkl'), 'wb'))
pickle.dump(naive_bayes, open(os.path.join(model_dir, 'naive_bayes.pkl'), 'wb'))
pickle.dump(svm, open(os.path.join(model_dir, 'svm.pkl'), 'wb'))

# Save the vectorizer
pickle.dump(vectorizer, open(os.path.join(model_dir, 'vectorizer.pkl'), 'wb'))

# Evaluate the models on the testing data
logistic_regression_predictions = logistic_regression.predict(x_test)
naive_bayes_predictions = naive_bayes.predict(x_test)
svm_predictions = svm.predict(x_test)

# Calculate accuracy for each model
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_predictions)
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

# Generate classification report for each model
logistic_regression_report = classification_report(y_test, logistic_regression_predictions)
naive_bayes_report = classification_report(y_test, naive_bayes_predictions)
svm_report = classification_report(y_test, svm_predictions)

# Print the statistical report
print("\nStatistical Report")
print("-----------------------------------------")
print("Logistic Regression Accuracy: ", logistic_regression_accuracy)
print("Logistic Regression Classification Report:")
print(logistic_regression_report)
print("-----------------------------------------")
print("Naive Bayes Accuracy: ", naive_bayes_accuracy)
print("Naive Bayes Classification Report:")
print(naive_bayes_report)
print("-----------------------------------------")
print("SVM Accuracy: ", svm_accuracy)
print("SVM Classification Report:")
print(svm_report)


Data Preprocessing and Statistics
-----------------------------------------
Number of samples: 1000
Class Distribution:
is_toxic
Toxic        501
Not Toxic    499
Name: count, dtype: int64

Statistical Report
-----------------------------------------
Logistic Regression Accuracy:  0.79
Logistic Regression Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.82      0.77      0.79       104
       Toxic       0.76      0.81      0.79        96

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.79      0.79      0.79       200

-----------------------------------------
Naive Bayes Accuracy:  0.86
Naive Bayes Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.89      0.84      0.86       104
       Toxic       0.83      0.89      0.86        96

    accuracy                           0.86       200
   macro avg       0