In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import zipfile

In [2]:
# Specify the path to your zip file
zip_file_path = '/content/archive (12).zip'

# Specify the directory to unzip to
unzip_dir = '/jigsaw-toxic-comment-classifier/dataset'

# Unzipping the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)

print(f"Dataset unzipped successfully to {unzip_dir}")

Dataset unzipped successfully to /jigsaw-toxic-comment-classifier/dataset


In [3]:
# Download stopwords from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Load the dataset
df = pd.read_csv('/jigsaw-toxic-comment-classifier/dataset/train.csv')

In [7]:
df.shape

(159571, 8)

In [8]:
# Prepare the data
X = df['comment_text']
y = df[df.columns[2:]]

In [11]:
def preprocess_text(text):
    try:
        # Remove non-alphabetic characters
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize
        words = text.split()
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
        # Stemming
        ps = PorterStemmer()
        stemmed_words = []
        for word in words:
            try:
                stemmed_words.append(ps.stem(word))
            except RecursionError:
                continue  # Skip words causing recursion error
        return ' '.join(stemmed_words)
    except Exception as e:
        return ''  # Return empty string in case of any unexpected error

In [12]:
# Apply preprocessing to the text data
X = X.apply(preprocess_text)

In [13]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=200000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
# Logistic Regression Model
log_reg_tfidf = MultiOutputClassifier(LogisticRegression(max_iter=1000))
log_reg_tfidf.fit(X_train_tfidf, y_train)

In [16]:
# Make predictions
y_pred_tfidf = log_reg_tfidf.predict(X_test_tfidf)

In [17]:
# Evaluate the model
precision_tfidf = precision_score(y_test, y_pred_tfidf, average='micro')
recall_tfidf = recall_score(y_test, y_pred_tfidf, average='micro')
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)

In [18]:
print(f'TF-IDF Logistic Regression - Precision: {precision_tfidf}, Recall: {recall_tfidf}, Accuracy: {accuracy_tfidf}')

TF-IDF Logistic Regression - Precision: 0.8743386243386243, Recall: 0.5606446140797285, Accuracy: 0.9189096036346546


In [19]:
# Count Vectorizer for comparison
count_vectorizer = CountVectorizer(max_features=200000)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [20]:
# Logistic Regression Model
log_reg_count = MultiOutputClassifier(LogisticRegression(max_iter=1000))
log_reg_count.fit(X_train_count, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
# Make predictions
y_pred_count = log_reg_count.predict(X_test_count)

In [22]:
# Evaluate the model
precision_count = precision_score(y_test, y_pred_count, average='micro')
recall_count = recall_score(y_test, y_pred_count, average='micro')
accuracy_count = accuracy_score(y_test, y_pred_count)


In [23]:
print(f'Count Vectorizer Logistic Regression - Precision: {precision_count}, Recall: {recall_count}, Accuracy: {accuracy_count}')

Count Vectorizer Logistic Regression - Precision: 0.8141698001187413, Recall: 0.5815662991235511, Accuracy: 0.9126116246279179
