In [10]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# Read the training CSV into a DataFrame, then print its first rows
# as a plain-text preview.
data = pd.read_csv('data/train.csv')
preview = data.head()
print(preview)

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [12]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called bare; wrapping it in print() only appends a stray "None".
data.info()

# Class balance of the binary `toxic` label.
print(data['toxic'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None
toxic
0    144277
1     15294
Name: count, dtype: int64


In [13]:
def clean_text(text):
    """Normalise a raw comment string before vectorisation.

    Steps, applied in order:
      1. Remove URL-like tokens: any run of non-whitespace characters that
         starts with ``http`` or ``www``, in any letter case.
      2. Remove ``@mention`` tokens and bare ``#`` characters (the word
         following a ``#`` is kept).
      3. Remove every character that is not an ASCII letter, a digit or
         whitespace.
      4. Lower-case the result.

    Removed spans are deleted rather than replaced by a space, so the
    whitespace that surrounded them is left untouched.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # Match URLs case-insensitively: the result is lower-cased anyway, and a
    # case-sensitive match would let "HTTP://..." or "WWW...." slip through
    # and collapse into a junk token once punctuation is stripped below.
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    return text.lower()

In [15]:
# The split reads data['cleaned_comments'], but no visible cell creates that
# column (execution count 14 is absent), so a fresh Restart & Run All fails
# here with a KeyError. Derive the column from the raw text when it is
# missing; presumably the absent cell did the same -- TODO confirm.
if 'cleaned_comments' not in data.columns:
    data['cleaned_comments'] = data['comment_text'].apply(clean_text)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    data['cleaned_comments'],
    data['toxic'],
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the TF-IDF vectorizer on the training comments only, then reuse the
# fitted vectorizer (no refit) to encode the validation comments.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_val_vectorized = vectorizer.transform(raw_documents=X_val)

In [17]:
# Logistic regression with default settings, trained on the TF-IDF features
# of the training split against the binary `toxic` labels.
model = LogisticRegression()
model.fit(X=X_train_vectorized, y=y_train)

In [18]:
# Predict on the held-out validation features, then print the per-class
# report followed by the overall accuracy.
y_pred = model.predict(X_val_vectorized)

report = classification_report(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

print(report)
print(f'Accuracy: {accuracy}')

              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28859
           1       0.91      0.60      0.73      3056

    accuracy                           0.96     31915
   macro avg       0.93      0.80      0.85     31915
weighted avg       0.95      0.96      0.95     31915

Accuracy: 0.956258812470625


In [20]:
# Write the fitted classifier and the fitted vectorizer to separate pickle
# files, model first, vectorizer second.
artifacts = (
    ("Pickle_LR_Model.pkl", model),
    ("Vectorize.pickle", vectorizer),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)