In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
df_test = pd.read_csv("toxic_test.csv")
df_test_l = pd.read_csv("toxic_labels.csv").replace(-1,0)
print(df_test.shape)
df_test = pd.merge(df_test, df_test_l, how="left", on = "id")
print(df_test.shape)

(153164, 2)
(153164, 8)


In [3]:
df = pd.read_csv("toxic_train.csv")
print(df.shape)
df = pd.concat([df, df_test])
print(df.shape)
del df_test

(159571, 8)
(312735, 8)


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import pandas as pd

def preprocess_text(comment_text):
    
    
    # Remove links
    comment_text = re.sub(r'http\S+', '', comment_text)
    comment_text = comment_text.replace('\n', ' ')
    
    # Additional text cleaning
    comment_text = comment_text.lower()
    comment_text = re.sub(r"what's", "what is ", comment_text)
    comment_text = re.sub(r"\'s", " ", comment_text)
    comment_text = re.sub(r"\'ve", " have ", comment_text)
    comment_text = re.sub(r"can't", "cannot ", comment_text)
    comment_text = re.sub(r"n't", " not ", comment_text)
    comment_text = re.sub(r"i'm", "i am ", comment_text)
    comment_text = re.sub(r"\'re", " are ", comment_text)
    comment_text = re.sub(r"\'d", " would ", comment_text)
    comment_text = re.sub(r"\'ll", " will ", comment_text)
    comment_text = re.sub(r"\'scuse", " excuse ", comment_text)
    comment_text = re.sub('\W', ' ', comment_text)
    comment_text = re.sub('\s+', ' ', comment_text)
    comment_text = comment_text.strip(' ')

    # Tokenize the text
    words = word_tokenize(comment_text)
    
    # Remove digits
    words = [word for word in words if not word.isdigit()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Join the words back into a string
    cleaned_text = ' '.join(words)
    
    return cleaned_text







In [5]:
from imblearn.under_sampling import RandomUnderSampler
df['cleaned_text'] = df['comment_text'].apply(preprocess_text)

# Split the data into input features and target variable
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0).astype(int)
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['text'])


y = df['y']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
#X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)


from imblearn.over_sampling import SMOTE
from collections import Counter
os = SMOTE()
X_Otrain, y_Otrain = os.fit_resample(X_train, y_train)
count = Counter(y_train)
print(count)



Counter({0: 203228, 1: 15686})


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
dt_classifier = DecisionTreeClassifier()

# Define the hyperparameter grid to search
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [100,200],
    'min_samples_split': [2, 5, 10,20],
    'min_samples_leaf': [1, 2, 4,8],
}
#dt_classifier = DecisionTreeClassifier(criterion= "gini", max_depth= 400, min_samples_split=2, min_samples_leaf=2)


# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=dt_classifier, param_distributions=param_dist, n_iter=3, cv=5, scoring='accuracy', random_state=42,n_jobs=-1)

# Fit the random search to the data
random_search.fit(X_Otrain, y_Otrain)

# Access the best estimator found
best_dt_model = random_search.best_estimator_

# You can use the best_dt_model for predictions
y_pred = best_dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Print the best hyperparameters and best estimator
print("Best Hyperparameters:", random_search.best_params_)
print("Best Estimator:", best_dt_model)
print("Test Accuracy:", accuracy)

Best Hyperparameters: {'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 200, 'criterion': 'gini'}
Best Estimator: DecisionTreeClassifier(max_depth=200, min_samples_leaf=2)
Test Accuracy: 0.8719156691998593


In [7]:
from sklearn.metrics import classification_report
best_dt_model = random_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93     87039
           1       0.25      0.38      0.30      6782

    accuracy                           0.87     93821
   macro avg       0.60      0.65      0.61     93821
weighted avg       0.90      0.87      0.88     93821



In [10]:
random_search.best_score_

0.9075423678211104