In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re
from scipy.stats import randint


In [None]:
# Where the raw CSV lives and how much of it to keep for this run.
DATA_PATH = '/content/dataset1_utf8.csv'
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

# Read the complete dataset into memory.
df = pd.read_csv(DATA_PATH)

# Work on a reproducible random 10% subset so the later steps run faster;
# the fixed seed makes the same rows come back on every run.
df_sampled = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)


In [None]:
# Report how many columns the sampled frame carries.
num_columns = len(df_sampled.columns)
print(f"Number of columns: {num_columns}")

# Function to preprocess text
def preprocess_text(text):
    """Normalize one raw comment for vectorization.

    Every character that is not an ASCII letter or whitespace is removed
    (digits, punctuation, accented and other non-ASCII characters), and the
    remainder is lower-cased. Whitespace is left untouched.

    Args:
        text: The raw comment. Expected to be a ``str``; any other value
            (e.g. ``None`` or the float NaN that pandas uses for an empty
            CSV cell) is treated as an empty comment.

    Returns:
        The cleaned, lower-cased string, or ``''`` for non-string input.
    """
    # re.sub raises TypeError on non-strings, so a single missing comment
    # would otherwise abort the whole column-wise apply.
    if not isinstance(text, str):
        return ''
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Clean every raw comment and store the result in a new column next to the original text.
df_sampled['processed_comment'] = df_sampled['comment'].map(preprocess_text)

# Model inputs: the 'label' column is the target, the cleaned text is the single feature.
y = df_sampled['label']
X = df_sampled['processed_comment']

Number of columns: 3


In [None]:
# Split the cleaned text into training and testing sets BEFORE vectorizing.
# Fitting TF-IDF on the full corpus would let the held-out comments influence
# the vocabulary and IDF weights (data leakage) and inflate the test scores.
# The row assignment is the same as splitting the vectorized matrix, because it
# depends only on the number of samples and random_state.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data (convert text to numerical features).
# The vocabulary (capped at max_features terms) and IDF weights are learned
# from the training comments only; the test comments are merely transformed.
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus matrix built with the train-fitted vectorizer; kept only so the
# shape report below (and any later reference to X_vec) keeps working.
X_vec = vectorizer.transform(X)

# Print shape of data before sampling
print(f"Shape of data before sampling: {X_vec.shape}, {y.shape}")

# Cost-sensitive setup: errors on label 'P' are weighted twice as heavily as
# errors on label 'N'. Tune these to the actual label distribution.
class_weights = {'N': 1, 'P': 2}  # Adjust as per your dataset distribution

# Search space sampled by the randomized search.
# scipy's randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    n_estimators=randint(50, 150),      # number of trees in the forest
    max_depth=[10, 20, None],           # tree depth limit (None = no limit)
    min_samples_split=randint(2, 10),   # samples needed to split an internal node
    min_samples_leaf=randint(1, 4),     # samples needed at a leaf node
    bootstrap=[True, False],            # whether trees are built on bootstrap samples
)

# Base estimator handed to the search; it carries the class weights.
rf = RandomForestClassifier(random_state=42, class_weight=class_weights)

# 20 sampled configurations x 3 cross-validation folds, run on all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the winning configuration.
best_params = random_search.best_params_
print(f"Best parameters found: {best_params}")

# Train a fresh class-weighted forest with the winning configuration on the training set.
model_best = RandomForestClassifier(
    class_weight=class_weights,
    random_state=42,
    **best_params,
).fit(X_train, y_train)

# Score the cost-sensitive model on the held-out test set.
y_pred_best = model_best.predict(X_test)
print("Cost-Sensitive Training with Best Parameters:")
print(classification_report(y_test, y_pred_best))

# Random Oversampling
# Second imbalance remedy, evaluated as an alternative to class weighting.
oversampler = RandomOverSampler(random_state=42)

# Apply random oversampling to the training data only; the test set is left
# untouched so both experiments are scored on the same held-out rows.
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# NOTE(review): the recorded run reports 5151 rows here, which is odd and equals
# 3 x 1717, so the training labels presumably contain a value besides 'N' and
# 'P' - verify the 'label' column (stray whitespace/case variants?) before
# trusting either report.
# Print shape of balanced data after oversampling
print(f"Shape of balanced data after random oversampling: {X_train_resampled.shape}, {y_train_resampled.shape}")

# Train a SEPARATE forest for this experiment, without class weights:
#  - refitting model_best in place would silently destroy the fitted
#    cost-sensitive model from the previous step, and
#  - keeping class_weight on already-rebalanced data would make this report
#    measure "oversampling + class weights" rather than oversampling alone.
# The tuned hyper-parameters and the seed are reused so that only the
# imbalance strategy differs between the two reports.
model_resampled = RandomForestClassifier(**best_params, random_state=42)
model_resampled.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_resampled = model_resampled.predict(X_test)

# Evaluate the model after random oversampling with the best parameters
print("Random Oversampling with Best Parameters:")
print(classification_report(y_test, y_pred_resampled))


Shape of data before sampling: (4114, 1000), (4114,)
Fitting 3 folds for each of 20 candidates, totalling 60 fits




Best parameters found: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 96}
Cost-Sensitive Training with Best Parameters:
              precision    recall  f1-score   support

           N       0.66      0.50      0.57       440
           P       0.55      0.70      0.62       383

    accuracy                           0.59       823
   macro avg       0.60      0.60      0.59       823
weighted avg       0.61      0.59      0.59       823

Shape of balanced data after random oversampling: (5151, 1000), (5151,)
Random Oversampling with Best Parameters:
              precision    recall  f1-score   support

           N       0.69      0.47      0.56       440
           P       0.55      0.76      0.64       383

    accuracy                           0.60       823
   macro avg       0.62      0.61      0.60       823
weighted avg       0.63      0.60      0.60       823

