In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import re

In [None]:
# Location of the input CSV. Later cells read its 'comment' and 'label'
# columns. Change this single constant to point the notebook at another file.
# NOTE(review): the file name suggests the CSV is UTF-8 encoded - confirm.
DATA_PATH = '/content/dataset1_utf8.csv'

# Load the dataset into a DataFrame
df = pd.read_csv(DATA_PATH)


In [None]:
# Report how many columns the loaded DataFrame contains
num_columns = len(df.columns)
print(f"Number of columns: {num_columns}")

# Function to preprocess text
def preprocess_text(text):
    """Normalize one comment for vectorization.

    Every character that is not an ASCII letter or whitespace is removed
    (not replaced by a space), then the result is lower-cased.

    Parameters
    ----------
    text : str or any scalar
        Raw comment. ``None`` and float NaN (what pandas yields for a missing
        CSV cell) are treated as an empty comment; any other non-string value
        is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned, lower-cased text (possibly empty).
    """
    # Missing values would make re.sub raise TypeError; NaN is the only
    # float that is not equal to itself, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    return text.lower()

Number of columns: 3


In [None]:
# Apply preprocessing to 'comment' column; missing comments are treated as
# empty strings so a blank cell cannot abort the run
df['processed_comment'] = df['comment'].fillna('').apply(preprocess_text)

# Split dataset into features (X) and target (y)
X = df['processed_comment']
y = df['label']

# Split the cleaned text *before* vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from the training portion only. Fitting the
# vectorizer on the full corpus would leak test-set statistics into training.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Vectorize the text data (convert text to numerical features):
# fit on the training text, then apply the same fitted transform to the test text
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Whole-corpus matrix from the already fitted vectorizer; used only for the
# shape report below
X_vec = vectorizer.transform(X)

# Print shape of data before sampling
print(f"Shape of data before sampling: {X_vec.shape}, {y.shape}")

# Define a parameter grid for Grid Search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to use
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's
    # default p=2 it is the same distance as 'euclidean', so it would only
    # repeat every 'euclidean' fit without exploring anything new.
    'metric': ['euclidean', 'manhattan']  # Distance metric to use
}

Shape of data before sampling: (41144, 1000), (41144,)


In [None]:
# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit Grid Search on the training data. This is the expensive step of the
# notebook, so it is run exactly once.
grid_search.fit(X_train, y_train)

# Get the best parameters from Grid Search
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Initialize the KNN classifier with the best parameters
model_best = KNeighborsClassifier(**best_params)

# Fit the model on the (non-resampled) training data
model_best.fit(X_train, y_train)

# Predict on the test set
y_pred_best = model_best.predict(X_test)

# Evaluate the model.
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value as the default) without emitting an undefined-metric warning.
print("Evaluation with Best Parameters:")
print(classification_report(y_test, y_pred_best, zero_division=0))

# Random Oversampling
oversampler = RandomOverSampler(random_state=42)

# Apply random oversampling to the training data only; the test set is left
# untouched so both evaluations are scored on the same samples
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Fit the best model on the resampled data. This refits `model_best` in place,
# so from here on it is the oversampled model; the hyper-parameters are the
# ones selected by the grid search on the non-resampled training data.
model_best.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred_resampled = model_best.predict(X_test)

# Evaluate the model after random oversampling
print("Evaluation after Random Oversampling with Best Parameters:")
print(classification_report(y_test, y_pred_resampled, zero_division=0))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters found: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
Evaluation with Best Parameters:
              precision    recall  f1-score   support

           N       0.57      0.89      0.69      4375
           O       0.00      0.00      0.00         4
           P       0.65      0.23      0.35      3850

    accuracy                           0.58      8229
   macro avg       0.41      0.38      0.35      8229
weighted avg       0.61      0.58      0.53      8229



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
