In [None]:
import numpy as np

# Discard every row of `train_data` that has at least one missing value.
# `dropna()` returns a new frame, so `train_data` itself is not modified.
cleaned_data = train_data.dropna()

# Model input is the 'text' column; the value to predict is the 'label' column.
features, target = cleaned_data['text'], cleaned_data['label']

# Report the dimensions of both selections as a quick sanity check.
for shape_caption, selection in (
    ("Shape of features (X):", features),
    ("Shape of target (y):", target),
):
    print(shape_caption, selection.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on `target` keeps the
# label proportions equal across both partitions, and the fixed seed makes the
# split repeatable between runs.
split_parts = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)
training_features, testing_features, training_labels, testing_labels = split_parts

# Report the size of each of the four partitions.
for partition_caption, partition in (
    ("Training features shape:", training_features),
    ("Testing features shape:", testing_features),
    ("Training labels shape:", training_labels),
    ("Testing labels shape:", testing_labels),
):
    print(partition_caption, partition.shape)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 10,000 vocabulary terms.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)

# The vocabulary and IDF weights are learned from the training text only ...
X_train_tfidf_matrix = tfidf_vectorizer.fit_transform(training_features)
# ... and the held-out text is encoded with that already-fitted vectorizer,
# so nothing about the test set influences the learned vocabulary.
X_test_tfidf_matrix = tfidf_vectorizer.transform(testing_features)

# Report the dimensions of both encoded matrices.
print(
    "Shape of training TF-IDF matrix:",
    X_train_tfidf_matrix.shape,
)
print(
    "Shape of testing TF-IDF matrix:",
    X_test_tfidf_matrix.shape,
)


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed so the resampling is repeatable.
smote_resampler = SMOTE(random_state=42)

# Resample the *training* matrix and labels only; the held-out test split is
# not passed through SMOTE.
X_resampled_features, y_resampled_labels = smote_resampler.fit_resample(
    X_train_tfidf_matrix,
    training_labels,
)

# Show how many rows each label has after resampling.
resampled_label_counts = pd.Series(y_resampled_labels).value_counts()
print("Resampled label distribution:\n", resampled_label_counts)


In [None]:
from sklearn.linear_model import LogisticRegression

# Build a Logistic Regression model with scikit-learn's default settings and
# fit it on the SMOTE-resampled training data. `fit` returns the estimator
# itself, so construction and training can be chained.
logistic_regression_model = LogisticRegression().fit(
    X_resampled_features,
    y_resampled_labels,
)

# Confirmation message once fitting has finished.
print("Logistic Regression model has been trained successfully.")


For Logistic Regression

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Unfitted Random Forest with a fixed seed for repeatable results.
# NOTE(review): none of the cells shown here train or use `random_forest_clf`;
# the model that is actually fitted and evaluated is `rf_classifier`.
random_forest_clf = RandomForestClassifier(
    random_state=42,
)

# Confirmation message: the estimator exists but has not been trained.
print("Random Forest Classifier has been initialized.")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a seeded Random Forest and fit it on the SMOTE-resampled training
# data in one step (`fit` returns the estimator itself).
rf_classifier = RandomForestClassifier(random_state=42).fit(
    X_resampled_features,
    y_resampled_labels,
)

# Confirmation message once fitting has finished.
print("Random Forest Classifier has been trained successfully.")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Another unfitted, seeded Random Forest instance.
# NOTE(review): none of the cells shown here train or use `forest_classifier`;
# evaluation and tuning below operate on `rf_classifier`.
forest_classifier = RandomForestClassifier(
    random_state=42,
)

# Confirmation message: the estimator exists but has not been trained.
print("RandomForestClassifier instance has been created.")


In [None]:
# Mean accuracy of the fitted forest on the held-out (non-resampled) test split.
test_accuracy = rf_classifier.score(
    X_test_tfidf_matrix,
    testing_labels,
)

# Show the score rounded to four decimal places.
print(f"Accuracy of the Random Forest Classifier on the test set: {test_accuracy:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Label predictions of the fitted forest for the held-out TF-IDF features.
predicted_labels = rf_classifier.predict(X_test_tfidf_matrix)

# Compare predictions against the true test labels in two ways: a confusion
# matrix and a per-class metrics report.
conf_matrix = confusion_matrix(
    y_true=testing_labels,
    y_pred=predicted_labels,
)
class_report = classification_report(
    y_true=testing_labels,
    y_pred=predicted_labels,
)

# Print the confusion matrix first, then the report.
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


In [None]:
from sklearn.metrics import classification_report

# Re-compute the forest's predictions for the held-out TF-IDF features
# (this rebinds `predicted_labels`).
predicted_labels = rf_classifier.predict(X_test_tfidf_matrix)

# Build the per-class metrics report against the true test labels and print it.
classification_summary = classification_report(
    y_true=testing_labels,
    y_pred=predicted_labels,
)
print("Classification Report:\n", classification_summary)


In [None]:
# Candidate Random Forest hyper-parameter values, keyed by constructor
# argument name.
# The original cell never closed this dict literal, so it raised a
# SyntaxError when run; the closing brace is restored here.
# NOTE: the grid-search cell further down assigns `param_grid` again, so it
# is that later definition, not this one, that is passed to GridSearchCV.
param_grid = {
    'n_estimators': [100, 150, 200, 300, 350, 400],
    'max_features': [1, 2, 'sqrt', 'log2', None],
    'max_depth': [4, 6, 10, 15, 20],
    'max_leaf_nodes': [2, 4, 6, 12, 20],
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to search over. This rebinds `param_grid`.
# 3 * 4 * 3 * 3 = 108 combinations, each scored with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search around the Random Forest estimator:
#   cv=5      -> 5-fold cross-validation per combination
#   n_jobs=-1 -> use all available CPU cores
#   verbose=2 -> log progress for each fit
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the SMOTE-resampled training data.
# NOTE(review): the cross-validation folds are cut from data that was already
# resampled, so validation folds contain synthetic rows and the CV scores may
# be optimistic. Consider resampling inside each fold instead — TODO confirm.
grid_search.fit(X_resampled_features, y_resampled_labels)

# Show the best-scoring parameter combination.
print("Best parameters found:\n", grid_search.best_params_)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out labels through the fitted grid search, which
# delegates to the best estimator it found.
predicted_labels_best = grid_search.predict(X_test_tfidf_matrix)

# Score those predictions against the true test labels.
report_summary = classification_report(
    y_true=testing_labels,
    y_pred=predicted_labels_best,
)
confusion_mat = confusion_matrix(
    y_true=testing_labels,
    y_pred=predicted_labels_best,
)

# Print the per-class report first, then the confusion matrix.
print("Classification Report:\n", report_summary)
print("Confusion Matrix:\n", confusion_mat)
