In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the training and test data
# Load data
training_data = pd.read_csv("train_essays.csv")
test_data = pd.read_csv("test_essays.csv")

# Define a custom text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization, stopwords removal, and other steps are already covered by TfidfVectorizer
    
    return text

# Split the training data into features and labels
X_train, X_validation, y_train, y_validation = train_test_split(training_data['text'], training_data['generated'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Logistic Regression classifier
vectorizer_lr = TfidfVectorizer(preprocessor=preprocess_text)
X_train_tfidf_lr = vectorizer_lr.fit_transform(X_train)
X_validation_tfidf_lr = vectorizer_lr.transform(X_validation)

# Create Logistic Regression classifier
lr_model = LogisticRegression()

# Hyperparameter tuning using GridSearchCV
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
}

grid_search_lr = GridSearchCV(lr_model, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_tfidf_lr, y_train)

# Get the best hyperparameters
best_hyperparameters_lr = grid_search_lr.best_params_
print("Best Hyperparameters for Logistic Regression:", best_hyperparameters_lr)

from sklearn.metrics import classification_report
y_validation_pred_lr = grid_search_lr.predict(X_validation_tfidf_lr)
print(classification_report(y_validation, y_validation_pred_lr))

# Validate the model on the validation set
y_validation_pred_lr = grid_search_lr.predict(X_validation_tfidf_lr)
accuracy_lr = accuracy_score(y_validation, y_validation_pred_lr)
print("Logistic Regression Accuracy on Validation Set:", accuracy_lr)

# Make predictions on the test set
X_test_tfidf_lr = vectorizer_lr.transform(test_data['text'])
test_predictions_lr = grid_search_lr.predict_proba(X_test_tfidf_lr)[:, 1]

# Create a submission file for Logistic Regression
submission_data_lr = pd.DataFrame({'id': test_data['id'], 'generated': test_predictions_lr})
submission_data_lr.to_csv("submission_logistic_regression.csv", index=False)


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _

Best Hyperparameters for Logistic Regression: {'C': 0.001, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276

Logistic Regression Accuracy on Validation Set: 0.9963768115942029


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
