In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import learning_curve
from joblib import dump, load
import tensorflow as tf

In [None]:
# Load and preprocess the data
def preprocess_data(file_path, threshold=0.51, test_size=0.2, random_state=42,
                    scaler_path='scaler.joblib'):
    """
    Load the CSV, derive a binary 'Quality' label and return scaled splits.

    Parameters:
    - file_path: Location of the CSV file, passed straight to pd.read_csv.
    - threshold: Rows whose 'Metric' is >= this value are labelled 1, others 0.
    - test_size: Fraction of rows held out for validation.
    - random_state: Seed used by train_test_split so the split is repeatable.
    - scaler_path: Where the fitted StandardScaler is dumped with joblib;
      pass None to skip writing it.

    Returns:
    - X_train, X_val: Features standardised with statistics learned from the
      training split only.
    - y_train, y_val: Binary labels (Series named 'Quality').

    Raises:
    - KeyError: If 'Metric' or one of the feature columns is missing.
    """
    feature_columns = ['Sentence Count', 'Word Count', 'Similarity Score']

    df = pd.read_csv(file_path)

    # Fail early with a readable message instead of a bare pandas KeyError.
    missing = [col for col in ['Metric', *feature_columns] if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {file_path!r}: {missing}")

    # Dropping rows where 'Metric' column has NaN values
    df = df.dropna(subset=['Metric'])

    # Selecting relevant features
    X = df[feature_columns]

    # Convert 'Metric' to binary class. The comparison is vectorised and the
    # label is built as its own Series, so the filtered frame is never mutated.
    y = (df['Metric'] >= threshold).astype(int).rename('Quality')

    # Splitting the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Normalizing the features; the scaler is fitted on the training split only
    # so no validation statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    if scaler_path is not None:
        dump(scaler, scaler_path)  # for adding new data later

    return X_train, X_val, y_train, y_val

# Train the model and evaluate it using logistic regression
def train_and_evaluate(X_train, X_val, y_train, y_val):
    """
    Train a LogisticRegression model and evaluate it on the validation set.

    Prints accuracy, precision, recall and F1, shows the ROC curve and the
    confusion matrix, and saves the fitted model to 'logistic_model.joblib'.

    Note: the argument order here (both feature sets, then both label sets)
    differs from train_and_evaluate_sgd and train_and_evaluate_nn.

    Parameters:
    - X_train, X_val: Training and validation features.
    - y_train, y_val: Training and validation labels.

    Returns:
    - The fitted LogisticRegression model (returned so callers can reuse it,
      as the SGD and neural-network trainers already do).
    """
    # Training the Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Making predictions; column 1 of predict_proba is the positive class score
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Calculating metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Output metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Plotting ROC Curve on an explicit Axes rather than pyplot's global state
    _, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='darkgrey', linestyle='--')  # chance line
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    roc_ax.legend()
    plt.show()

    # Plotting Confusion Matrix
    cm = confusion_matrix(y_val, y_pred)
    _, cm_ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('True')
    cm_ax.set_title('Confusion Matrix')
    plt.show()

    dump(model, 'logistic_model.joblib')

    return model

# Train the model and evaluate it using SGDClassifier
def train_and_evaluate_sgd(X_train, y_train, X_val, y_val, classes, epochs=1, random_state=None):
    """
    Train and evaluate the SGDClassifier.

    The model is trained with repeated partial_fit calls, evaluated once on
    the validation set after the last pass, and saved to 'sgd_model.joblib'.
    Precision, recall and F1 use average='weighted'.

    Parameters:
    - X_train, y_train: Training features and labels.
    - X_val, y_val: Validation features and labels.
    - classes: Array of unique classes.
    - epochs: Number of passes over the training data (must be >= 1).
    - random_state: Optional seed forwarded to SGDClassifier; the default
      None keeps the classifier unseeded.

    Returns:
    - The fitted SGDClassifier.

    Raises:
    - ValueError: If X_train and y_train differ in sample count, or if
      epochs is smaller than 1.
    """
    # Check if the number of samples in X_train and y_train are the same
    if X_train.shape[0] != y_train.shape[0]:
        raise ValueError("The number of samples in X_train and y_train must be the same.")

    # With zero passes the model would be unfitted and there would be nothing
    # to evaluate or report, so reject that up front.
    if epochs < 1:
        raise ValueError("epochs must be a positive integer.")

    # Initialize the SGDClassifier
    sgd_model = SGDClassifier(random_state=random_state)

    # Training the model incrementally
    for _ in range(epochs):
        sgd_model.partial_fit(X_train, y_train, classes=classes)

    # Making predictions on the validation set
    y_pred = sgd_model.predict(X_val)

    # Evaluating the model
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')

    # Output the metrics. These are measured after the final pass, so the
    # label is the total epoch count rather than a leaked loop variable.
    print(f"Epoch {epochs}: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

    # Save the model
    dump(sgd_model, 'sgd_model.joblib')

    return sgd_model

# Train the model and evaluate it using neural network
def train_and_evaluate_nn(X_train, y_train, X_val, y_val, epochs=10, batch_size=32, threshold=0.51):
    """
    Train a small Keras binary classifier and evaluate it on the validation set.

    Prints the model summary and accuracy, precision, recall and F1, then
    saves the model to 'nn_model.h5'.

    Parameters:
    - X_train, y_train: Training features and labels.
    - X_val, y_val: Validation features and labels.
    - epochs: Number of training epochs.
    - batch_size: Mini-batch size used by model.fit.
    - threshold: Predicted probabilities strictly above this value are
      labelled 1, the rest 0.

    Returns:
    - The trained tf.keras model.
    """
    # Define the neural network architecture. The single-unit sigmoid output
    # is required: binary_crossentropy and the 1-D binary labels need one
    # probability per sample, not the 32 ReLU activations of the hidden layer.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]) # various models tried

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    model.summary()

    # Evaluate the model; predict returns shape (n_samples, 1), so flatten it
    # to match the 1-D labels before thresholding.
    y_pred_prob = model.predict(X_val).ravel()
    y_pred = (y_pred_prob > threshold).astype("int32")

    # Calculate metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Print metrics
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

    # Save the model
    model.save('nn_model.h5')

    return model


In [None]:
# CSV dataset read by every experiment cell below.
file_path = "data.csv"

In [None]:
# Logistic Regression experiment: rebuild the scaled split from the CSV,
# then fit and evaluate. Keyword arguments make the features-then-labels
# order of this trainer explicit.
X_train, X_val, y_train, y_val = preprocess_data(file_path)
train_and_evaluate(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
)

In [None]:
# For Random Forest
def train_and_evaluate_rf(X_train, X_val, y_train, y_val):
    """
    Train a RandomForestClassifier and evaluate it on the validation set.

    Takes its arguments in the same order as train_and_evaluate. Prints
    accuracy, precision, recall and F1, shows the ROC curve and the confusion
    matrix, and saves the fitted model to 'random_forest_model.joblib'.

    Parameters:
    - X_train, X_val: Training and validation features.
    - y_train, y_val: Training and validation labels.

    Returns:
    - The fitted RandomForestClassifier.
    """
    # Fixed seed so repeated runs of the notebook build the same forest
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions; column 1 of predict_proba is the positive class score
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Output metrics
    print(f"Accuracy: {accuracy_score(y_val, y_pred)}")
    print(f"Precision: {precision_score(y_val, y_pred)}")
    print(f"Recall: {recall_score(y_val, y_pred)}")
    print(f"F1 Score: {f1_score(y_val, y_pred)}")

    # Plotting ROC Curve
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    _, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
    roc_ax.plot([0, 1], [0, 1], color='darkgrey', linestyle='--')  # chance line
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    roc_ax.legend()
    plt.show()

    # Plotting Confusion Matrix
    cm = confusion_matrix(y_val, y_pred)
    _, cm_ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('True')
    cm_ax.set_title('Confusion Matrix')
    plt.show()

    dump(model, 'random_forest_model.joblib')

    return model


X_train, X_val, y_train, y_val = preprocess_data(file_path)
train_and_evaluate_rf(X_train, X_val, y_train, y_val)

In [None]:
# SGDClassifier experiment: rebuild the scaled split from the CSV, then run
# 42 incremental passes over the training data for the two classes 0 and 1.
X_train, X_val, y_train, y_val = preprocess_data(file_path)
train_and_evaluate_sgd(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    classes=np.array([0, 1]),
    epochs=42,
)

In [None]:
# Neural-network experiment: rebuild the scaled split from the CSV, then
# train for a single epoch and keep the returned model in nn_model.
X_train, X_val, y_train, y_val = preprocess_data(file_path)
nn_model = train_and_evaluate_nn(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    epochs=1,
    batch_size=32,
)