
# PRECURSOR ACTIVITIES

## Step 1: Import modules and read in the dataset about the SNAP variable

In [None]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from scipy.stats import randint
import tensorflow as tf
from tensorflow.keras.models import Sequential

In [None]:
# Load the SNAP dataset (.csv in the Resources folder) into a Pandas DataFrame
file_path = Path("<add path>")
df_snap_stats = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to confirm the load worked
df_snap_stats.head()


## Step 2: Split the data into X and y and then into testing and training sets.

In [None]:
# Name of the target column; every other column is treated as a feature
target_column = '<add target>'

# X (features): all columns except the target
X = df_snap_stats.drop(columns=[target_column])

# y (target): the target column on its own
y = df_snap_stats[target_column]


In [None]:
# Scale X before splitting
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1, stratify=y)


# SUPERVISED LEARNING MODELING

## Step 1: Fit a logistic regression classifier.

In [None]:
# Build the logistic regression classifier (lbfgs solver, fixed seed)
logistic_regression_model = LogisticRegression(random_state=42, solver='lbfgs')

# Train it on the training split; fit() returns the estimator itself,
# so lr_model is simply a second name for the fitted model
logistic_regression_model.fit(X_train, y_train)
lr_model = logistic_regression_model


## Step 2: Create the predicted values for the testing and the training data.

In [None]:
# Predict with the fitted logistic regression model
# (lr_model is the fitted estimator saved in the previous step)

# Predictions for the rows the model was trained on
training_predictions = lr_model.predict(X_train)

# Predictions for the held-out test rows
testing_predictions = lr_model.predict(X_test)


## Step 3: Print a confusion matrix for the training data.

In [None]:
# Confusion matrix comparing the true training labels with the training predictions
training_matrix = confusion_matrix(y_true=y_train, y_pred=training_predictions)

# Display the training confusion matrix
print(training_matrix)


## Step 4: Print a confusion matrix for the testing data.

In [None]:
# Confusion matrix comparing the true test labels with the test predictions
test_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

# Display the testing confusion matrix
print(test_matrix)


## Step 5: Print the training and testing classification reports.

In [None]:
# Display names for the two target classes (placeholders to fill in);
# they are passed to classification_report as target_names
class_labels = ["<add> [labeled ?]", "<add> [labeled ?]"]

# Training split: build the report, then print it
training_report = classification_report(
    y_true=y_train,
    y_pred=training_predictions,
    target_names=class_labels,
)
print("Training Classification Report:\n", training_report)

# Testing split: build the report, then print it
testing_report = classification_report(
    y_true=y_test,
    y_pred=testing_predictions,
    target_names=class_labels,
)
print("Testing Classification Report:\n", testing_report)


# HYPERPARAMETER TUNING

## Step 1: Initiate hyperparameter tuning

In [None]:
# Create a Random Forest classifier (seeded so repeated runs build the same trees)
rf_model = RandomForestClassifier(random_state=42)

# Define the hyperparameter distributions to sample from
# (scipy's randint excludes the upper bound, e.g. n_estimators is drawn from 100-999)
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 11)
}

# Create RandomizedSearchCV object: 10 sampled candidates, 5-fold cross-validation.
# random_state fixes which candidates are sampled so the search is reproducible.
random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Get the best parameters and the estimator refit with them
best_params = random_search.best_params_
best_model = random_search.best_estimator_

In [None]:
# Show the winning hyperparameter combination from the randomized search
print("Best parameters:", best_params)

# Predict the held-out test rows with the tuned estimator, then show the result
predictions = best_model.predict(X_test)
print("Predictions:", predictions)

## Step 2: Evaluate initial model performance via classification report

In [None]:
# Classification report for the tuned random forest on the test split.
# This rebinds testing_report, replacing the logistic regression report stored earlier.
testing_report = classification_report(
    y_true=y_test,
    y_pred=predictions,
    target_names=class_labels,
)
print("Testing Classification Report:\n", testing_report)

## Step 3: Evaluate feature importance

In [None]:
# Column names of the feature DataFrame, used to label each importance score
feature_names = X.columns

# Importance score of every feature according to the tuned random forest
feature_importances = best_model.feature_importances_

In [None]:
# Pair each feature name with its importance, ordered from most to least important
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)

# Preview the top rows
importance_df.head()

In [None]:
# Horizontal bar chart of the feature importances (one bar per feature)
fig, ax = plt.subplots(figsize=(4, 4))
ax.barh(importance_df['feature'], importance_df['importance'], color='blue')

# Axis labels and title
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importances in Random Forest Model')

# Tidy the spacing, then render
fig.tight_layout()
plt.show()

## Step 4: Optional: retune hyperparameters for optimization

In [None]:
# OPTIONAL: Define a new hyperparameter grid here if needed.
# Default: reuse the first search space so the next cell, which reads
# param_dist_2, runs as-is; replace this with a different space to retune.
param_dist_2 = dict(param_dist)

In [None]:
# Create a second Random Forest classifier (seeded so repeated runs build the same trees)
rf_model_2 = RandomForestClassifier(random_state=42)

# Create a second RandomizedSearchCV object with potentially different parameters (optional).
# It samples from param_dist_2, which must be defined before this cell runs.
# random_state fixes which candidates are sampled so the search is reproducible.
random_search_2 = RandomizedSearchCV(
    rf_model_2,
    param_distributions=param_dist_2,
    n_iter=10,
    cv=5,
    random_state=42,
)

# Fit the second RandomizedSearchCV object to the training data
random_search_2.fit(X_train, y_train)

# Get the second search's best parameters and the estimator refit with them
best_params_2 = random_search_2.best_params_
best_model_2 = random_search_2.best_estimator_

In [None]:
# Print the second best parameters
print("Second best parameters:", best_params_2)

# Predict with the estimator from the SECOND search (best_model_2);
# using best_model here would just repeat the first model's predictions
predictions_2 = best_model_2.predict(X_test)
print("Second Best Model Predictions:", predictions_2)

## Step 5: Evaluate secondary model performance via classification report

In [None]:
# Run the second best model (best_model_2, from the second search) on the X_test data
# so the report below evaluates the second model rather than the first
predictions_2 = best_model_2.predict(X_test)

In [None]:
# Create, then print the classification report for the second model.
# The label ends with ":\n" (as in the other report cells) so the report's
# column header row starts on its own line and stays aligned with the values.
testing_report_2 = classification_report(y_test, predictions_2, target_names=class_labels)
print("Testing Classification Report:\n", testing_report_2)

# NEURAL NETWORK TESTING

## Step 1: Compile, Train, and Evaluate the Model

In [None]:
# The network's input width must match the number of feature columns,
# which is the second dimension of the training feature matrix
count_input_features = X_train.shape[1]

# Report the value so it can be checked before the layers are configured
print("Number of input features:", count_input_features)

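# NOTE: `NameError: name 'X_train' is not defined` is raised by the cell above when the
# train/test split cell (Step 2 of the precursor section) has not been run in the current session.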

In [None]:
# Configuring layers and nodes: builds a Sequential model from stacked Dense layers.
# TEMPLATE: every `units=` and `activation=""` below is a blank placeholder;
# this cell will not run until each one is filled in.
snap_model = tf.keras.models.Sequential()

# First hidden layer; its input size is the feature count computed in the previous cell
snap_model.add(tf.keras.layers.Dense(units=, activation="", input_dim=count_input_features))

# Second hidden layer
snap_model.add(tf.keras.layers.Dense(units=, activation=""))

# Additional hidden layer (numbering left blank in the template)
snap_model.add(tf.keras.layers.Dense(units=, activation=""))

# Output layer: a single unit.
# NOTE(review): the model is compiled with binary_crossentropy, so a sigmoid
# output activation is presumably intended here — confirm.
snap_model.add(tf.keras.layers.Dense(units=1, activation=""))

# Check the structure of the model
snap_model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
snap_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the model on the training split.
# TEMPLATE: `epochs=` is a blank placeholder; supply the number of epochs before running.
# fit_model stores whatever fit() returns — presumably the Keras training history; confirm.
fit_model = snap_model.fit(X_train, y_train, epochs=)

In [None]:
# Score the trained network on the held-out test split; with the compile settings
# used in this notebook, evaluate() yields the loss followed by the accuracy
model_loss, model_accuracy = snap_model.evaluate(x=X_test, y=y_test, verbose=2)

# Report both numbers
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## Step 2: Export the Model

In [None]:
# Write the trained network to disk in the .keras format
snap_model.save(filepath="snap_participation.keras")