# COVID-19 & Health Predictor 

#### - Data Bootcamp Final Project
##### - Cordell, Gibbs, Miller, Ross

# Purpose: 
### Analyze COVID-19 Positivity or Negativity Based on Correlation to Nutrition

# Model: Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

## Library & Dependency Installation

In [None]:
# Scikit-Learn Libraries/Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Data Frame & Data Manipulation Libraries/Dependencies
import pandas as pd

## Loading the Data

#### Physical Path Import

In [None]:
# Path to the CSV file on disk -- replace the placeholder before running
file_path = "insert_file_path"

# Load the CSV contents into the working DataFrame
dataframe = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the Data: show the first 15 rows of the loaded DataFrame
dataframe.head(15)

## Connect to Database

In [None]:
# Install Postgres Driver
# Notebook shell escape (`!`): downloads the PostgreSQL JDBC driver jar
# (version 42.2.16) into the current working directory.
# NOTE(review): the connection cell in this notebook talks to PostgreSQL through
# psycopg2, which presumably does not use this jar -- confirm whether the
# download is still needed.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

##### Build Database Connection

In [None]:
# Import Module to Communicate with PostgreSQL
import psycopg2 as pg

# Import Password Protector
from getpass import getpass

# Build the connection to the database.
# Keyword arguments are used instead of a hand-written DSN string for two reasons:
#   * a plain quoted string cannot span several lines, and
#   * getpass() must be *called* so the password is prompted for at run time;
#     inside a string literal it would just be sent as literal text.
engine = pg.connect(
    dbname="my_db_name",
    user="pguser",
    host="127.0.0.1",
    port="15432",
    password=getpass("pgpassword"),
)

# Pull the whole table into the working DataFrame
dataframe = pd.read_sql("select * from Stat_Table", con=engine)

In [None]:
# # Import Module to Communicate with PostgreSQL
# import psycopg2

# # Connection parameters - yours will be different
# param_dic = {
#     "host"      : "localhost",
#     "database"  : "globaldata",
#     "user"      : "myuser",
#     "password"  : "Passw0rd"
# }

# # Define the connection function
# def connect(params_dic):
#     """ Connect to the PostgreSQL database server """
#     conn = None
#     try:
#         # connect to the PostgreSQL server
#         print('Connecting to the PostgreSQL database...')
#         conn = psycopg2.connect(**params_dic)
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         sys.exit(1) 
#     print("Connection successful")
#     return conn


In [None]:
# # Connect to Database
# connect(param_dic)

##### Retrieve Table/DataFrame from Database

In [None]:
# # Define our Data Table Retrieval Function
# def postgresql_to_dataframe(conn, select_query, column_names):
#     """
#     Transform a SELECT query into a pandas dataframe
#     """
#     cursor = conn.cursor()
#     try:
#         cursor.execute(select_query)
#     except (Exception, psycopg2.DatabaseError) as error:
#         print("Error: %s" % error)
#         cursor.close()
#         return 1
    
#     # Naturally we get a list of tuples
#     tupples = cursor.fetchall()
#     cursor.close()
    
#     # We just need to turn it into a pandas dataframe
#     df = pd.DataFrame(tupples, columns=column_names)
#               # *** Potentially change this ^ so that we don't have to input column names by hand ***
#     return df

In [None]:
# # Retrieve Data
# postgresql_to_dataframe(conn, select_query, column_names)

# # Rename DataFrame Variable
# dataframe = df

## Process & Preprocess the Data

### Clean the Data

##### Identify Categorical Variables/Columns

In [None]:
# Generate our categorical variable list: the names of every column whose
# dtype is "object". The per-column dtypes live on `DataFrame.dtypes`
# (there is no `DataFrame.types` attribute).
categorical_columns = dataframe.dtypes[dataframe.dtypes == "object"].index.tolist()

# Check the number of unique values in each categorical column
dataframe[categorical_columns].nunique()

##### Determine Bucketing - if a categorical column has more than 10 unique values, consider binning!


In [1]:
## OPTIONAL ##

## Analyze frequencies of unique values in categorical column(s)
# column1_counts = dataframe.insert_column_name_here.value_counts()

## Plot the value counts - then determine the cut off(s) for binning
# column1_counts.plot.density()

In [2]:
## OPTIONAL ##

## Determine which values to replace
# replace_column1 = list(column1_counts[column1_counts < cutoff_value].index)

## Replace the column values you'd like to bucket together in the DataFrame
# for value in replace_column1:
  # dataframe.insert_column_name_here = dataframe.insert_column_name_here.replace(value,"New_Bucketed_Value_Name-Ex:'Other'")

## Check to make sure the bucketing/binning was successful
# dataframe.insert_column_name_here.value_counts()

##### Encode the Categorical Values

In [None]:
# Import the LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# Instantiate LabelEncoder
encoder = LabelEncoder()

# Work on a copy so the original dataframe keeps its raw text values
dataframe_copy = dataframe.copy()

# Fit the encoder on each categorical column in turn and replace its text
# values with integer codes. The column label is used directly (not wrapped in
# an f-string) so non-string column labels are looked up correctly.
# NOTE: the same encoder object is refit for every column, so after the loop it
# only holds the classes of the last column processed.
for column in categorical_columns:
    dataframe_copy[column] = encoder.fit_transform(dataframe_copy[column])

# Preview the converted table
dataframe_copy.head(15)

### Dataset Training & Test Splitting

##### Separate Features from Outcomes

##### Features(X)

In [None]:
# Features are every column of the encoded DataFrame except the outcome column;
# drop() returns a new DataFrame, so dataframe_copy itself is left untouched
X = dataframe_copy.drop(columns=["insert_outcome_variable/column_name_here"])

# Show the first 15 rows of the features
X.head(15)

##### Outcomes (y)

In [None]:
# Create Outcomes Dataset from the encoded copy, so that y comes from the same
# processed table as X (a text outcome column in the raw dataframe would not be
# usable as a regression target)
y = dataframe_copy["insert_outcome_variable/column_name_here"].values

# Preview the data. `.values` yields a NumPy array, which has no `.head()`,
# so slice the first 15 entries instead.
y[:15]

#### Create Training & Testing Splits

In [None]:
# Stratified train/test split with a fixed seed for reproducibility.
# train_size=None keeps the default proportions (75% training / 25% testing).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=None,
    stratify=y,
    random_state=615,
)

# To change the split, set the train_size parameter to the desired training
# fraction, e.g. train_size=0.80 results in an 80%/20% split.

In [None]:
# Print the shape of each split, in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

#### Standardizing/Scaling the Data

In [None]:
# Instantiate the Scaler (a StandardScaler created with its default settings)
scaler = StandardScaler()

In [None]:
# Fit/Train Scaler to the Training Data only (the test split is not used here).
# X_scaler is bound to whatever fit() returns; the later cells call
# X_scaler.transform() on both splits.
X_scaler = scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to the training split, then to the testing split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Apply the Model

#### Instantiate, Fit, & Evaluate the Model

In [None]:
# Instantiate the Model (a LinearRegression created with its default settings)
lr_model = LinearRegression()

In [None]:
# Train the regression on the scaled training features and their outcomes
lr_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Create predictions with the model using the scaled X_test data.
# The model was fit on X_train_scaled, so it must be given test inputs that
# went through the same scaler; the raw X_test is on a different scale.
y_pred = lr_model.predict(X_test_scaled)

# View the shape of the predictions
print(y_pred.shape)

#### Evaluate the Model

In [None]:
# Import the Evaluation Metrics & Dependencies
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Evaluate the Model's Accuracy (Against y_test)
model_accuracy = accuracy_score(y_test,y_pred)

In [None]:
# Evaluate the Model's Precision & Sensitivity (Against y_test)
matrix = confusion_matrix(y_test,y_pred)

# Convert Confusion Matrix into a DataFrame
matrix_df = pd.DataFrame(
    matrix, index=["Actual Positive", "Actual Negative"], columns=["Predicted Positive", "Predicted Negative"]
)

In [None]:
# Evaluate the Model with Statistical Metrics (Against y_test)
model_classification_report = classification_report(y_test, y_pred)

##### Display Evaluation

In [None]:
# Display the results
# model_accuracy is a fraction between 0 and 1; scale it so "/100" is accurate
print(f"This model's accuracy is {model_accuracy * 100:.2f}/100.")
# Adjacent string literals are concatenated, which lets the two-row legend span
# source lines without an unterminated-string syntax error
print(
    "Confusion Matrix displaying:   True Positive | False Negative\n"
    "                               False Positive | True Negative"
)
# display() is provided by the notebook (IPython) environment
display(matrix_df)
print("Statistical Metrics via Classification Report")
print(model_classification_report)


## Results & Takeaways

#### Model Effectiveness & Applications Summary

*Insert Model Summary & Results Write-Up Here*

#### Feature Importances & *Weights*

In [None]:
# Create a sorted list of features from most to least influential on the outcome.
# LinearRegression exposes its fitted weights as `coef_` (it has no
# `feature_importances_`). The model was fit on standardized features, so the
# weights are ranked by absolute value; the sign shows the direction of the effect.
sorted(zip(lr_model.coef_, X.columns), key=lambda pair: abs(pair[0]), reverse=True)

*Insert Feature Importances Summary & Results Write-Up Here*