# Import dependencies and data

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2
from config import db_password

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the college statistics table from the local PostgreSQL database.
# The password is imported from the local `config` module rather than being
# written into the notebook.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/University_Salary"
engine = create_engine(db_string)

df = pd.read_sql_table(table_name="college_statistics", con=engine)
df.head()

# Data cleaning and feature engineering

In [None]:
# Count the non-null entries in each column; a column whose count is lower
# than the others contains missing values.
df.notna().sum()

In [None]:
# Replace every missing value with 0, then recount the non-null entries to
# confirm that nothing is left missing.
df = df.fillna(0)
df.count()

In [None]:
# Remove the columns that are not needed for modelling, then list the types
# of the columns that remain.
unused_columns = ["University_Name", "Rank", "Mid_Career_Pay"]
df = df.drop(columns=unused_columns)
df.dtypes

In [None]:
# Convert the target column values to low, medium, and high income.
# This cell was empty, so Early_Career_Pay was never turned into classes.
# Split it into three equally populated groups (terciles) and store the
# labels as plain strings.
# NOTE(review): tercile cut points are an assumption; replace them with fixed
# salary thresholds if the project defines specific income bands.
df["Early_Career_Pay"] = pd.qcut(
    df["Early_Career_Pay"],
    q=3,
    labels=["low", "medium", "high"],
).astype(str)
df["Early_Career_Pay"].value_counts()

In [None]:
# One-hot encode the categorical feature columns and recheck column types.
# Early_Career_Pay is the target, so it is deliberately left out of the
# encoding: get_dummies replaces every listed column with indicator columns,
# and the later lookup of df_binary["Early_Career_Pay"] would then raise a
# KeyError.
categorical_features = ["State", "Region", "Division", "Type", "Degree_Length"]
df_binary = pd.get_dummies(df, columns=categorical_features)
df_binary.dtypes

# Create features and target and split into training and testing

In [None]:
# Separate the column being predicted (y) from the model inputs (X).
y = df_binary["Early_Career_Pay"]
X = df_binary.drop(columns=["Early_Career_Pay"])

In [None]:
# Visualize the classes using the first two feature columns.
# X is a DataFrame, so positional column access has to go through .iloc;
# X[:, 0] is NumPy-style indexing and fails on a DataFrame. The target is
# mapped to integer category codes so matplotlib can use it as a color value
# even when the labels are strings.
class_codes = y.astype("category").cat.codes
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=class_codes)
plt.xlabel(X.columns[0])
plt.ylabel(X.columns[1])
plt.show()

In [None]:
# Split the features and target into training and testing sets using the
# default test size; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Logistic Regression Model

In [None]:
# Build the logistic regression model and fit it on the training split.
classifier = LogisticRegression(solver="lbfgs", random_state=1)
classifier.fit(X_train, y_train)

# Predict the test split and show the predictions next to the actual labels.
predictions = classifier.predict(X_test)
comparison = {"Prediction": predictions, "Actual": y_test}
results = pd.DataFrame(comparison).reset_index(drop=True)
results.head(10)

In [None]:
# Confusion matrix
# The predicted labels are stored in `predictions`; `y_pred` is not defined in
# any of the visible cells, so passing it raised a NameError.
matrix = confusion_matrix(y_test, predictions)
print(matrix)

In [None]:
# Classification report
# The predicted labels are stored in `predictions`; `y_pred` is not defined in
# any of the visible cells, so passing it raised a NameError.
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Validate the model on the test data: compare the predicted labels with the
# actual labels and print the resulting accuracy score.
accuracy = accuracy_score(y_test, predictions)
print(accuracy)