<hr style="border:5px solid #141F31">

# 365 Data Science: Confusion Matrix with statsmodels in Python

<hr style="border:4.5px solid #141F31">

## Step 1 Import the libraries

In [None]:
# Data manipulation
import pandas as pd 

# Running the regression
import statsmodels.api as sm

# The Python package for scientific computing
import numpy as np

# Main visualization module
import matplotlib.pyplot as plt

# Seaborn's style on top of matplotlib
import seaborn as sns
# Override the default matplotlib styles with those of seaborn.
# 'set_theme()' is the preferred interface; 'set()' is only kept as an alias for it.
sns.set_theme()

## Step 2 Load the data

In [None]:
# Read the dataset from a .csv file stored in the same folder as this notebook.
# (A comma is already the default field separator of 'read_csv'.)
data = pd.read_csv('Admittance - logistic regression.csv')

In [None]:
# Preview the first five rows of the loaded data
data.head(n=5)

## Step 3 Map the categorical values to numerical ones

In [None]:
# Work on a copy, stored in 'data_mapped', so that the loaded 'data' stays untouched
data_mapped = data.copy()

# Encode the 'Admitted' column numerically: 'Yes' -> 1 and 'No' -> 0.
# 'Series.map' silently turns every value that is missing from the dictionary into NaN,
# so verify that each entry was recognised before the column is used as a regression target.
data_mapped['Admitted'] = data_mapped['Admitted'].map({'Yes': 1, 'No': 0})
assert data_mapped['Admitted'].notna().all(), "'Admitted' contains values other than 'Yes'/'No'"

# Preview the mapped data instead of dumping the whole dataframe
data_mapped.head()

## Step 4 Define the inputs and the targets

In [None]:
# Independent variable: the SAT score
x1 = data_mapped['SAT']

# Dependent variable: the admittance outcome
y = data_mapped['Admitted']

## Step 5 Visualize the data

In [None]:
# Scatter plot of y versus x1, drawn on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.scatter(x1, y)

# Name both axes
ax.set_xlabel('SAT', fontsize=20)
ax.set_ylabel('Admitted', fontsize=20)
plt.show()

## Step 6 Perform the regression

In [None]:
# Prepend a new column (equal in length to x1) consisting only of ones
x = sm.add_constant(x1, prepend=True)
x.head(n=5)

In [None]:
# Logistic regression in statsmodels is provided by the Logit class.
# Build the model with y as the dependent ('endog') and x as the independent ('exog') data
reg_log = sm.Logit(endog=y, exog=x)

# Estimate the model
results_log = reg_log.fit()

## Step 7 Construct the confusion matrix

In [None]:
# Print floats inside numpy arrays with two decimals, so that the results are easier to read.
# Note that this setting is global and also applies to arrays printed by later cells.
np.set_printoptions(formatter={'float': "{0:0.2f}".format})

# 'predict()' returns the values predicted by the fitted model
results_log.predict()

In [None]:
# Show the actual outcomes as a numpy array
np.array(data_mapped.loc[:, 'Admitted'])

In [None]:
# The predicted values and the true values shown above now have to be compared.
# This is achieved through the 'pred_table()' method (0.5 is its default threshold).
# Its output is called a confusion matrix
results_log.pred_table(threshold=0.5)

In [None]:
# Wrap the confusion matrix in a labelled dataframe so that it is easier to read:
# the rows are labelled as actual classes and the columns as predicted classes
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

<a href="https://365datascience.com/resources-center/offer/?utm_medium=web&utm_source=rs&utm_campaign=web-rs-confusion-matrix-with-statsmodels-in-python&utm_content=confusion-matrix-with-statsmodels-in-python&utm_term=t">Start your 365 Journey!</a>
