# Task for Today  

***

## Student Admission Prediction  

Given *data about students' grades and test scores*, let's try to predict whether a given student will be **admitted** to graduate school.

We will use a logistic regression model to make our predictions. 

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the admissions dataset from a CSV file (path relative to the notebook) into a DataFrame.
data = pd.read_csv('../input/admission-predict/Admission_Predict.csv')

In [None]:
# Show the loaded DataFrame as the cell's output.
data

In [None]:
# Print pandas' concise summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

# Exploratory Data Analysis

In [None]:
# Descriptive statistics for the DataFrame's columns.
data.describe()

In [None]:
# Side-by-side polar charts comparing the average GRE, SOP and CGPA of
# non-admitted vs. admitted students. Each group mean is divided by the
# column maximum over the whole dataset so the three axes share one scale.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=["Not Admitted", "Admitted"]
)

data0 = data.query("admitted == 0")
data1 = data.query("admitted == 1")

# One trace per group: "Not Admitted" in column 1, "Admitted" in column 2.
for col, (label, subset) in enumerate(
    [("Not Admitted", data0), ("Admitted", data1)],
    start=1
):
    fig.add_trace(
        go.Scatterpolar(
            r=[
                subset[feature].mean() / data[feature].max()
                for feature in ('gre', 'sop', 'cgpa')
            ],
            theta=["GRE", "SOP", "CGPA"],
            fill='toself',
            name=label
        ),
        row=1,
        col=col
    )

fig.update_layout(title_text="Student Performance")

fig.show()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr = data.corr()

plt.figure(figsize=(12, 10))
# sns.heatmap draws on the current axes and returns them; title that Axes directly.
heatmap_axes = sns.heatmap(data=corr, vmin=-1.0, cmap='mako', annot=True)
heatmap_axes.set_title("Correlation Matrix")
plt.show()

# Training

In [None]:
# Show the DataFrame again before it is split into features and target.
data

In [None]:
# Separate the features (every column except 'admitted') from the target column.
# Both are copies, so later changes do not touch `data`.
X = data.drop(columns='admitted').copy()
y = data.loc[:, 'admitted'].copy()

In [None]:
# Evaluate logistic regression with 10-fold stratified cross-validation.
# The accuracy on each held-out fold is appended to `results`.
results = []

skf = StratifiedKFold(n_splits=10)

for train_idx, test_idx in skf.split(X, y):

    # Rows belonging to this fold's training and test partitions
    X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
    y_train, y_test = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

    # The scaler is fit on the training partition only and then applied to both
    scaler = StandardScaler().fit(X_train)

    X_train = pd.DataFrame(
        scaler.transform(X_train),
        index=X_train.index,
        columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test),
        index=X_test.index,
        columns=X_test.columns
    )

    model = LogisticRegression().fit(X_train, y_train)

    # Mean accuracy on the held-out fold
    results.append(model.score(X_test, y_test))

# Results

In [None]:
# Report the accuracy of each fold (numbered from 1), then the mean over all folds.
for fold_number, accuracy in enumerate(results, start=1):
    print("Fold {}: {:.2f}%".format(fold_number, accuracy * 100))

print("\nAverage Test Accuracy: {:.2f}%".format(np.mean(results) * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/no2sbnV-y2s