In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# Load Dataset

In [4]:
# Read the student-performance CSV from the Kaggle input directory
# and preview the first rows.
csv_path = "../input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

## EDA

I did a separate notebook of the EDA for this data some time ago; find the link [here](https://www.kaggle.com/raimiazeezbabatunde/student-performance-eda/notebook).

Please drop comments, suggestions and recommendations. Thanks 

## Data Prep

In [5]:
# Average student score: row-wise mean of the columns at positions 5, 6 and 7.
# NOTE(review): presumably these are the three exam-score columns — confirm
# against the df.head() preview.
score_block = df.iloc[:, 5:8]
avg = score_block.mean(axis="columns")

In [6]:
# Store each student's average, rounded to one decimal place, as a new column
# and preview the result.
df["Average Score"] = avg.round(1)
df.head()

In [7]:
# Create grade groups from the average score.
# Bands include their lower bound and exclude their upper bound, except the
# top band, which includes 100:
#    0 to <40   = fail
#   40 to <50   = pass
#   50 to <70   = second class lower
#   70 to <80   = second class upper
#   80 to 100   = first class
# Half-open comparisons leave no gap between neighbouring bands, so the
# grouping no longer depends on the score having been rounded to one decimal.
avg_score = df["Average Score"]

# One boolean mask per band, in the same order as `values` below.
criteria = (
    avg_score.ge(0.0) & avg_score.lt(40.0),
    avg_score.ge(40.0) & avg_score.lt(50.0),
    avg_score.ge(50.0) & avg_score.lt(70.0),
    avg_score.ge(70.0) & avg_score.lt(80.0),
    avg_score.between(80.0, 100),
)

# Label assigned to the rows matched by the mask at the same position.
values = ["fail", "pass", "second class lower", "second class upper", "first class"]

In [8]:
# Add grade column: each row receives the label of the first matching criterion.
# The fallback for rows matching no criterion must be a string, like the labels:
# an integer default (0) cannot be combined with string choices on recent NumPy
# releases and raises a TypeError. "0" is the text older NumPy releases produced
# when they coerced the integer default, so existing output is unchanged.
df['Grade'] = np.select(criteria, values, default="0")
df.head()

In [9]:
# Save the dataset, including the added "Average Score" and "Grade" columns,
# to a CSV file in the current working directory.
# NOTE(review): the row index is written as an extra unnamed first column;
# consider index=False if nothing downstream relies on it.
df.to_csv(path_or_buf="Studentdataperformance2.csv")

In [10]:
# Feature frame: the dataset without the two derived columns.
df_f = df.drop(columns=["Average Score", "Grade"])

In [11]:
# Print a summary (columns, non-null counts, dtypes) of the feature frame.
df_f.info()

# Logistic Regression Model

## Create Processing Pipeline
This is done to automate processing, so we can pass our raw data into the model directly and it gets preprocessed in the background. 

In [12]:
# StandardScaler for the numerical data.
num_preprocessor = StandardScaler()

# OneHotEncoder for the categorical data; categories that were not seen while
# fitting are ignored at transform time instead of raising an error.
cat_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [13]:
# Use the column dtypes to split the feature frame into two groups.

# Categorical features: every non-numeric column. Selecting by "not a number"
# works whether pandas stores text as object or as a dedicated string dtype.
x_cat = df_f.select_dtypes(exclude="number").columns

# Numerical features: every numeric column. Matching the whole numeric family
# instead of only "int64" keeps float or other-width integer columns from being
# left out of both groups.
x_num = df_f.select_dtypes(include="number").columns

In [14]:
# Create the preprocessor: each (name, transformer, columns) entry applies the
# transformer only to the listed columns.
column_steps = [
    ('one-hot-encoder', cat_preprocessor, x_cat),
    ('standard-scaler', num_preprocessor, x_num),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [16]:
# Create the logistic-regression pipeline: the preprocessor followed by a
# LogisticRegression classifier limited to 500 solver iterations.
lr_classifier = LogisticRegression(max_iter=500)
lr_model = make_pipeline(preprocessor, lr_classifier)

In [17]:
# Switch scikit-learn's estimator display to the diagram representation,
# then show the logistic-regression pipeline as the cell output.
set_config(display='diagram')
lr_model

## Split into Features and Target

In [18]:
# Features: the first eight columns of the feature frame.
features = df_f.iloc[:, :8]

# Target: the grade label, selected by name. Picking "the last column" by
# position would silently return a different column if anything were ever
# appended to df after "Grade".
target = df["Grade"]

# NOTE(review): "Grade" is derived from "Average Score", which is the row mean
# of df columns 5:8, and those columns are still part of `features`. The target
# is therefore a deterministic function of the inputs (target leakage) —
# confirm this is intended before interpreting the accuracies below.

## Split Train Test

In [20]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=1,
)

In [21]:
# Show the (rows, columns) shape of the training features.
x_train.shape

In [22]:
# Show the shape of the training target.
y_train.shape

## Fit & Predict Model

In [23]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
# fit() returns the pipeline itself, so `mod` refers to the same object as `lr_model`.
mod = lr_model.fit(X=x_train, y=y_train)

In [24]:
# Predict a grade label for every row of the held-out test split.
pred = lr_model.predict(X=x_test)

In [25]:
# Score the fitted pipeline on the held-out test split.
lr_model.score(X=x_test, y=y_test)

In [26]:
# Evaluate the logistic-regression predictions: share of test rows whose
# predicted grade equals the true grade, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy: %.2f' % (100 * accuracy))

In [27]:
# Evaluation with cross-validation: 5-fold cross-validation of the
# logistic-regression pipeline on the full feature/target set.
cv_lr = cross_validate(estimator=lr_model, X=features, y=target, cv=5)
cv_lr

In [28]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_lr["test_score"]
summary = (
    "The mean cross-validation accuracy is: "
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)
print(summary)

# Naïve Bayes

## Create Processing Pipeline

In [29]:
# Naive Bayes pipeline: the shared preprocessor followed by a GaussianNB classifier.
# NOTE(review): GaussianNB needs a dense feature array — confirm the
# preprocessor does not emit a sparse matrix for this data.
nb_classifier = GaussianNB()
nb_model = make_pipeline(preprocessor, nb_classifier)

In [30]:
# Fit the Naive Bayes pipeline on the training split.
nb_model.fit(X=x_train, y=y_train)

In [31]:
# Predict a grade label for every row of the held-out test split.
nb_pred = nb_model.predict(X=x_test)

In [32]:
# Evaluate the Naive Bayes predictions: share of test rows whose predicted
# grade equals the true grade, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=nb_pred)
print('Accuracy: %.2f' % (100 * accuracy))

In [33]:
# Evaluation with cross-validation: 5-fold cross-validation of the
# Naive Bayes pipeline on the full feature/target set.
cv_nb = cross_validate(estimator=nb_model, X=features, y=target, cv=5)
cv_nb

In [34]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_nb["test_score"]
summary = (
    "The mean cross-validation accuracy is: "
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)
print(summary)

# Stochastic Gradient Descent

In [35]:
# SGD pipeline: the shared preprocessor followed by a linear classifier trained
# with stochastic gradient descent (modified Huber loss). The fixed
# random_state keeps the shuffling of the training data repeatable.
sgd_classifier = SGDClassifier(
    loss="modified_huber",
    shuffle=True,
    random_state=101,
)
sgd_model = make_pipeline(preprocessor, sgd_classifier)

In [36]:
# Fit the SGD pipeline on the training split.
sgd_model.fit(X=x_train, y=y_train)

In [37]:
# Predict a grade label for every row of the held-out test split.
sgd_pred = sgd_model.predict(X=x_test)

In [38]:
# Evaluate the SGD predictions: share of test rows whose predicted grade
# equals the true grade, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=sgd_pred)
print('Accuracy: %.2f' % (100 * accuracy))

In [39]:
# Evaluation with cross-validation accuracy: 5-fold cross-validation of the
# SGD pipeline on the full feature/target set.
cv_sgd = cross_validate(estimator=sgd_model, X=features, y=target, cv=5)
cv_sgd

In [40]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_sgd["test_score"]
summary = (
    "The mean cross-validation accuracy is: "
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)
print(summary)