## **Student Performance analysis using supervised learning**

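- Predicting student performance (pass/fail) from a dataset with the following features.
- **Note:** the notebook loads `student-mat.csv`, whose actual column names (`sex`, `studytime`, `absences`, `G1`–`G3`, …) differ from the illustrative names in this table; `final_result` is derived from `G3`.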

| Feature Name     | Description                         | Data Type        |
|------------------|-------------------------------------|------------------|
| gender           | Student gender (Male / Female)      | Categorical      |
| study_hours      | Daily study hours                   | Numerical        |
| attendance       | Attendance percentage               | Numerical (%)    |
| previous_score   | Previous exam score                 | Numerical        |
| assignments      | Assignment completion percentage    | Numerical (%)    |
| internet         | Internet access (Yes / No)          | Categorical      |
| final_result     | Final academic result (Pass / Fail) | Target Variable  |



**Required Libraries**
- pip install pandas matplotlib seaborn scikit-learn joblib


### Import libraries 



In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib 


### Load Dataset 

In [9]:
# Read the semicolon-delimited student dataset and print its first five rows.
df = pd.read_csv("student-mat.csv", delimiter=';')
preview = df.head(5)
print(preview)


  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
3     GP   F   15       U     GT3       T     4     2   health  services  ...   
4     GP   F   16       U     GT3       T     3     3    other     other  ...   

  famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0      4        3      4     1     1      3        6   5   6   6  
1      5        3      3     1     1      3        4   5   5   6  
2      4        3      2     2     3      3       10   7   8  10  
3      3        2      2     1     1      5        2  15  14  15  
4      4        3      2     1     2      5        4   6  10  10  

[5 rows x 33 columns]


### Data cleaning 


In [10]:
# Report how many values are missing in each column, then remove every row
# that contains at least one missing value (df is modified in place).
missing_per_column = df.isna().sum()
print(missing_per_column)
df.dropna(axis=0, how='any', inplace=True)

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64


In [11]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

(395, 33)

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


### Exploratory Data Analysis 



In [13]:
# Create the binary target variable.
# G3 spans 0-20 in this data (see df.describe()); a value of at least
# PASS_MARK is labelled 1 (pass), anything lower 0 (fail).
PASS_MARK = 10

# Vectorised comparison instead of a per-row Python lambda; the boolean mask
# is cast to int64 so the column holds the same 0/1 integers as before.
df['final_result'] = (df['G3'] >= PASS_MARK).astype('int64')

In [14]:
# Remove the grade columns from the feature set (df is modified in place).
# G3 is the column final_result was derived from, so leaving it in would leak
# the label into the features. G1 and G2 are presumably earlier grades -
# TODO confirm against the dataset description.
grade_columns = ['G1', 'G2', 'G3']
df.drop(columns=grade_columns, inplace=True)

In [15]:
# Preview the first rows of the frame, which now ends with the final_result column.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,final_result
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,1


In [16]:
# Encode categorical (object-dtype) columns as integer codes.
#
# One LabelEncoder is fitted per column and kept in `label_encoders`, keyed by
# column name. Refitting a single shared encoder would overwrite its classes_
# on every iteration, so only the last column's mapping would survive and the
# codes could not be inverted or re-applied to new data later.
label_encoders = {}

# `le` stays bound even when there is nothing to encode; after the loop it
# refers to the encoder fitted on the last categorical column.
le = LabelEncoder()

for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

### Splitting independent variables (X) and dependent variable (y)

In [17]:
# Separate the label from the predictors: y is the final_result column,
# X is every remaining column of df.
y = df['final_result']
X = df.drop(columns=['final_result'])

### Splitting train and test datasets 

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the size of the training features and training labels.
print(X_train.shape, y_train.shape, sep="\n")

(316, 30)
(316,)


### Model Training 

#### Logistic Regression 



In [24]:

# Fit a logistic-regression classifier on the training split (iteration cap
# raised to 1000), predict the held-out rows and report the accuracy.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy :", lr_accuracy)

Logistic Regression Accuracy : 0.759493670886076


In [None]:
### Decision Tree classifier 




dt = DecisionTreeClassifier()