# Implementing Linear Regression To Predict Student Performance

## Importing Necessary Packages

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import joblib

## Loading The Data

In [2]:
# Read the student records from a CSV file; the relative path is resolved against
# the notebook's current working directory.
data = pd.read_csv("Student_Performance.csv")

In [3]:
data.head(5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91
1,4,82,No,4,2,65
2,8,51,Yes,7,2,45
3,5,52,Yes,5,2,36
4,7,75,No,8,5,66


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Hours Studied                     10000 non-null  int64 
 1   Previous Scores                   10000 non-null  int64 
 2   Extracurricular Activities        10000 non-null  object
 3   Sleep Hours                       10000 non-null  int64 
 4   Sample Question Papers Practiced  10000 non-null  int64 
 5   Performance Index                 10000 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 468.9+ KB


## Data Cleaning And Pre-Processing

In [5]:
data.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [6]:
data.isna().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [7]:
data.duplicated().sum()

127

In [8]:
# Exclude the only non-numeric column (Yes/No text) so that the remaining columns are
# all integer-valued. Rebinding `data` instead of using inplace=True, together with
# errors="ignore", keeps this cell idempotent: re-running it does not raise KeyError
# when the column has already been removed.
data = data.drop(columns=["Extracurricular Activities"], errors="ignore")

In [9]:
data

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,9,1,91
1,4,82,4,2,65
2,8,51,7,2,45
3,5,52,5,2,36
4,7,75,8,5,66
...,...,...,...,...,...
9995,1,49,4,2,23
9996,7,64,8,5,58
9997,6,83,8,5,74
9998,9,97,7,0,95


## Separating Input And Output Data

In [10]:
# Target: the column the model should learn to predict.
y = data["Performance Index"]
# Features: every remaining column of the frame.
X = data.drop(columns=["Performance Index"])

In [11]:
X

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced
0,7,99,9,1
1,4,82,4,2
2,8,51,7,2
3,5,52,5,2
4,7,75,8,5
...,...,...,...,...
9995,1,49,4,2
9996,7,64,8,5
9997,6,83,8,5
9998,9,97,7,0


In [12]:
y

0       91
1       65
2       45
3       36
4       66
        ..
9995    23
9996    58
9997    74
9998    95
9999    64
Name: Performance Index, Length: 10000, dtype: int64

## Splitting Data Into Training And Testing Data

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=45,
)

In [14]:
X_train

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced
6298,2,63,5,8
9976,8,93,9,8
1640,8,74,5,6
5280,8,78,8,1
9002,9,97,9,1
...,...,...,...,...
8772,3,68,5,7
163,5,72,8,1
6012,9,60,4,6
6558,2,88,4,3


## Selecting The Model



In [15]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

## Model Training

In [16]:
model.fit(X_train,y_train)

## Prediction On Training Data

In [17]:
y_predict_train = model.predict(X_train)

In [18]:
from sklearn.metrics import r2_score,mean_absolute_error

In [19]:
y_predict_train

array([40.0483257 , 89.58471597, 67.98740504, ..., 56.12016989,
       64.07528804, 40.27355553])

In [20]:
model.score(X_train,y_train)

0.9884462729180128

In [21]:
r2_score(y_train,y_predict_train)

0.9884462729180128

In [22]:
mean_absolute_error(y_train,y_predict_train)

1.637226709256773

## Prediction On Test Data

In [23]:
y_predict_test = model.predict(X_test)

In [34]:
model.score(X_test,y_test)

0.9886114537511869

In [25]:
r2_score(y_test,y_predict_test)

0.9886114537511869

In [26]:
mean_absolute_error(y_test,y_predict_test)

1.6383416024700819

## Saving And Deploying Model

In [32]:
# Save the trained model to "model.pkl" (relative to the current working directory)
joblib.dump(model ,"model.pkl")

['model.pkl']

In [33]:
# Load the trained model back from "model.pkl" and rebind `model` to it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so only
# load model files that come from a trusted source.
model = joblib.load("model.pkl")

def _ask_number(prompt):
    """Prompt repeatedly until the user enters a finite number; return it as a float."""
    while True:
        raw = input(prompt)
        try:
            value = float(raw)
        except ValueError:
            print(f"'{raw}' is not a valid number, please try again.")
            continue
        # float() also accepts "nan" and "inf", which are not meaningful answers here.
        if np.isfinite(value):
            return value
        print(f"'{raw}' is not a valid number, please try again.")


def predict_student_performance():
    """Interactively collect four study-habit answers and print the predicted score.

    The answers are read from standard input, assembled into a single-row
    DataFrame and passed to the module-level ``model``. The prediction is
    printed with two decimals; nothing is returned.

    An answer that cannot be parsed as a finite number is rejected and the same
    question is asked again instead of aborting with ``ValueError``.
    """
    print("Please answer the following 4 questions:")

    # The questions are asked in the same order as the feature columns below,
    # and are numbered in the order in which they are asked.
    study_hours = _ask_number("1. How many hours do you study daily? ")
    previous_score = _ask_number("2. What was your previous test score? ")
    sleep_hours = _ask_number("3. How many hours do you sleep daily? ")
    sample_question_paper = _ask_number("4. How many past question papers you practised? ")

    # Use a DataFrame with the training column names rather than a bare array:
    # an estimator fitted on named columns warns when it receives unnamed input,
    # and the names document (and let scikit-learn check) the feature order.
    inputs = pd.DataFrame(
        [[study_hours, previous_score, sleep_hours, sample_question_paper]],
        columns=[
            "Hours Studied",
            "Previous Scores",
            "Sleep Hours",
            "Sample Question Papers Practiced",
        ],
    )

    # Predict using model
    predicted_score = model.predict(inputs)

    print(f"\nPredicted performance score: {predicted_score[0]:.2f}")

# Call the function
predict_student_performance()


Please answer the following 4 questions:


1. How many hours do you study daily?  5
4. What was your previous test score?  67
2. How many hours do you sleep daily?  10
3. How many past question papers you practised?  0



Predicted performance score: 53.49


