# Students' Habits Vs Their Academic Performance Data Exploration

## Importing the libraries

In [1]:
import pandas as pd
import sklearn as skl
# Load the scikit-learn submodules used below explicitly: a bare
# `import sklearn` does not guarantee that `skl.linear_model`, `skl.metrics`
# and `skl.model_selection` are available as attributes on every release.
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

## Loading the dataset

In [3]:
# Read the raw survey file (expected next to the notebook) and display it.
DATASET_FILE = 'student_habits_performance.csv'
data = pd.read_csv(DATASET_FILE)
data

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,S1995,21,Female,2.6,0.5,1.6,No,77.0,7.5,Fair,2,High School,Good,6,Yes,76.1
996,S1996,17,Female,2.9,1.0,2.4,Yes,86.0,6.8,Poor,1,High School,Average,6,Yes,65.9
997,S1997,20,Male,3.0,2.6,1.3,No,61.9,6.5,Good,5,Bachelor,Good,9,Yes,64.4
998,S1998,24,Male,5.4,4.1,1.1,Yes,100.0,7.6,Fair,0,Bachelor,Average,1,No,69.7


## Data Cleaning

### Checking for duplicates

In [4]:
# Show every row that is an exact repeat of an earlier row (empty if none).
data.loc[data.duplicated()]

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score


### Checking for missing values

In [6]:
# Count nulls per column once, then keep only the columns that have any.
null_counts = data.isnull().sum()
null_counts[null_counts > 0]

parental_education_level    91
dtype: int64

### Filling the missing values
The null values in **parental_education_level** are replaced with the category *Uneducated*.

In [15]:
# Label rows whose parental education level is missing as 'Uneducated'.
parental_education = data['parental_education_level']
data['parental_education_level'] = parental_education.fillna('Uneducated')

## Data Preprocessing

### Changing the gender column

In [18]:
# List the distinct gender labels present before encoding.
data['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [19]:
# Integer codes for the gender labels.
# NOTE(review): gender is a nominal category, so these codes impose an
# arbitrary order on a linear model — consider one-hot encoding instead.
gender_codes = {'Female': 0, 'Male': 1, 'Other': 2}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['gender']):
    data['gender'] = data['gender'].map(lambda label: gender_codes[label])

In [20]:
# Confirm that the gender column now contains only integer codes.
data['gender'].unique()

array([0, 1, 2])

### Changing the part time job column

In [26]:
# List the distinct part-time-job labels present before encoding.
data['part_time_job'].unique()

array(['No', 'Yes'], dtype=object)

In [27]:
# Binary flag: 0 = no part-time job, 1 = has a part-time job.
part_time_job_codes = {'No': 0, 'Yes': 1}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['part_time_job']):
    data['part_time_job'] = data['part_time_job'].map(
        lambda label: part_time_job_codes[label]
    )

In [28]:
# Confirm that the part-time-job column now contains only 0/1 codes.
data['part_time_job'].unique()

array([0, 1])

### Changing the diet quality column

In [29]:
# List the distinct diet-quality labels present before encoding.
data['diet_quality'].unique()

array(['Fair', 'Good', 'Poor'], dtype=object)

In [30]:
# Ordinal codes: a higher number means a better diet.
diet_quality_codes = {'Poor': 0, 'Fair': 1, 'Good': 2}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['diet_quality']):
    data['diet_quality'] = data['diet_quality'].map(
        lambda label: diet_quality_codes[label]
    )

In [31]:
# Confirm that the diet-quality column now contains only integer codes.
data['diet_quality'].unique()

array([1, 2, 0])

### Changing the parental education level column

In [32]:
# List the distinct parental-education labels present before encoding.
data['parental_education_level'].unique()

array(['Master', 'High School', 'Bachelor', 'Uneducated'], dtype=object)

In [33]:
# Ordinal codes: a higher number means a higher level of education.
parental_education_codes = {
    'Uneducated': 0,
    'High School': 1,
    'Bachelor': 2,
    'Master': 3,
}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['parental_education_level']):
    data['parental_education_level'] = data['parental_education_level'].map(
        lambda label: parental_education_codes[label]
    )

In [34]:
# Confirm that the parental-education column now contains only integer codes.
data['parental_education_level'].unique()

array([3, 1, 2, 0])

### Changing the internet quality column

In [35]:
# List the distinct internet-quality labels present before encoding.
data['internet_quality'].unique()

array(['Average', 'Poor', 'Good'], dtype=object)

In [36]:
# Ordinal codes: a higher number means a better internet connection.
internet_quality_codes = {'Poor': 0, 'Average': 1, 'Good': 2}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['internet_quality']):
    data['internet_quality'] = data['internet_quality'].map(
        lambda label: internet_quality_codes[label]
    )

In [37]:
# Confirm that the internet-quality column now contains only integer codes.
data['internet_quality'].unique()

array([1, 0, 2])

### Changing the extracurricular participation column

In [38]:
# List the distinct extracurricular-participation labels before encoding.
data['extracurricular_participation'].unique()

array(['Yes', 'No'], dtype=object)

In [39]:
# Binary flag: 0 = does not participate, 1 = participates.
extracurricular_codes = {'No': 0, 'Yes': 1}

# Encode only while the column still holds text labels: the unconditional
# lookup raised KeyError when this cell was executed a second time.
# An unexpected label still raises KeyError instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(data['extracurricular_participation']):
    data['extracurricular_participation'] = (
        data['extracurricular_participation'].map(
            lambda label: extracurricular_codes[label]
        )
    )

In [40]:
# Confirm that the extracurricular column now contains only 0/1 codes.
data['extracurricular_participation'].unique()

array([1, 0])

## Model Building

In [41]:
# Feature matrix: every column except the row identifier and the target.
# errors='ignore' mirrors the original filter, which did not require the
# excluded columns to exist.
X = data.drop(columns=['student_id', 'exam_score'], errors='ignore')
X

Unnamed: 0,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation
0,23,0,0.0,1.2,1.1,0,85.0,8.0,1,6,3,1,8,1
1,20,0,6.9,2.8,2.3,0,97.3,4.6,2,6,1,1,8,0
2,21,1,1.4,3.1,1.3,0,94.8,8.0,0,1,1,0,1,0
3,23,0,1.0,3.9,1.0,0,71.0,9.2,0,4,3,2,1,1
4,19,0,5.0,4.4,0.5,0,90.9,4.9,1,3,3,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,21,0,2.6,0.5,1.6,0,77.0,7.5,1,2,1,2,6,1
996,17,0,2.9,1.0,2.4,1,86.0,6.8,0,1,1,1,6,1
997,20,1,3.0,2.6,1.3,0,61.9,6.5,2,5,2,2,9,1
998,24,1,5.4,4.1,1.1,1,100.0,7.6,1,0,2,1,1,0


In [42]:
# Regression target: the exam score column.
Y = data.loc[:, 'exam_score']
Y

0       56.2
1      100.0
2       34.3
3       26.8
4       66.4
       ...  
995     76.1
996     65.9
997     64.4
998     69.7
999     74.9
Name: exam_score, Length: 1000, dtype: float64

In [43]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = skl.model_selection.train_test_split(
    X, Y, random_state=0, test_size=0.1
)
X_train, X_test, Y_train, Y_test = split_parts

In [44]:
# Linear regression estimator created with scikit-learn's default settings.
regressor_class = skl.linear_model.LinearRegression
model = regressor_class()
model

In [45]:
# Fit the regression on the training split (the fitted estimator is displayed).
model.fit(X=X_train, y=Y_train)

In [46]:
# Predicted exam scores for the training features.
Y_train_pred = model.predict(X=X_train)

In [47]:
# Predicted exam scores for the held-out test features.
Y_test_pred = model.predict(X=X_test)

### Prediction accuracy

For both the training and the test data, the root-mean-squared **error** of the predicted marks is around ***5 marks*** (about 5.3 on the training set and 5.0 on the test set).

In [51]:
# Root-mean-squared error between the training targets and their predictions.
skl.metrics.root_mean_squared_error(y_true=Y_train, y_pred=Y_train_pred)

5.334507121542313

In [52]:
# Root-mean-squared error between the test targets and their predictions.
skl.metrics.root_mean_squared_error(y_true=Y_test, y_pred=Y_test_pred)

5.010782493348715