# 1. Data Loading and Exploration

In [4]:
from enum import unique
from statistics import correlation
from traceback import print_tb

import pandas as pd
# Load the raw student-habits dataset from the notebook's working directory.
df = pd.read_csv('student_habits_performance.csv')
# DataFrame.info() writes its report (entry count, dtypes, non-null counts per column)
# straight to stdout and returns None, so wrapping it in print() adds a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

             age  study_hours_per_day  social_media_hours  netflix_hours  \
count  1000.0000           1000.00000         1000.000000    1000.000000   
mean     20.4980              3.55010            2.505500       1.819700   
std       2.3081              1.46889            1.172422       1.075118   
min      17.0000              0.00000            0.000000       0.000000   
25%      18.7500              2.60000            1.700000       1.000000   
50%      20.0000              3.50000            2.500000       1.800000   
75%      23.0000              4.50000            3.300000       2.525000   
max      24.0000              8.30000            7.200000       5.400000   

       attendance_percentage  sleep_hours  exercise_frequency  \
count            1000.000000  1000.000000         1000.000000   
mean               84.131700     6.470100            3.042000   
std                 9.399246     1.226377            2.025423   
min                56.000000     3.200000            0.

# 2. Data Cleaning

#### Check for missing values

In [9]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64


#### Handle the missing values

In [18]:
# Fill missing parental education levels with an explicit 'No formal education' category.
education_col = 'parental_education_level'
df[education_col] = df[education_col].fillna('No formal education')
# Show the distinct values so the replacement can be verified.
df[education_col].unique()

array(['Master', 'High School', 'Bachelor', 'No formal education'],
      dtype=object)


#### Check for duplicate rows

In [19]:
# Number of rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0


# 3. Understanding the Data

#### Inspect unique values in columns like 'gender', 'part_time_job'

In [25]:
# Collect the distinct categories of each column, then report them.
gender_values = df['gender'].unique()
job_values = df['part_time_job'].unique()
print("Values for gender column: ", gender_values)
print("Values for Part Time Job column: ", job_values)

Values for gender column:  ['Female' 'Male' 'Other']
Values for Part Time Job column:  ['No' 'Yes']


#### Count how many students fall into each category

In [31]:
# Tally the students in every (gender, part_time_job) combination.
group_sizes = df.groupby(['gender', 'part_time_job']).size()
# Turn the grouped sizes back into a flat table with a 'counts' column.
counts = group_sizes.reset_index(name='counts')
print(counts)

   gender part_time_job  counts
0  Female            No     385
1  Female           Yes      96
2    Male            No     368
3    Male           Yes     109
4   Other            No      32
5   Other           Yes      10


# 4. Statistical Summary

#### Calculate average grades based on gender, study_hours, and part_time_job

In [46]:
# Mean exam score for each gender, rounded to two decimals for display only.
scores_by_gender = df.groupby('gender')['exam_score']
grades_gender = scores_by_gender.mean()
print("Grades on gender base\n ", grades_gender.round(2))


Grades on gender base
  gender
Female    69.74
Male      69.37
Other     70.65
Name: exam_score, dtype: float64
Grades on gender base
  study_hours_per_day
0.0     40.81
0.1     53.40
0.2     31.50
0.3     32.62
0.5     39.52
        ...  
7.4     99.23
7.5    100.00
7.6    100.00
8.2    100.00
8.3    100.00
Name: exam_score, Length: 78, dtype: float64


In [48]:
# Average exam score for each distinct value of study_hours_per_day.
grades_study = df.groupby('study_hours_per_day')['exam_score'].mean().reset_index()
# The label previously said "gender" (copy-paste leftover) although this table is keyed
# by study hours; rounding to two decimals is for display only.
print("Grades on study hours base\n ", grades_study.round(2))

Grades on gender base
      study_hours_per_day  exam_score
0                   0.0       40.81
1                   0.1       53.40
2                   0.2       31.50
3                   0.3       32.62
4                   0.5       39.52
..                  ...         ...
73                  7.4       99.23
74                  7.5      100.00
75                  7.6      100.00
76                  8.2      100.00
77                  8.3      100.00

[78 rows x 2 columns]


In [None]:
# Average exam score for students with and without a part-time job.
grades_job = df.groupby('part_time_job')['exam_score'].mean().reset_index()
# The label previously said "gender" (copy-paste leftover) although this table is keyed
# by part_time_job; rounding to two decimals is for display only.
print("Grades on part time job base\n ", grades_job.round(2))

#### Find correlation between continuous variables

In [54]:
# Pairwise Pearson correlation between the numeric columns: values near +1 mean two
# columns rise together, values near -1 mean one falls as the other rises, and values
# near 0 mean little linear relationship. Non-numeric columns are skipped.
correlation_cols = df.corr(method='pearson', numeric_only=True)
print(correlation_cols)

                            age  study_hours_per_day  social_media_hours  \
age                    1.000000             0.003971           -0.009151   
study_hours_per_day    0.003971             1.000000            0.020282   
social_media_hours    -0.009151             0.020282            1.000000   
netflix_hours         -0.001174            -0.031158            0.011477   
attendance_percentage -0.026055             0.026264            0.040479   
sleep_hours            0.037482            -0.027757            0.018236   
exercise_frequency    -0.003836            -0.028701           -0.037319   
mental_health_rating  -0.045101            -0.003768            0.001496   
exam_score            -0.008907             0.825419           -0.166733   

                       netflix_hours  attendance_percentage  sleep_hours  \
age                        -0.001174              -0.026055     0.037482   
study_hours_per_day        -0.031158               0.026264    -0.027757   
social_medi

# 5. Sorting and Filtering

#### List top 10 students with highest grades

In [61]:
# Order every student from the highest exam score to the lowest, then show the first ten.
sort_grades = df.sort_values('exam_score', ascending=False)
top_ten = sort_grades.head(10)
print(top_ten)

    student_id  age  gender  study_hours_per_day  social_media_hours  \
960      S1960   17    Male                  7.1                 1.9   
908      S1908   23  Female                  5.6                 2.8   
945      S1945   23    Male                  6.0                 2.9   
885      S1885   21    Male                  5.2                 0.0   
875      S1875   19  Female                  7.6                 3.0   
797      S1797   23    Male                  8.2                 2.2   
835      S1835   20  Female                  5.1                 2.6   
774      S1774   18  Female                  7.5                 3.6   
778      S1778   24    Male                  5.1                 0.5   
69       S1069   22    Male                  6.8                 3.7   

     netflix_hours part_time_job  attendance_percentage  sleep_hours  \
960            1.1           Yes                   69.3          5.6   
908            0.5           Yes                   92.2        

#### Filter students who study more than 4 hours/day but sleep less than 6 hours

In [67]:
# Students who study more than 4 hours a day yet sleep fewer than 6 hours.
studies_over_4h = df['study_hours_per_day'] > 4
sleeps_under_6h = df['sleep_hours'] < 6
std = df[studies_over_4h & sleeps_under_6h]
print(std)

    student_id  age  gender  study_hours_per_day  social_media_hours  \
1        S1001   20  Female                  6.9                 2.8   
4        S1004   19  Female                  5.0                 4.4   
7        S1007   21  Female                  4.3                 1.0   
10       S1010   19  Female                  4.6                 3.7   
37       S1037   17    Male                  4.3                 2.5   
..         ...  ...     ...                  ...                 ...   
970      S1970   24  Female                  4.2                 2.4   
972      S1972   17  Female                  4.5                 0.4   
973      S1973   19    Male                  4.4                 4.3   
985      S1985   18    Male                  5.7                 3.1   
991      S1991   20    Male                  6.0                 2.1   

     netflix_hours part_time_job  attendance_percentage  sleep_hours  \
1              2.3            No                   97.3        

#### Investigate if more sleep_hours correlate with higher grades

In [69]:
# Correlation between hours of sleep and exam score, printed to two decimals.
sleep_hours = df['sleep_hours']
exam_scores = df['exam_score']
corr_sleep_grades = sleep_hours.corr(exam_scores)
print(corr_sleep_grades.round(2))

0.12


# 7. Create New Columns (Feature Engineering)

#### Create 'is_high_achiever' column

In [None]:
# Flag students whose exam score is above 85.
# The column is named 'is_high_achiever' as the section heading specifies; the earlier
# spelling 'is_higher_acheiver' was a typo that would have ended up in the exported CSV.
df['is_high_achiever'] = df['exam_score'] > 85
print('New column added successfully')

#### Create 'is_sleep_deprived' column

In [75]:
# Flag students who sleep fewer than 6 hours per night as sleep deprived.
# The previous condition (sleep_hours > 6) was inverted: it marked the well-rested
# students as deprived. The 6-hour cut-off matches the "sleep less than 6 hours" filter
# used earlier in this notebook. The column name also fixes the 'is_sleep_depeived' typo
# so it matches the section heading.
df['is_sleep_deprived'] = df['sleep_hours'] < 6
print('New column added successfully')


New column added successfully


# 8. Final Output

##### Export cleaned or filtered datasets

In [77]:
# Write the cleaned dataset next to the notebook.
# index=False keeps the meaningless 0..N-1 row index out of the file; without it the CSV
# gains an unnamed first column that shows up as 'Unnamed: 0' when the file is read back.
df.to_csv('student_habits_performance_cleaned.csv', index=False)
print("New file is generated")

New file is generated
