In [None]:
import pandas as pd  



In [None]:
# Load the student-performance records into the notebook-wide DataFrame.
# NOTE(review): the file name contains a space before ".csv" -- confirm it
# matches the file on disk.
data_path = 'Student_performance_data .csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
def assign_grade(gpa):
    """Translate a numeric GPA into a letter grade.

    Cut-offs (lower bound inclusive): 'A' from 3.5, 'B' from 3.0,
    'C' from 2.5, 'D' from 2.0. Anything that satisfies none of the
    cut-offs -- including NaN, for which every comparison is False --
    is graded 'F'.
    """
    # Ordered from the highest cut-off down, so the first match wins.
    grade_floors = (('A', 3.5), ('B', 3.0), ('C', 2.5), ('D', 2.0))
    for letter, floor in grade_floors:
        if gpa >= floor:
            return letter
    return 'F'

# Derive a letter grade for every row from its GPA value.
df['Grade'] = df['GPA'].map(assign_grade)

df.head(10)




In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt

def calculate_grid(total_plots, columns_per_row=5):
    """Return ``(rows, columns)`` for a subplot grid holding ``total_plots`` panels.

    The grid is never wider than ``columns_per_row`` and never wider than
    the number of panels; the row count is rounded up so every panel fits.
    """
    if total_plots < columns_per_row:
        column_count = total_plots
    else:
        column_count = columns_per_row
    row_count = math.ceil(total_plots / columns_per_row)
    return row_count, column_count

def Plot_Histograms(x_data, y_data, title=None, figsize=(15, 5), columns_per_row=5):
    """Draw one histogram per entry of ``y_data`` on a grid of subplots.

    Parameters
    ----------
    x_data
        Accepted so existing callers keep working; it is not used when
        drawing the histograms.
    y_data : mapping
        Maps a panel name to the values to histogram. The name is used as
        the panel title and as its x-axis label.
    title : str, optional
        Figure-level title; no title is drawn when ``None``.
    figsize : tuple
        ``(width, height_per_row)``. The height is multiplied by the number
        of grid rows.
    columns_per_row : int
        Maximum number of panels in one grid row.

    Raises
    ------
    ValueError
        If ``y_data`` is empty, since there is nothing to lay out.
    """
    total_plots = len(y_data)
    if total_plots == 0:
        # Fail early with a clear message instead of asking matplotlib for
        # a grid with zero rows and zero columns.
        raise ValueError('y_data must contain at least one dataset to plot')

    num_rows, num_columns = calculate_grid(total_plots, columns_per_row)
    print(f'DATA={total_plots}')

    # squeeze=False always returns a 2-D array of Axes, so a single panel
    # (1x1) or a single-column grid is indexed like any other grid.
    fig, axs = plt.subplots(
        num_rows,
        num_columns,
        figsize=(figsize[0], figsize[1] * num_rows),
        squeeze=False,
    )

    # Row-major flattening matches the fill order: with more than one row
    # the grid is exactly columns_per_row wide, so panel i sits at
    # (i // columns_per_row, i % columns_per_row).
    axes = axs.ravel()

    for ax, yn in zip(axes, y_data):
        ax.set_title(yn, fontsize=12)
        # The horizontal axis shows the values of this dataset, so it is
        # labelled with the dataset's name.
        ax.set_xlabel(yn, fontsize=10)
        ax.set_ylabel('Frequency', fontsize=10)
        ax.hist(y_data[yn], bins=10, alpha=0.7, color='skyblue', edgecolor='black')

        ax.grid(True, linestyle='--', alpha=0.5)
        ax.tick_params(axis='both', which='major', labelsize=8)

    # Remove the empty trailing cells of the last row.
    for unused_ax in axes[total_plots:]:
        fig.delaxes(unused_ax)

    if title is not None:
        fig.suptitle(title, fontsize=16)
    # The rect keeps head-room at the top of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Panels for the raw-data overview: panel title -> DataFrame column.
raw_panel_columns = {
    'Parental Education': 'ParentalEducation',
    'Study Time Weekly': 'StudyTimeWeekly',
    'Absences': 'Absences',
    'Tutoring': 'Tutoring',
    'Parental Support': 'ParentalSupport',
    'Extracurricular': 'Extracurricular',
    'Sports': 'Sports',
    'Music': 'Music',
    'Volunteering': 'Volunteering',
}

x = np.array(df['Grade'])
y = {panel: np.array(df[column]) for panel, column in raw_panel_columns.items()}

# The same arrays under their short names, in the dict's insertion order.
y1, y2, y3, y4, y5, y6, y7, y8, y9 = y.values()

Plot_Histograms(x, y, title='Raw Data')

This analysis shows that Extracurricular and Tutoring can be dropped because their data is heavily skewed. We may revisit them after sorting the data, but for now we drop them. Study time and absences need further inspection, but they do not have a direct relationship with the grade.

In [None]:
# Summary statistics, transposed so each described column becomes one row.
df.describe().T

In [None]:
# Panels for the parental-influence view: panel title -> DataFrame column.
parental_panel_columns = {
    'Parental Education': 'ParentalEducation',
    'Parental Support': 'ParentalSupport',
}

x = np.array(df['Grade'])
y = {panel: np.array(df[column]) for panel, column in parental_panel_columns.items()}

# The same arrays under their short names, in the dict's insertion order.
y1, y2 = y.values()

Plot_Histograms(x, y, title='Parental influence')

We can see signs that parental education has a positive trend for students with grades up to C, but students with a B or higher are well enough adapted to school that they do not need parental education.

The same holds for parental support, except that the trend also includes B students.

Thus, by contacting parents about their influence, we can identify early which students might need extra help.

We can see that linear regression can be used to relate absences and grade: the more absences a student has, the lower the grade.

In [None]:
# Horizontal box plots of the selected columns on one shared axis.
features = [
    'ParentalSupport',
    'Absences',
    'StudyTimeWeekly',
    'ParentalEducation',
    'GPA',
]

fig, ax = plt.subplots(figsize=(10, 6))
feature_frame = df[features]
boxplot = feature_frame.boxplot(ax=ax, vert=False)
# Assigning to "_" keeps the returned Text object out of the cell output.
_ = ax.set_title('Box Plot: Features')

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Columns of the notebook-wide DataFrame "df" to standardize.
features = ['GPA', 'ParentalEducation', 'StudyTimeWeekly', 'ParentalSupport', 'Absences']

# Fit a StandardScaler on those columns and write the transformed values
# back into df.
# NOTE(review): this overwrites the original columns (GPA included), so every
# later cell that reads df sees the standardized values -- confirm intended.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[features])
df[features] = standardized_values

# Box plots of the same columns after scaling.
fig, ax = plt.subplots(figsize=(10, 6))
scaled_frame = df[features]
scaled_frame.boxplot(ax=ax, vert=False)
ax.set_title('Box Plot: Scaled Features (StandardScaler)')
plt.show()

# Missing Value Treatment

In [None]:
# Count the null values in each column; the expected result is 0 everywhere.
print(df.isnull().sum())

# Numeric columns are filled with their median.
for numeric_column in ('StudyTimeWeekly', 'Absences'):
    column_median = df[numeric_column].median()
    df[numeric_column] = df[numeric_column].fillna(column_median)

# ParentalSupport is treated as categorical and filled with its mode.
most_common_support = df['ParentalSupport'].mode()[0]
df['ParentalSupport'] = df['ParentalSupport'].fillna(most_common_support)

# Outlier Treatment 

In [18]:
# Limits outliers with the IQR rule by clipping them to the fences.
def iqr_remove_outliers(column, data=None, multiplier=1.5):
    """Clip the outliers of one column to its IQR fences, in place.

    Despite the name, no rows are removed: values below
    ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR`` are
    replaced by the nearest fence, and the result overwrites the column.

    Parameters
    ----------
    column
        Label of the column to treat.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used,
        which is the behaviour of the original one-argument call.
    multiplier : float
        How many IQRs beyond the quartiles the fences are placed
        (1.5 by default).

    Returns
    -------
    None
        The column is updated inside ``data``.
    """
    # Fall back to the notebook-wide frame so iqr_remove_outliers(column)
    # keeps working unchanged.
    frame = df if data is None else data

    first_quartile = frame[column].quantile(0.25)
    third_quartile = frame[column].quantile(0.75)
    iqr = third_quartile - first_quartile

    lower_bound = first_quartile - multiplier * iqr
    upper_bound = third_quartile + multiplier * iqr

    frame[column] = frame[column].clip(lower_bound, upper_bound)



# Apply the IQR treatment to each of these columns in turn.
columns_to_treat = ('GPA', 'StudyTimeWeekly', 'Absences')
for column in columns_to_treat:
    iqr_remove_outliers(column)
