<a href="https://colab.research.google.com/github/sophiagemanuel/Student-Success-Deep-Dive/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Dependencies
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler

In [14]:
# Loading the Dataset
# Reading in our CSV file
df = pd.read_csv("StudentsPerformance.csv")
# Printing out the first 10 rows of our data
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [15]:
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [16]:
# Check for missing values
df.isnull().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


In [17]:
# Encode categorical variables
categorical_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
df = pd.get_dummies(df, columns=categorical_columns)

In [18]:
# Cleaning data and renaming Ethnicity and Race
# A American Indian/Alaska Native, B Asian, C Black or African American, D Native Hawaiian or other Pacific Islander, E White
df = df.rename(columns={'race/ethnicity_group A': 'American Indian/Alaska Native',
                        'race/ethnicity_group B': 'Asian',
                        'race/ethnicity_group C': 'Black or African American',
                        'race/ethnicity_group D': 'Native Hawaiian',
                        'race/ethnicity_group E': 'White'})
df.head()

Unnamed: 0,math score,reading score,writing score,gender_female,gender_male,American Indian/Alaska Native,Asian,Black or African American,Native Hawaiian,White,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72,72,74,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True
1,69,90,88,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False
2,90,95,93,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True
3,47,57,44,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True
4,76,78,75,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True


In [19]:
# Normalize numerical features
numerical_columns = ['math score', 'reading score', 'writing score']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

df.head()

Unnamed: 0,math score,reading score,writing score,gender_female,gender_male,American Indian/Alaska Native,Asian,Black or African American,Native Hawaiian,White,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,0.390024,0.193999,0.391492,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True
1,0.192076,1.427476,1.313269,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False
2,1.577711,1.770109,1.642475,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True
3,-1.259543,-0.833899,-1.583744,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True
4,0.653954,0.605158,0.457333,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True


In [20]:
# Create an average score feature
df['average score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)

# Define the target variable (e.g., students with an average score above 70 are considered successful)
df['success'] = df['average score'] > 70

# Drop the original score columns if necessary
df = df.drop(columns=numerical_columns)

# Display the modified dataframe
df.head()

Unnamed: 0,gender_female,gender_male,American Indian/Alaska Native,Asian,Black or African American,Native Hawaiian,White,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none,average score,success
0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True,0.325171,False
1,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False,0.977607,False
2,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True,1.663432,False
3,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True,-1.225729,False
4,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True,0.572148,False


In [21]:
# Save the DataFrame to a new CSV file
df.to_csv('processed_student_performance.csv', index=False)

# Check if the file was saved successfully
print("File saved:", os.path.exists('processed_student_performance.csv'))

File saved: True


In [22]:
# With Google Colab, download the file
from google.colab import files
files.download('processed_student_performance.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>