In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#plotting data key functions
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the students' exam-performance CSV into a DataFrame.
csv_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
data_all = pd.read_csv(csv_path)

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = data_all.shape
print((n_rows, n_cols))

This is a small dataset with 1000 rows.

In [None]:
# Show the first five entries to get a general idea of what we are dealing with.
data_all.head(5)

We will now start to clean and encode the data to prepare it for the training process.

In [None]:
# Count the missing values in each column (isnull is applied element-wise,
# then summed per column).
print(data_all.isnull().sum())

In [None]:
# Collapse the three exam scores into a single total-score column.
# Start from the math score and add the remaining two exams one at a time
# (a new Series is built at each step, the source columns are not modified).
sum_score = data_all["math score"]
for extra_column in ("reading score", "writing score"):
    sum_score = sum_score + data_all[extra_column]
data_all["sum_score"] = sum_score

In [None]:
# Check that the sum_score column was added. Left as the cell's last expression so the
# notebook renders the frame as a table; print() falls back to plain text, which wraps
# or elides columns on a frame this wide.
data_all.head()

In [None]:
# The per-exam scores are no longer needed once the total exists, so remove them.
individual_scores = ['math score', 'reading score', 'writing score']
data_all = data_all.drop(columns=individual_scores)

In [None]:
# Check that the three per-exam score columns are gone. A bare last expression gets the
# notebook's table rendering instead of print()'s plain-text dump.
data_all.head()

In [None]:
# Bucket the total score into letter grades.
# The thresholds are three times the per-exam cut-offs:
#   A: >= 270 (3 x 90), B: >= 240 (3 x 80), C: >= 210 (3 x 70), D: >= 180 (3 x 60), F: < 180
# The letters are encoded afterwards as A: 0, B: 1, C: 2, D: 3, F: 4.
bins = [0, 180, 210, 240, 270, np.inf]
label = ['F','D','C','B','A']

# right=False makes every bin closed on the left: [0, 180), [180, 210), ..., [270, inf).
# With pd.cut's default (right=True) the bins are (0, 180], (180, 210], ..., so a total of
# exactly 180/210/240/270 would land in the lower grade (contradicting "F: < 180") and a
# total of 0 would fall outside every bin and become NaN.
data_all['grades'] = pd.cut(data_all['sum_score'], bins, labels=label, right=False)

In [None]:
# Check the new grades column next to sum_score. A bare last expression is rendered as a
# table by the notebook, unlike print(), which emits plain text.
data_all.head()

In [None]:
# The letter grade replaces the raw total, so the sum_score column can be removed.
data_all = data_all.drop(columns='sum_score')

In [None]:
# Check that sum_score has been removed. A bare last expression gets the notebook's
# table rendering instead of print()'s plain-text dump.
data_all.head()

In [None]:
# Encode the letter grades as integers: A -> 0, B -> 1, C -> 2, D -> 3, F -> 4.
grade_mapping = {"A":0,"B":1,"C":2,"D":3,"F":4}
# 'grades' was produced by pd.cut, so it is an ordered Categorical. Mapping it directly
# returns another ordered Categorical whose categories keep the F < D < C < B < A order
# (i.e. 4 < 3 < 2 < 1 < 0): not a numeric column, and comparisons/sorting follow that
# reversed order. Converting to object first makes map() return plain numbers
# (NaN wherever no grade was assigned).
data_all['grades'] = data_all['grades'].astype(object).map(grade_mapping)

In [None]:
# Check the integer-encoded grades. A bare last expression is rendered as a table by the
# notebook, unlike print(), which emits plain text.
data_all.head()

In [None]:
# Distinct values present in the gender column.
data_all['gender'].unique()

In [None]:
# Encode gender as an integer: female -> 0, male -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
gender_mapping = {
    "female": 0,
    "male": 1,
}
encoded_gender = data_all['gender'].map(gender_mapping)
data_all['gender'] = encoded_gender

In [None]:
# Check the encoded gender column. A bare last expression gets the notebook's table
# rendering instead of print()'s plain-text dump.
data_all.head()

In [None]:
# Distinct values present in the race/ethnicity column.
race_column = data_all['race/ethnicity']
race_column.unique()

In [None]:
# Encode race/ethnicity as an integer: group A -> 0 through group E -> 4.
# Series.map leaves NaN for any value that is not a key of the mapping.
race_mapping = {
    "group A": 0,
    "group B": 1,
    "group C": 2,
    "group D": 3,
    "group E": 4,
}
encoded_race = data_all['race/ethnicity'].map(race_mapping)
data_all['race/ethnicity'] = encoded_race

In [None]:
# Check the encoded race/ethnicity column. A bare last expression is rendered as a table
# by the notebook, unlike print(), which emits plain text.
data_all.head()

In [None]:
# Distinct values present in the parental level of education column.
education_column = data_all['parental level of education']
education_column.unique()

In [None]:
# Encode parental level of education as an integer, numbered by position in this list:
# 0 for "master's degree" down to 5 for "some high school".
education_levels = [
    "master's degree",
    "bachelor's degree",
    "associate's degree",
    "some college",
    "high school",
    "some high school",
]
p_education_mapping = {level: code for code, level in enumerate(education_levels)}
# Series.map leaves NaN for any value that is not a key of the mapping.
data_all['parental level of education'] = data_all['parental level of education'].map(p_education_mapping)

In [None]:
# Check the encoded parental level of education column. A bare last expression gets the
# notebook's table rendering instead of print()'s plain-text dump.
data_all.head()

In [None]:
# Distinct values present in the lunch column.
lunch_column = data_all['lunch']
lunch_column.unique()

In [None]:
# Encode lunch as an integer: standard -> 0, free/reduced -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
lunch_mapping = {
    "standard": 0,
    "free/reduced": 1,
}
encoded_lunch = data_all['lunch'].map(lunch_mapping)
data_all['lunch'] = encoded_lunch

In [None]:
# Check the encoded lunch column. A bare last expression is rendered as a table by the
# notebook, unlike print(), which emits plain text.
data_all.head()

In [None]:
# Distinct values present in the test preparation course column.
prep_column = data_all['test preparation course']
prep_column.unique()

In [None]:
# Encode the test preparation course as an integer: none -> 0, completed -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
course_mapping = {
    "none": 0,
    "completed": 1,
}
encoded_course = data_all['test preparation course'].map(course_mapping)
data_all['test preparation course'] = encoded_course

In [None]:
# Check the encoded test preparation course column. A bare last expression gets the
# notebook's table rendering instead of print()'s plain-text dump.
data_all.head()

In [None]:
# Shorten the long column names, starting with 'race/ethnicity' -> 'r/e'.
data_all = data_all.rename({'race/ethnicity': 'r/e'}, axis='columns')

In [None]:
# Check that 'race/ethnicity' now appears as 'r/e'. A bare last expression is rendered
# as a table by the notebook, unlike print(), which emits plain text.
data_all.head()

In [None]:
# Shorten 'parental level of education' to 'p_education'.
education_rename = {'parental level of education': 'p_education'}
data_all = data_all.rename(columns=education_rename)

In [None]:
# Shorten 'test preparation course' to 'prep'.
prep_rename = {'test preparation course': 'prep'}
data_all = data_all.rename(columns=prep_rename)

In [None]:
# Check the frame with its shortened column names. A bare last expression gets the
# notebook's table rendering instead of print()'s plain-text dump.
data_all.head()