In [61]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [63]:
###load dataset
df = pd.read_csv("/home/nacheli/tinaProgramming/sppm/sppm/data/stud.csv")

In [65]:
df


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


### Ordinal encoding

In [78]:

# define the ordinal encoding mapping for 'parental_level_of_education'
ordinal_mapping = {
    'some high school': 0,
    'high school': 1,
    'some college':2,
    'associate\'s degree': 3,
    'bachelor\'s degree': 4,
    
    
}

# apply ordinal encoding to 'parental_level_of_education'
df['parental_level_of_education_encoded'] = df['parental_level_of_education'].map(ordinal_mapping)

# verify the encoded values
print(df[['parental_level_of_education', 'parental_level_of_education_encoded']].head())

  parental_level_of_education  parental_level_of_education_encoded
0           bachelor's degree                                  4.0
1                some college                                  2.0
2             master's degree                                  NaN
3          associate's degree                                  3.0
4                some college                                  2.0


### one hot encoding 

In [80]:
# define the categorical columns to one-hot encode
categorical_cols = ['lunch','race_ethnicity','lunch' ,'test_preparation_course', 'gender']

# one-hot encode the categorical columns
df_onehot = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# print the resulting DataFrame
print(df_onehot.head())

  parental_level_of_education  math_score  reading_score  writing_score  \
0           bachelor's degree          72             72             74   
1                some college          69             90             88   
2             master's degree          90             95             93   
3          associate's degree          47             57             44   
4                some college          76             78             75   

   parental_level_of_education_encoded  lunch_standard  \
0                                  4.0            True   
1                                  2.0            True   
2                                  NaN            True   
3                                  3.0           False   
4                                  2.0            True   

   race_ethnicity_group B  race_ethnicity_group C  race_ethnicity_group D  \
0                    True                   False                   False   
1                   False                    T

### insight
- Parental Level of Education: The encoded values suggest that students with higher parental education levels (e.g., bachelor's degree) tend to perform better in math, reading, and writing scores.
- Academic Performance: The scores vary widely, with some students excelling in all subjects (e.g., row 104) and others struggling (e.g., row 894).
- Demographics: The data reveals a diverse student population, with different race/ethnicity groups and gender representations.
- Lunch Standard: Most students (4 out of 5) receive a standard lunch, which may indicate a higher likelihood of coming from a lower-income household.
- Test Preparation: Only one student (row 894) did not take a test preparation course, which may impact their academic performance.


### interaction features

In [82]:
# define the categorical columns to create interactions
categorical_cols = ['parental_level_of_education','lunch', 'test_preparation_course', 'gender', 'race_ethnicity']

# create interactions between categorical columns
interactions = pd.DataFrame()
for i in range(len(categorical_cols)):
    for j in range(i+1, len(categorical_cols)):
        col1 = categorical_cols[i]
        col2 = categorical_cols[j]
        interaction_col = f"{col1}_{col2}"
        interactions[interaction_col] = df[col1].astype(str) + "_" + df[col2].astype(str)

# one-hot encode the interaction columns
df.onehot = pd.get_dummies(interactions, drop_first=True)

# concatenate the original DataFrame with the interaction columns
df_interactions = pd.concat([df.onehot], axis=1)

# print the resulting DataFrame
print(df_interactions.head())

   parental_level_of_education_lunch_associate's degree_standard  \
0                                              False               
1                                              False               
2                                              False               
3                                              False               
4                                              False               

   parental_level_of_education_lunch_bachelor's degree_free/reduced  \
0                                              False                  
1                                              False                  
2                                              False                  
3                                              False                  
4                                              False                  

   parental_level_of_education_lunch_bachelor's degree_standard  \
0                                               True              
1                             

  df.onehot = pd.get_dummies(interactions, drop_first=True)


### insights
- Associate's degree: Only one student's parent  (row 894) has an associate's degree and receives a standard lunch.
- Bachelor's degree: One student's parent (row 562) has a bachelor's degree and receives a standard lunch.
- High school: One student's parent (row 366) has a high school education and receives a standard lunch.
- Some college: One student's parent (row 104) has some college education and receives a standard lunch.
- Some high school: One student's parent (row 897) has some high school education and receives a free/reduced lunch.


### polynomial features

In [84]:
# sample dataframe
data = {
    'parental_level_of_education': ['some high school', 'some college', "bachelor's degree", 'associate\'s degree', 'high school'],
    'math_score': [63, 98, 96, 59, 69],
    'reading_score': [78, 86, 90, 62, 58],
    'writing_score': [79, 90, 92, 69, 53]
}

df = pd.DataFrame(data)

# Define the numerical columns to create square and square root features
numerical_cols = ['math_score', 'reading_score', 'writing_score']

# Create square features
square_features = df[numerical_cols].apply(lambda x: x**2)

# Create square root features
sqrt_features = df[numerical_cols].apply(lambda x: np.sqrt(x))

# Rename the columns for square and square root features
square_features.columns = [f"{col}_square" for col in numerical_cols]
sqrt_features.columns = [f"{col}_sqrt" for col in numerical_cols]

# Concatenate the original DataFrame with the square and square root features
df_features = pd.concat([df, square_features, sqrt_features], axis=1)

# Print the resulting DataFrame
print(df_features.head())

  parental_level_of_education  math_score  reading_score  writing_score  \
0            some high school          63             78             79   
1                some college          98             86             90   
2           bachelor's degree          96             90             92   
3          associate's degree          59             62             69   
4                 high school          69             58             53   

   math_score_square  reading_score_square  writing_score_square  \
0               3969                  6084                  6241   
1               9604                  7396                  8100   
2               9216                  8100                  8464   
3               3481                  3844                  4761   
4               4761                  3364                  2809   

   math_score_sqrt  reading_score_sqrt  writing_score_sqrt  
0         7.937254            8.831761            8.888194  
1         9.899495

### insight 
- No changes to categorical variables: The parental_level_of_education column remains unchanged, indicating that the aggregation process only affected the numerical score columns.

### Feature aggregation

In [21]:
#DataFrame 
data = {
    'parental_level_of_education': ['some high school', 'some college', "bachelor's degree", 'associate\'s degree', 'high school'],
    'math_score': [63, 98, 96, 59, 69],
    'reading_score': [78, 86, 90, 62, 58],
    'writing_score': [79, 90, 92, 69, 53]
}

df = pd.DataFrame(data)

# Calculate the average score
df['average_score'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1)

# Print the resulting DataFrame
print(df.head())

  parental_level_of_education  math_score  reading_score  writing_score  \
0            some high school          63             78             79   
1                some college          98             86             90   
2           bachelor's degree          96             90             92   
3          associate's degree          59             62             69   
4                 high school          69             58             53   

   average_score  
0      73.333333  
1      91.333333  
2      92.666667  
3      63.333333  
4      60.000000  


In [23]:
df

Unnamed: 0,parental_level_of_education,math_score,reading_score,writing_score,average_score
0,some high school,63,78,79,73.333333
1,some college,98,86,90,91.333333
2,bachelor's degree,96,90,92,92.666667
3,associate's degree,59,62,69,63.333333
4,high school,69,58,53,60.0
