<a href="https://colab.research.google.com/github/steveola/niya_ml_data/blob/main/Module%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NIYA ML MODULE 4 CODE DEMONSTRATION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# ---------------------------------------------------------------------------
# Preprocessing demo: load the raw student records, fill in missing values,
# one-hot encode the categorical 'State' column, and standardize the numeric
# features. The result is left in `df`.
# ---------------------------------------------------------------------------

# Load data (path is relative to the notebook's working directory)
df = pd.read_csv('messy_student_data.csv')
print("Original Data:")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()  # Reveals missing values!

# Numeric feature columns. The same list drives both imputation and scaling so
# that every column handed to the scaler has had its gaps filled first
# (previously 'Previous_Grade' was scaled but never imputed, leaving a NaN in
# the final frame).
numerical_cols = ['Hours_Studied', 'Attendance', 'Previous_Grade']

# Impute numerical missing values with the column mean
num_imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

# Impute categorical missing values with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
df[['State']] = cat_imputer.fit_transform(df[['State']])

# NOTE(review): 'Student_Name' is not imputed here; presumably it is an
# identifier rather than a model feature -- confirm before training on `df`.

# Direct Print
# NOTE: a bare expression is only rendered when it is the last statement of a
# cell. If this code runs as a single cell, move this line into its own cell to
# actually see the imputed frame.
df

# Use OneHotEncoder for the 'State' column
encoder = OneHotEncoder(sparse_output=False)
encoded_states = encoder.fit_transform(df[['State']])
# Reuse df's index so the concat below aligns row-for-row even if `df` does not
# carry a default 0..n-1 index (e.g. after rows were filtered or dropped).
encoded_df = pd.DataFrame(
    encoded_states,
    columns=encoder.get_feature_names_out(['State']),
    index=df.index,
)

# Combine the encoded data with the original dataframe
df = pd.concat([df, encoded_df], axis=1)
df = df.drop(['State'], axis=1)  # Drop the original text column

# Standardize the numeric features (zero mean, unit variance per column)
# NOTE(review): the imputers and scaler are fit on the full dataset. If the
# data is later divided with train_test_split, fit them on the training
# portion only and reuse them on the test portion to avoid leakage.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

print("\nCleaned and Scaled Data:")
print(df.head())
df.info()

# Direct print final output (last expression of the cell, so it is rendered)
df

Original Data:
  Student_Name  Hours_Studied  Previous_Grade  Attendance          State  Pass
0      Chinedu           12.5            75.0        85.0          Lagos     1
1        Amina            5.0             NaN        45.0           Kano     0
2        Tunde           18.0            82.0        92.0          Lagos     1
3        Grace            8.5            68.0        78.0          Abuja     0
4          NaN            NaN            88.0        95.0  Port Harcourt     1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Student_Name    13 non-null     object 
 1   Hours_Studied   13 non-null     float64
 2   Previous_Grade  14 non-null     float64
 3   Attendance      14 non-null     float64
 4   State           15 non-null     object 
 5   Pass            15 non-null     int64  
dtypes: float64(3), int64(1), object(2)
memory us

Unnamed: 0,Student_Name,Hours_Studied,Previous_Grade,Attendance,Pass,State_Abuja,State_Enugu,State_Kaduna,State_Kano,State_Lagos,State_Port Harcourt
0,Chinedu,0.15068,0.085631,0.434135,1,0.0,0.0,0.0,0.0,1.0,0.0
1,Amina,-1.395777,,-1.71733,0,0.0,0.0,0.0,1.0,0.0,0.0
2,Tunde,1.284749,0.610123,0.810641,1,0.0,0.0,0.0,0.0,1.0,0.0
3,Grace,-0.674097,-0.43886,0.057629,0,1.0,0.0,0.0,0.0,0.0,0.0
4,,0.0,1.059688,0.972001,1,0.0,0.0,0.0,0.0,0.0,1.0
5,Emeka,-1.086485,-1.412917,-0.910531,0,0.0,1.0,0.0,0.0,0.0,0.0
6,Fatima,1.697138,1.209542,1.133361,1,0.0,0.0,0.0,1.0,0.0,0.0
7,James,-0.983388,-0.888425,0.0,0,0.0,0.0,0.0,0.0,1.0,0.0
8,Bola,0.563069,0.834905,0.595495,1,0.0,0.0,0.0,0.0,1.0,0.0
9,Zainab,-1.498874,-1.188135,-1.448397,0,0.0,0.0,1.0,0.0,0.0,0.0
