<a href="https://colab.research.google.com/github/srijan007-svg/Nepali-text-to-image-GAN-Algorthim/blob/main/Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



*   Load dataset with Pandas
*   Explore structure (head, info, describe)
*   Handle missing values with imputation
*   Encode categorical variables with Scikit-Learn
*   Scale numerical features
*   Prepare dataset for modeling







In [3]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Build a small, self-contained student-performance dataset.
# 'Attendance' deliberately contains two missing values (np.nan) so the
# imputation step below has something to fill in.
data = {
    'StudentID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'Major': ['Math', 'Science', 'Physics', 'Math', 'Science', 'Physics', 'Math', 'Science', 'Physics', 'Math'],
    'Score': [85, 92, 78, 88, 95, 80, 82, 90, 75, 88],
    'Attendance': [0.95, 0.98, np.nan, 0.92, 0.99, 0.85, 0.93, np.nan, 0.88, 0.96],
    'Pass': ['Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'],  # Binary target variable
}
df_encoding = pd.DataFrame(data)

print("Original DataFrame:")
display(df_encoding)

# 3. Handle missing values with imputation
# Replace missing 'Attendance' values with the column mean.
# NOTE(review): the imputer is fitted on the full dataset here; if a
# train/test split is added later, fit it on the training portion only.
imputer = SimpleImputer(strategy='mean')
# fit_transform returns a 2-D (n_samples, 1) result; flatten it so a plain
# 1-D column is assigned back into the DataFrame.
df_encoding['Attendance'] = np.ravel(imputer.fit_transform(df_encoding[['Attendance']]))

print("\nDataFrame after imputing missing values:")
display(df_encoding)

# 4. Encode categorical variables with Scikit-Learn
# Separate features (X) and target (y).
# y keeps its original 'Yes'/'No' string labels.
X = df_encoding.drop('Pass', axis=1)
y = df_encoding['Pass']

# Identify categorical and numerical features.
# 'StudentID' is intentionally in neither list: it is an identifier, and the
# ColumnTransformer below drops any column that is not listed.
categorical_features = ['Gender', 'Major']
numerical_features = ['Score', 'Attendance']

# One-hot encode categorical features; categories unseen at fit time are
# encoded as all zeros instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Standardize numerical features to zero mean and unit variance.
scaler = StandardScaler()

# Apply each transformer to its own subset of columns.
# sparse_threshold=0 forces a dense ndarray result so it can always be wrapped
# in a DataFrame below, regardless of how sparse the one-hot block is.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('num', scaler, numerical_features),
    ],
    sparse_threshold=0,
)

# Wrap the preprocessor in a pipeline so an estimator can be appended later.
# For demonstration, we only preprocess the data.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
X_processed = pipeline.fit_transform(X)

# Convert the processed data back to a DataFrame for better readability.
# Output columns follow the transformer order: one-hot columns first, then the
# scaled numerical columns.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
categorical_feature_names = fitted_encoder.get_feature_names_out(categorical_features)
all_feature_names = list(categorical_feature_names) + numerical_features

# Reuse X's index so rows stay aligned with y.
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names, index=X.index)

print("\nProcessed DataFrame (Encoded and Scaled):")
display(X_processed_df.head())

# 5. Prepare dataset for modeling (already done in the previous steps by separating X and y)
# X_processed is the feature matrix ready for modeling
# y is the target variable

# You can now use X_processed and y to train a machine learning model

Original DataFrame:


Unnamed: 0,StudentID,Gender,Major,Score,Attendance,Pass
0,1,Male,Math,85,0.95,Yes
1,2,Female,Science,92,0.98,Yes
2,3,Male,Physics,78,,No
3,4,Female,Math,88,0.92,Yes
4,5,Male,Science,95,0.99,Yes
5,6,Female,Physics,80,0.85,No
6,7,Male,Math,82,0.93,Yes
7,8,Female,Science,90,,Yes
8,9,Male,Physics,75,0.88,No
9,10,Female,Math,88,0.96,Yes



DataFrame after imputing missing values:


Unnamed: 0,StudentID,Gender,Major,Score,Attendance,Pass
0,1,Male,Math,85,0.95,Yes
1,2,Female,Science,92,0.98,Yes
2,3,Male,Physics,78,0.9325,No
3,4,Female,Math,88,0.92,Yes
4,5,Male,Science,95,0.99,Yes
5,6,Female,Physics,80,0.85,No
6,7,Male,Math,82,0.93,Yes
7,8,Female,Science,90,0.9325,Yes
8,9,Male,Physics,75,0.88,No
9,10,Female,Math,88,0.96,Yes



Processed DataFrame (Encoded and Scaled):


Unnamed: 0,Gender_Female,Gender_Male,Major_Math,Major_Physics,Major_Science,Score,Attendance
0,0.0,1.0,1.0,0.0,0.0,-0.049049,0.4327919
1,1.0,0.0,0.0,0.0,1.0,1.095421,1.174721
2,0.0,1.0,0.0,1.0,0.0,-1.193518,2.745689e-15
3,1.0,0.0,1.0,0.0,0.0,0.441438,-0.3091371
4,0.0,1.0,0.0,0.0,1.0,1.585908,1.422031
