In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('k12_dataset.csv')

In [3]:
df

Unnamed: 0,name,age,gender,country,state,city,parent_occupation,earning_class,level_of_student,level_of_course,course_name,time_spent_per_day,material_name,material_level,iq_of_student,assessment_score,promoted_or_skipped
0,Student_1,13,Female,India,Karnataka,Hubli,Government Job,High,Advanced,Intermediate,Java,4.14,Tutor,Easy,82,87,Promoted
1,Student_2,11,Male,India,Karnataka,Mysore,Business,Middle,Advanced,Basic,C++,5.25,Tutor,Hard,149,100,Promoted
2,Student_3,13,Male,India,Tamil Nadu,Chennai,Business,Middle,Intermediate,Basic,C++,4.80,PDF,Medium,95,79,Promoted
3,Student_4,11,Female,India,Karnataka,Bangalore,Doctor,Middle,Beginner,Intermediate,C++,5.35,PDF,Medium,115,94,Promoted
4,Student_5,8,Female,India,Tamil Nadu,Coimbatore,Doctor,Low,Beginner,Basic,JavaScript,3.42,Online Learning,Medium,148,100,Promoted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,Student_8996,11,Female,India,Karnataka,Hubli,Other,High,Intermediate,Advanced,Python,5.71,Tutor,Easy,111,88,Promoted
8996,Student_8997,6,Male,India,West Bengal,Howrah,Teacher,High,Beginner,Basic,SQL,6.65,Book,Easy,150,100,Promoted
8997,Student_8998,15,Female,India,West Bengal,Kolkata,Teacher,High,Beginner,Basic,Java,5.90,PDF,Easy,85,87,Promoted
8998,Student_8999,10,Female,India,Maharashtra,Mumbai,Engineer,Low,Intermediate,Intermediate,C++,3.77,College,Hard,122,86,Promoted


In [4]:
df.drop(['name',  'country', 'state', 'city', 'promoted_or_skipped'], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,age,gender,parent_occupation,earning_class,level_of_student,level_of_course,course_name,time_spent_per_day,material_name,material_level,iq_of_student,assessment_score
0,13,Female,Government Job,High,Advanced,Intermediate,Java,4.14,Tutor,Easy,82,87
1,11,Male,Business,Middle,Advanced,Basic,C++,5.25,Tutor,Hard,149,100
2,13,Male,Business,Middle,Intermediate,Basic,C++,4.80,PDF,Medium,95,79
3,11,Female,Doctor,Middle,Beginner,Intermediate,C++,5.35,PDF,Medium,115,94
4,8,Female,Doctor,Low,Beginner,Basic,JavaScript,3.42,Online Learning,Medium,148,100
...,...,...,...,...,...,...,...,...,...,...,...,...
8995,11,Female,Other,High,Intermediate,Advanced,Python,5.71,Tutor,Easy,111,88
8996,6,Male,Teacher,High,Beginner,Basic,SQL,6.65,Book,Easy,150,100
8997,15,Female,Teacher,High,Beginner,Basic,Java,5.90,PDF,Easy,85,87
8998,10,Female,Engineer,Low,Intermediate,Intermediate,C++,3.77,College,Hard,122,86


In [6]:
df.columns

Index(['age', 'gender', 'parent_occupation', 'earning_class',
       'level_of_student', 'level_of_course', 'course_name',
       'time_spent_per_day', 'material_name', 'material_level',
       'iq_of_student', 'assessment_score'],
      dtype='object')

In [7]:
df['parent_occupation'].value_counts()

parent_occupation
Business          1538
Government Job    1512
Other             1500
Doctor            1491
Teacher           1481
Engineer          1478
Name: count, dtype: int64

In [8]:
ohe_features = ['gender', 'parent_occupation', 'course_name', 'material_name']
ordinal_features = ['earning_class', 'level_of_student', 'level_of_course', 'material_level']

# Define the ColumnTransformer with both encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_features),
        ('ordinal', OrdinalEncoder(), ordinal_features),
          # Scaling applied to all features
    ],
    remainder='passthrough'  # Keep any remaining columns
)

# Create a pipeline with preprocessing and a model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestRegressor(random_state=42))  # Use any desired model
])


In [9]:
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
pipeline.fit(x_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
y_pred = pipeline.predict(x_test)

In [13]:
r2_score(y_test,y_pred)

0.6893694881367909

In [14]:
x_train.columns

Index(['age', 'gender', 'parent_occupation', 'earning_class',
       'level_of_student', 'level_of_course', 'course_name',
       'time_spent_per_day', 'material_name', 'material_level',
       'iq_of_student'],
      dtype='object')

In [15]:
feature_names = ['age', 'gender', 'parent_occupation', 'earning_class',
                 'level_of_student', 'level_of_course', 'course_name',
                 'time_spent_per_day', 'material_name', 'material_level',
                 'iq_of_student']

# Given input data
sample_input = np.array([[13, 'Female', 'Government Job', 'High', 
                          'Advanced', 'Intermediate', 'Java', 
                          4.14, 'Tutor', 'Easy', 82]])

# Convert input to a DataFrame
sample_df = pd.DataFrame(sample_input, columns=feature_names)

# Predict using trained pipeline
predicted_score = pipeline.predict(sample_df)

# Print the prediction
print("Predicted Assessment Score:", predicted_score[0])

Predicted Assessment Score: 77.61


In [16]:
# import pickle
# with open("model.pkl", "wb") as file:
#     pickle.dump(pipeline, file)