In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error

In [24]:
# Load the student records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="k12_dataset.csv")

In [25]:
# Preview the frame exactly as loaded (bare last expression -> notebook display).
df

Unnamed: 0,name,age,gender,country,state,city,parent_occupation,earning_class,level_of_student,level_of_course,course_name,time_spent_per_day,material_name,material_level,iq_of_student,assessment_score,promoted_or_skipped
0,Student_1,13,Female,India,Karnataka,Hubli,Government Job,High,Advanced,Intermediate,Java,4.14,Tutor,Easy,82,87,Promoted
1,Student_2,11,Male,India,Karnataka,Mysore,Business,Middle,Advanced,Basic,C++,5.25,Tutor,Hard,149,100,Promoted
2,Student_3,13,Male,India,Tamil Nadu,Chennai,Business,Middle,Intermediate,Basic,C++,4.80,PDF,Medium,95,79,Promoted
3,Student_4,11,Female,India,Karnataka,Bangalore,Doctor,Middle,Beginner,Intermediate,C++,5.35,PDF,Medium,115,94,Promoted
4,Student_5,8,Female,India,Tamil Nadu,Coimbatore,Doctor,Low,Beginner,Basic,JavaScript,3.42,Online Learning,Medium,148,100,Promoted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,Student_8996,11,Female,India,Karnataka,Hubli,Other,High,Intermediate,Advanced,Python,5.71,Tutor,Easy,111,88,Promoted
8996,Student_8997,6,Male,India,West Bengal,Howrah,Teacher,High,Beginner,Basic,SQL,6.65,Book,Easy,150,100,Promoted
8997,Student_8998,15,Female,India,West Bengal,Kolkata,Teacher,High,Beginner,Basic,Java,5.90,PDF,Easy,85,87,Promoted
8998,Student_8999,10,Female,India,Maharashtra,Mumbai,Engineer,Low,Intermediate,Intermediate,C++,3.77,College,Hard,122,86,Promoted


In [26]:
# Remove the name and location columns; they are not used as model features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns=['name', 'country', 'state', 'city'], errors='ignore')

In [27]:
# Preview the frame after the name/location columns were removed.
df

Unnamed: 0,age,gender,parent_occupation,earning_class,level_of_student,level_of_course,course_name,time_spent_per_day,material_name,material_level,iq_of_student,assessment_score,promoted_or_skipped
0,13,Female,Government Job,High,Advanced,Intermediate,Java,4.14,Tutor,Easy,82,87,Promoted
1,11,Male,Business,Middle,Advanced,Basic,C++,5.25,Tutor,Hard,149,100,Promoted
2,13,Male,Business,Middle,Intermediate,Basic,C++,4.80,PDF,Medium,95,79,Promoted
3,11,Female,Doctor,Middle,Beginner,Intermediate,C++,5.35,PDF,Medium,115,94,Promoted
4,8,Female,Doctor,Low,Beginner,Basic,JavaScript,3.42,Online Learning,Medium,148,100,Promoted
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,11,Female,Other,High,Intermediate,Advanced,Python,5.71,Tutor,Easy,111,88,Promoted
8996,6,Male,Teacher,High,Beginner,Basic,SQL,6.65,Book,Easy,150,100,Promoted
8997,15,Female,Teacher,High,Beginner,Basic,Java,5.90,PDF,Easy,85,87,Promoted
8998,10,Female,Engineer,Low,Intermediate,Intermediate,C++,3.77,College,Hard,122,86,Promoted


In [28]:
# Look at 10 random rows; the fixed seed makes the same rows appear on every run.
df.sample(10, random_state=42)

Unnamed: 0,age,gender,parent_occupation,earning_class,level_of_student,level_of_course,course_name,time_spent_per_day,material_name,material_level,iq_of_student,assessment_score,promoted_or_skipped
4382,14,Female,Business,Middle,Advanced,Intermediate,SQL,5.51,Book,Medium,135,96,Promoted
6636,6,Female,Engineer,Low,Intermediate,Advanced,Java,5.7,Book,Medium,149,100,Promoted
3350,17,Male,Other,Low,Beginner,Intermediate,JavaScript,4.66,PDF,Easy,132,95,Promoted
3951,6,Female,Government Job,Middle,Intermediate,Advanced,JavaScript,5.64,College,Medium,145,92,Promoted
5478,17,Male,Doctor,Middle,Advanced,Advanced,Java,5.55,College,Easy,105,89,Promoted
8170,12,Male,Doctor,Middle,Intermediate,Advanced,Java,4.61,PDF,Easy,112,86,Promoted
7720,12,Male,Teacher,Low,Beginner,Intermediate,SQL,3.93,Tutor,Medium,143,89,Promoted
8896,12,Female,Engineer,Middle,Intermediate,Advanced,Java,5.16,PDF,Easy,159,99,Promoted
3242,15,Female,Doctor,Middle,Advanced,Basic,SQL,4.79,PDF,Medium,147,95,Promoted
189,7,Female,Government Job,Low,Intermediate,Intermediate,Java,6.27,Book,Medium,109,94,Promoted


In [32]:
# Column groups, split by the kind of encoding each one receives.
ohe_features = ['gender', 'parent_occupation', 'course_name', 'material_name']
ordinal_features = ['earning_class', 'level_of_student', 'level_of_course', 'material_level']
target_column = 'promoted_or_skipped'  # Target column for label encoding

# Low-to-high level order for each ordinal feature, aligned with `ordinal_features`.
# Without explicit categories OrdinalEncoder sorts the labels alphabetically
# (e.g. High < Low < Middle), which does not reflect the real ordering.
# NOTE(review): the level names are taken from the rows displayed earlier in
# this notebook -- confirm against df[ordinal_features] on the full data.
ordinal_categories = [
    ['Low', 'Middle', 'High'],                 # earning_class
    ['Beginner', 'Intermediate', 'Advanced'],  # level_of_student
    ['Basic', 'Intermediate', 'Advanced'],     # level_of_course
    ['Easy', 'Medium', 'Hard'],                # material_level
]

# Label encoder for the target variable.
# It is fitted on the full target column of `df` because the train/test split
# is created further down; referencing y_train here fails on a fresh
# top-to-bottom run. The classifier below is trained on the string labels
# directly, so the encoder is only needed if integer labels are wanted, e.g.
# label_encoder.transform(y_train) once the split exists.
label_encoder = LabelEncoder().fit(df[target_column])

# Define the ColumnTransformer: one-hot and ordinal encoding, no scaling
# (tree ensembles do not need scaled inputs).
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_features),
        (
            'ordinal',
            # Levels outside `ordinal_categories` are encoded as -1 instead of
            # raising, both while fitting and while predicting.
            OrdinalEncoder(
                categories=ordinal_categories,
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
            ordinal_features,
        ),
    ],
    remainder='passthrough'  # Keep any remaining numerical columns
)

# Create a pipeline with preprocessing and a RandomForestClassifier model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42))
])

In [30]:
# Features are every column except the last one; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [31]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [34]:
# Predict labels for the held-out rows.
y_pred = pipeline.predict(X=x_test)

In [35]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [36]:
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [37]:
# Show the held-out accuracy.
# NOTE(review): a perfect score is unusual -- check the class balance of y_test
# and whether a feature (possibly assessment_score) directly determines the
# target before trusting this number.
accuracy

1.0

In [40]:
# Feature columns of the example row, using the same names as the training frame.
feature_names = ['age','gender','parent_occupation', 'earning_class',
                 'level_of_student', 'level_of_course', 'course_name',
                 'time_spent_per_day', 'material_name', 'material_level',
                 'iq_of_student','assessment_score']

# One example row, in `feature_names` order.
# A plain nested list is used on purpose: np.array would coerce the mixed
# values to a single string dtype, so age, time_spent_per_day, iq_of_student
# and assessment_score would reach the pipeline as text instead of numbers.
sample_input = [[13, 'Female', 'Government Job', 'High',
                 'Advanced', 'Intermediate', 'Java',
                 4.14, 'Tutor', 'Easy', 82, 87]]

# Build the frame from the nested list so pandas infers a dtype per column
# (integers and floats stay numeric, labels stay strings).
sample_df = pd.DataFrame(sample_input, columns=feature_names)

# Predict using trained pipeline
predicted_result = pipeline.predict(sample_df)

# Print the prediction
print("Result:", predicted_result[0])

Result: Promoted


In [None]:
# import pickle
# with open("Promotion.pkl", "wb") as file:
#      pickle.dump(pipeline, file)