In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the raw student records; the path is resolved relative to the
# notebook's working directory.
DATASET_PATH = 'student_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
def _parse_skills(value):
    """Turn one raw ``Skills`` cell into a list of skill names.

    Accepts the raw comma-separated string, an already-parsed list, or a
    missing value:

    * a list is returned unchanged, so re-running this cell is harmless;
    * a missing value becomes an empty list;
    * a string is split on commas, each name is stripped of surrounding
      whitespace and empty fragments are dropped, so "Java,Python" and
      "Java, Python" produce the same labels.
    """
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return [name.strip() for name in str(value).split(',') if name.strip()]


# Replace the comma-separated text with one list of skills per student,
# which is the input shape the multi-label encoding step expects.
df['Skills'] = df['Skills'].apply(_parse_skills)

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [5]:
# Multi-hot encode the per-student skill lists: one 0/1 column per distinct
# skill, aligned to df's index so it can be joined back column-wise.
mlb_zoho = MultiLabelBinarizer()
skill_matrix = mlb_zoho.fit_transform(df['Skills'])
skills_encoded = pd.DataFrame(
    skill_matrix,
    columns=mlb_zoho.classes_,
    index=df.index,
)

In [6]:
# Base feature columns taken straight from df. Despite the variable name,
# Gender and Student_Department are still raw text at this point.
feature_columns = [
    'CGPA',
    'Prior_Internship_Experience(Integer)',
    'Aptitude_Score',
    'Communication_Skill_Score(out_of_10)',
    'Gender',
    'Student_Department',
]
numerical_data = df[feature_columns]

In [7]:
# Feature frame: the base columns followed by the 0/1 skill indicator columns.
feature_parts = [numerical_data, skills_encoded]
X = pd.concat(feature_parts, axis='columns')

In [8]:
# Show column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Gender                                600 non-null    object 
 1   CGPA                                  600 non-null    float64
 2   Skills                                600 non-null    object 
 3   Student_Department                    600 non-null    object 
 4   Prior_Internship_Experience(Integer)  600 non-null    int64  
 5   Aptitude_Score                        600 non-null    int64  
 6   Communication_Skill_Score(out_of_10)  600 non-null    float64
 7   Zoho_hired                            600 non-null    object 
 8   Accenture_hired                       600 non-null    object 
 9   IBM_hired                             600 non-null    object 
 10  Cognizant_hired                       600 non-null    object 
 11  Knows Coding Langua

In [9]:
# (rows, columns) of the combined feature frame.
X.shape

(600, 14)

In [10]:
# Preview the combined feature frame; Gender and Student_Department are still text here.
X

Unnamed: 0,CGPA,Prior_Internship_Experience(Integer),Aptitude_Score,Communication_Skill_Score(out_of_10),Gender,Student_Department,C++,Communication,HTML/CSS,Java,JavaScript,Problem Solving,Python,Teamwork
0,7.6,2,95,4.5,Female,MECH,1,1,0,1,0,1,0,0
1,7.6,1,98,4.9,Female,ECE,0,1,0,1,0,1,1,0
2,9.0,2,65,7.7,Female,AI & DS,1,0,1,1,0,1,0,0
3,8.7,1,85,5.0,Female,CSBS,1,1,0,0,0,0,0,0
4,7.9,0,51,6.0,Male,CSBS,0,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,6.3,1,75,6.0,Female,CSBS,0,1,0,0,1,0,0,1
596,8.5,2,50,9.2,Female,AI & ML,0,0,0,0,0,0,1,0
597,7.0,2,59,6.1,Male,AI & DS,0,0,1,0,1,1,0,0
598,8.0,2,65,9.8,Female,AI & DS,0,1,0,0,0,0,0,0


In [11]:
#df = pd.concat([df, skills_encoded], axis=1)

In [12]:
#df.head(2)

In [13]:
# Encode the Zoho hiring outcome as integer class ids and write them back
# into df, replacing the original text labels in that column.
label_encoder = LabelEncoder()
zoho_labels = label_encoder.fit_transform(df['Zoho_hired'])
df['Zoho_hired'] = zoho_labels

In [14]:
# Input features: X is the frame assembled earlier from the selected columns
# plus the skill indicators (an older df.drop(...)-based construction of X
# was retired in favour of that).
# Output feature: the encoded Zoho hiring label.
y = df['Zoho_hired']

In [15]:
# Re-check the feature frame's (rows, columns) before it is encoded.
X.shape

(600, 14)

In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [17]:
# One-hot encode the two text columns and pass every other column through
# unchanged. The columns are selected by name rather than by position so the
# encoder cannot silently pick up the wrong columns if the feature order
# changes, and so that fitting on an already-encoded array fails loudly
# instead of re-encoding whatever sits at positions 4 and 5.
# sparse_threshold=0 forces a dense result, which the np.array(...) wrapping
# of the transformed output relies on.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), ['Gender', 'Student_Department']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [18]:
import numpy as np

In [19]:
# Fit the column transformer and apply it to the feature frame.
# NOTE: X is rebound from a DataFrame to a plain ndarray here, so this cell
# must not be re-run without first rebuilding X from the cells above.
encoded_features = ct.fit_transform(X)
X = np.array(encoded_features)

In [20]:
# Inspect the encoded feature array.
X

array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [23]:
# Inspect the standardised feature array.
X_scaled

array([[ 0.95437311, -0.95437311, -0.43643578, ...,  1.46333908,
        -0.63651569, -0.69388867],
       [ 0.95437311, -0.95437311, -0.43643578, ...,  1.46333908,
         1.57105318, -0.69388867],
       [ 0.95437311, -0.95437311,  2.29128785, ...,  1.46333908,
        -0.63651569, -0.69388867],
       ...,
       [-1.04780824,  1.04780824,  2.29128785, ...,  1.46333908,
        -0.63651569, -0.69388867],
       [ 0.95437311, -0.95437311,  2.29128785, ..., -0.68336862,
        -0.63651569, -0.69388867],
       [-1.04780824,  1.04780824, -0.43643578, ..., -0.68336862,
        -0.63651569,  1.44115338]])

In [24]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Fit a random forest with default hyper-parameters on the training split;
# random_state pins the forest's internal randomness so the run is repeatable.
# Test-set predictions are produced by the following cell, so they are not
# computed a second time here.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [26]:
# Predict the encoded hiring label for each held-out test row.
y_pred = classifier.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score, classification_report

In [28]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [29]:
import joblib

In [30]:
# The classifier was trained on one-hot-encoded, standardised features, so the
# fitted ColumnTransformer and StandardScaler are saved next to it; without
# them new inputs cannot be prepared the same way before calling predict.
joblib.dump(ct, 'ct_zoho.pkl')
joblib.dump(scaler, 'scaler_zoho.pkl')
# Kept as the last expression so the cell still displays ['Zoho.pkl'].
joblib.dump(classifier, 'Zoho.pkl')

['Zoho.pkl']

In [31]:
# Persist the fitted skills binarizer so the same skill columns can be
# rebuilt for new inputs.
joblib.dump(value=mlb_zoho, filename='mlb_zoho.pkl')

['mlb_zoho.pkl']