In [None]:
# module/library importation
import pandas as pd
import numpy as np
import sklearn 
import plotly.express as px
import plotly.offline as pyo

In [None]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# switch Plotly into offline notebook mode before any figure is shown
pyo.init_notebook_mode()

In [None]:
# read the dataset; the relative path means the CSV must sit in the working directory
df = pd.read_csv('collegePlace.csv')

In [None]:
# preview the first 10 rows
df.head(10)

In [None]:
# summary statistics (by default pandas describes the numeric columns)
df.describe()

In [None]:
# column names, dtypes and non-null counts
df.info()

Data Cleaning

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# how often each Gender value occurs
df['Gender'].value_counts()

In [None]:
# how often each Stream value occurs (before renaming)
df['Stream'].value_counts()

In [None]:
# give the engineering streams an explicit " Engineering" suffix
eng_streams = ['Mechanical', 'Electronics And Communication', 'Electrical', 'Civil']
renamed = {stream: stream + ' Engineering' for stream in eng_streams}

# a single dict-based replace swaps every exact match in one pass
df['Stream'] = df['Stream'].replace(renamed)

df['Stream'].unique()

In [None]:
# re-check the stream frequencies after the renaming step
df['Stream'].value_counts()

In [None]:
# preview the first 10 rows again to confirm the renamed streams
df.head(10)

In [None]:
# distinct values present in the Hostel column
df['Hostel'].unique()

In [None]:
# lowest CGPA in the dataset
df['CGPA'].min()

In [None]:
# highest CGPA in the dataset
df['CGPA'].max()

### Data Visualization

In [None]:
# tally the students in each stream; most_common() returns
# (stream, count) pairs ordered from most to least frequent
program = Counter(df['Stream']).most_common()
value = [stream for stream, _ in program]
count = [total for _, total in program]

fig = px.bar(
    program,
    x=value,
    y=count,
    color=value,
    labels={'x': "programs", 'y': "count"},
    title="Programs offered by students(count)",
)
fig.show()

Visualizing data for Computer Science students only

In [None]:
# keep only the Computer Science rows in a separate frame
is_cs = df['Stream'] == 'Computer Science'
df2 = df[is_cs]

df2

In [None]:
# gender split among the Computer Science students
# most_common() returns (gender, count) pairs, most frequent first
gender = Counter(df2['Gender']).most_common()
value, count = [], []
for label, total in gender:
    value.append(label)
    count.append(total)
title = "Males and Females who offered Computer Science"

# same counts shown twice: as proportions (pie) and as absolute numbers (bar)
fig1 = px.pie(gender, names=value, values=count, color=value, hole=0.2, title=title)
fig2 = px.bar(gender, x=value, y=count, color=value, labels={'x': 'Gender', 'y': 'Count'}, width=850)

for chart in (fig1, fig2):
    chart.show()

In [None]:
# the (gender, count) pairs that fed the two charts above
gender

In [None]:
# students placed/not placed
counter = Counter(df['PlacedOrNot'])
# (class value, count) pairs ordered from most to least frequent
placed = counter.most_common()
value = [flag for flag, _ in placed]
count = [total for _, total in placed]

# Derive each slice label from its class value (1 = placed). most_common()
# orders by frequency, so a hard-coded ['Placed', 'Not placed'] list would
# mislabel the slices whenever the not-placed class is the larger one.
names = ['Placed' if flag == 1 else 'Not placed' for flag in value]

fig = px.pie(placed, names=names, values=count, color=value, hole=0.2, title='Percentage of students placed/not placed')
fig.show()

In [None]:
# distribution of CGPA values
# most_common() returns (cgpa, count) pairs, most frequent first
cgpa = Counter(df['CGPA']).most_common()
value = [score for score, _ in cgpa]
count = [total for _, total in cgpa]

fig = px.bar(
    cgpa,
    x=value,
    y=count,
    color=value,
    labels={'x': 'CGPA', 'y': 'count'},
    width=850,
    title='Count of CGPA of students',
)
fig.show()

Data Prediction

In [None]:
# feature encoding: turn the text columns into integer codes
# (fit() returns the encoder itself, so it can be chained at construction)
le_gender = LabelEncoder().fit(df['Gender'])
df['Gender'] = le_gender.transform(df['Gender'])

le_stream = LabelEncoder().fit(df['Stream'])
df['Stream'] = le_stream.transform(df['Stream'])

# this encoder is fitted on the literal answers 'Yes'/'No' rather than on a dataframe column
le_backlog = LabelEncoder().fit(['Yes', 'No'])
backlog = le_backlog.transform(['Yes', 'No'])

In [None]:
# distinct integer codes now stored in Stream after label encoding
df['Stream'].unique()

In [None]:
# feature selection
# X: every column except 'Hostel' and the target; y: the 'PlacedOrNot' target.
# Column names are passed via `columns=` instead of handing drop() a DataFrame,
# which only worked because iterating a DataFrame yields its column labels.
X = df.drop(columns=['Hostel', 'PlacedOrNot'])
y = df['PlacedOrNot']

X

In [None]:
# rescale every selected feature with min-max scaling
# (fit and transform written as two explicit steps)
scaler = MinMaxScaler().fit(X)
scaled_X = scaler.transform(X)

In [None]:
# inspect the scaled feature matrix
scaled_X

In [None]:
# splitting dataset
# Split the raw features first, then fit the scaler on the training rows only.
# Fitting it on the full dataset lets test-set statistics leak into training
# and makes the evaluation scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
# the test rows are only transformed, using the ranges learned from the training rows
X_test = scaler.transform(X_test_raw)

Decision Tree Classifier

In [None]:
# model selection and training (trial phase)
model = DecisionTreeClassifier(random_state=23)

# 'sqrt' stands in for the former 'auto' option: for classifiers 'auto' meant
# sqrt(n_features), and newer scikit-learn releases reject 'auto' outright.
params = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_features': ['sqrt', 'log2']}

# exhaustive search over the grid with 5-fold cross-validation
model_cv = GridSearchCV(model, params, cv=5)
model_cv.fit(X_train, y_train)

In [None]:
# predict the held-out test rows with the fitted grid search
prediction = model_cv.predict(X_test)

In [None]:
# report the winning grid point, its cross-validation score and the test-set F1
print(f'Best parameters: {model_cv.best_params_}')
print(f'Best score: {model_cv.best_score_:.3}')
test_f1 = f1_score(y_test, prediction)
print(f'F1 score: {test_f1:.3}')

In [None]:
# refitting the best parameters of each model to the data
# each model was tested with GridSearchCV to find the best parameters used below
# 'sqrt' is the explicit spelling of the old 'auto' setting for classifiers;
# 'auto' is rejected by newer scikit-learn releases.
model = DecisionTreeClassifier(random_state=23, criterion='entropy', max_features='sqrt', splitter='best')
model2 = RandomForestClassifier(random_state=23, n_estimators=280, criterion='entropy', max_features='sqrt')
model3 = KNeighborsClassifier(n_neighbors=7, weights='uniform')

In [None]:
# train the decision tree, random forest and k-nearest-neighbours models on the same training split
model.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)

In [None]:
# test-set predictions, in order: decision tree, random forest, k-nearest neighbours
prediction, prediction2, prediction3 = (
    clf.predict(X_test) for clf in (model, model2, model3)
)

In [None]:
# test-set metrics for the decision tree classifier
print(f"F1 score: {f1_score(y_test, prediction):.3f}")
print(f"Accuracy score: {accuracy_score(y_test, prediction):.3f}")

In [None]:
# test-set metrics for the random forest classifier
print(f"F1 score: {f1_score(y_test, prediction2):.3f}")
print(f"Accuracy score: {accuracy_score(y_test, prediction2):.3f}")

In [None]:
# test-set metrics for the k-nearest-neighbours classifier
print(f"F1 score: {f1_score(y_test, prediction3):.3f}")
print(f"Accuracy score: {accuracy_score(y_test, prediction3):.3f}")

Final model selection

In [None]:
# selecting random forest as the final model for prediction
# rescaling and fitting of the model
scaler = StandardScaler()
# the scaler and the final model are both fitted on the full, original X
x_scaled = scaler.fit_transform(X)

# 'sqrt' is the explicit spelling of the old 'auto' setting for classifiers;
# 'auto' is rejected by newer scikit-learn releases.
model = RandomForestClassifier(random_state=23, n_estimators=280, criterion='entropy', max_features='sqrt')
model.fit(x_scaled, y)

In [None]:
import pickle

# bundle the model with its label encoders; the scaler goes into its own file
data = {'model': model, 'le_gender': le_gender, 'le_stream': le_stream, 'le_backlog': le_backlog}

with open('model.sav', 'wb') as model_file:
    pickle.dump(data, model_file)

with open('scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# reload the model/encoder bundle from model.sav
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
with open('model.sav', 'rb') as file:
    data = pickle.load(file)

In [None]:
# reload the fitted scaler from scaler.sav
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
with open('scaler.sav', 'rb') as file:
    scaler = pickle.load(file)

#### Testing the model

In [None]:
# pull the model and the three label encoders back out of the loaded bundle
model, le_gender, le_stream, le_backlog = (
    data[key] for key in ('model', 'le_gender', 'le_stream', 'le_backlog')
)

In [None]:
# build one hand-written sample and encode its text fields
# NOTE(review): the field order presumably mirrors the columns of X — confirm against the CSV
raw_sample = [22, 'Male', 'Computer Science', 5, 8, 'No']
encoders = {1: le_gender, 2: le_stream, 5: le_backlog}  # column position -> fitted encoder

# mixing numbers and text makes numpy store every field as a string at first
x = np.array([raw_sample])
for col, encoder in encoders.items():
    x[:, col] = encoder.transform(x[:, col])
# every field is numeric text by now, so the whole row converts to integers
x = x.astype(int)

In [None]:
# The final model was fitted on StandardScaler-transformed features, so the
# sample must pass through the same fitted scaler before prediction; feeding
# raw values puts it on a different scale from the training data.
prediction = model.predict(scaler.transform(x))

# predict() returns one label per input row; there is a single row here
if prediction[0] == 1:
    print('Student is placed')
else:
    print('Student not placed')