In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents
* <a href="#intro">Introduction</a>
* <a href="#dataprep">Data preparation</a>
* <a href="#datavis">Data visualization and exploration</a>
* <a href="#featureselection">Feature selection</a>
* <a href="#hyptuning">Hyperparameter tuning</a>
* <a href="#modelcomp">Model comparison and selection</a>
* <a href="#test">Testing the final model</a>
* <a href="#conclusion">Conclusion</a>
* <a href="#references">References</a>
    

<a id="intro"> </a>
# Introduction

Placements are a very important part of a college curriculum. Predicting which students have a higher chance of being placed can help in shortlisting students for specific companies, thus saving time and increasing placement efficiency.
This kind of work may give rise to some ethical issues; in this notebook we are not going to address or discuss those issues.
This is just an attempt to see how various models work on this dataset and to carry out data mining tasks on it.
The notebook prepares the data and creates visualizations to understand the effect of different factors on a student's placement status. It also explains the feature selection procedure and how to test whether the selected features give significantly improved results or not.
It also carries out hyperparameter tuning of the models and selects the best one. The notebook demonstrates hyperparameter tuning of the following models: Decision Tree, KNN, SVC, Random Forest and Naive Bayes.

<a id="section1"> </a>
# **Importing the required packages and libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.figure as figure
from matplotlib.pyplot import gcf
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import feature_selection as fs
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier

<a id="dataprep"> </a>
# **Data preparation**

# Importing the data and inspecting the dtypes

In [None]:
# Read the campus-placement CSV from the Kaggle input directory and preview
# its first rows.
placement_csv_path = "../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
placement_data = pd.read_csv(placement_csv_path)
placement_data.head()

In [None]:
#data preparation: displaying the dtype pandas inferred for every column
placement_data.dtypes

In [None]:
#removing the columns that are not used as features: the serial number and the salary
# NOTE(review): presumably 'salary' is dropped because it is only known after placement — confirm
final_df = placement_data.drop(['sl_no', 'salary'], axis=1)

In [None]:
#columns stored with the object dtype are treated as the categorical features
is_object_column = final_df.dtypes == object
categorical_columns = final_df.columns[is_object_column]

#listing the distinct levels of every categorical feature
for name in categorical_columns:
    levels = final_df[name].unique()
    print("Unique values for ", name, ": ", levels)

In [None]:
#every column that is not of object dtype is treated as a descriptive (numeric) feature
is_numeric_column = final_df.dtypes != object
descriptive_columns = final_df.columns[is_numeric_column]

#summary statistics of the descriptive features
final_df[descriptive_columns].describe()
    

In [None]:
#checking if there are any NAs: number of missing values per column
print(final_df.isna().sum())
# len() counts rows; the non-null count of a single column would under-report
# the number of records if that column ever contained missing values
print("\nNumber of records: ", len(final_df))

<a id="datavis"> </a>
# Data visualization and exploration

In [None]:
#exploring the distribution of each descriptive (numeric) feature
figure, axes = plt.subplots(3, 2, figsize=(11, 11))
# the 3x2 grid has six panels but only five are needed, so the last one is removed
figure.delaxes(axes[2, 1])

figure.suptitle('Distribution of descriptive features')

# zip pairs each feature with the next panel (row by row) and stops after the
# last feature, so no manual counter or break is needed
for a, column in zip(axes.flat, descriptive_columns):
    column_mean = final_df[column].mean()
    sns.histplot(ax=a, data=final_df, x=column, bins=30)
    a.axvline(column_mean, color='red', linestyle='--',
              label="Average = " + str(round(column_mean, 2)))
    # the label passed to axvline is only shown once a legend is drawn
    a.legend()

plt.show()

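We have data for 215 students. The histograms above show that every descriptive feature is roughly bell-shaped, so we treat their distributions as approximately normal. (The central limit theorem describes the distribution of sample means rather than of the raw features, so this is a visual judgement, not a guarantee.)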

In [None]:
#one count plot per categorical feature, laid out on a 4x2 grid
figure, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 16))
sns.set_theme(style="whitegrid")
figure.suptitle('Distribution of categorical features')

# walk the panels row by row, pairing each with the categorical column at the
# same position
for position, panel in enumerate(axes.flat):
    sns.countplot(ax=panel, data=final_df, x=categorical_columns[position])

plt.show()

An imbalance can be observed for status which is our target variable. Hence, we will be using stratified  cross validation.

In [None]:
#visualising effect of other categorical features on 'status'
figure, axes = plt.subplots(4, 2, figsize=(10, 16))
# only seven panels are used, so the last one of the 4x2 grid is removed
figure.delaxes(axes[3, 1])
sns.set_theme(style="whitegrid")
figure.suptitle('Status vs other categorical features')

# 'status' is filtered out before pairing features with panels, so the layout
# does not depend on where 'status' sits among the categorical columns
other_categorical_columns = [column for column in categorical_columns if column != 'status']

for a, column in zip(axes.flat, other_categorical_columns):
    sns.countplot(ax=a, x="status", hue=column, data=final_df,
                  palette=['#432371', "#FAAE7B", "#1082a8"])

plt.show()

From the above visualization, we can conclude that specialisation has an influence on the placement decision.

In [None]:
#box plots of every descriptive feature, split by placement status
figure, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 17))
# the sixth panel of the 3x2 grid is not needed
figure.delaxes(axes[2, 1])

figure.suptitle('Status vs other descriptive feature')

# panels are visited row by row; the loop stops once five features are drawn
for position, panel in enumerate(axes.flat):
    if position == 5:
        break
    sns.boxplot(ax=panel, x="status", y=descriptive_columns[position], data=final_df)

plt.show()

The box plots suggest that the scores matter a lot in placement decisions. Scores appear to have more importance in student selection than work experience.

 <a id="dataprepML"> </a>
 # Preparing the data for machine learning tasks - Feature selection, hyper parameter tuning and model selection.

In [None]:
#encoding the categorical variables as indicator columns
# two-level features are collapsed into a single indicator column that keeps
# the original column name (drop_first removes the first level)
binary_columns = [name for name in categorical_columns
                  if len(final_df[name].unique()) == 2]
for name in binary_columns:
    final_df[name] = pd.get_dummies(final_df[name], drop_first=True)

# the remaining categorical columns are expanded by get_dummies into one
# indicator column per level
final_df = pd.get_dummies(final_df)

In [None]:
#min-max scaling of every column (each column is scaled independently to the [0, 1] range)
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test
# split performed further down — confirm this is acceptable for the evaluation
scaled_values = preprocessing.MinMaxScaler().fit_transform(final_df)
final_df = pd.DataFrame(scaled_values, columns=final_df.columns, index=final_df.index)

In [None]:
#observing the first 10 rows of the final dataframe created after encoding and min-max scaling
final_df.head(10)

In [None]:
#separating the target variable ('status') from the descriptive features
target = final_df['status']
data = final_df.drop('status', axis=1)

<a id="featureselection"> </a>
# **Feature selection**

In [None]:
#settings shared by the feature-selection and tuning cells

#metric reported by every cross validation / grid search
scoring_metric = 'accuracy'

#number of top-ranked features each selection method keeps
num_features = 5

#cross validation scheme: stratified 5-fold, repeated 3 times with a fixed seed
cross_validation = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)

In [None]:
#cross-validating a decision tree on the full set of features (the baseline)
# the tree is seeded so that this baseline, and every later score computed
# with dt_clf, is reproducible between runs
dt_clf = DecisionTreeClassifier(random_state=999)

#generating the cross validation score (one accuracy value per fold and repeat)
cross_validation_full = cross_val_score(estimator=dt_clf,
                                        X=data,
                                        y=target,
                                        cv=cross_validation,
                                        scoring=scoring_metric)

#printing the mean accuracy over all folds
print("\033[1m Accuracy on full set of features (DT classifier): \033[0m" + str(cross_validation_full.mean().round(3)))

In [None]:
#feature selection by mutual information
# mutual_info_classif is a randomised estimator; fixing its random_state keeps
# the scores, and therefore the selected features, the same on every run
def seeded_mutual_info(X, y):
    """Score features with mutual information using a fixed random seed."""
    return fs.mutual_info_classif(X, y, random_state=999)

mutual_info = fs.SelectKBest(seeded_mutual_info, k=num_features)

#fitting the selector; only the scores are needed, so nothing is transformed
mutual_info.fit(data, target)

#indices of the num_features best features, highest score first
indices_mutual_info = np.argsort(mutual_info.scores_)[::-1][0:num_features]

#names of the selected features (displayed as the cell output)
best_features_mi = data.columns[indices_mutual_info].values
best_features_mi

In [None]:
#scores of the features selected by mutual information
feature_importances_mi = mutual_info.scores_[indices_mutual_info]

#visualizing the selected features and their scores
plt.bar(best_features_mi, feature_importances_mi, color='cadetblue')
plt.xlabel("Features")
# the score belongs on the y axis; a second xlabel call would overwrite "Features"
plt.ylabel("Importance score")
plt.title("Top 5 features selected by Mutual information")
plt.xticks(fontsize=8, rotation=45)
plt.show()

In [None]:
#cross-validated accuracy of the decision tree restricted to the features
#selected by mutual information
features_mi = data.iloc[:, indices_mutual_info]
cv_results_mi = cross_val_score(dt_clf, features_mi, target,
                                scoring=scoring_metric, cv=cross_validation)

#printing the mean score over all folds
mean_score_mi = cv_results_mi.mean().round(3)
print("\033[1m Cross validation accuracy score on features selected by mutual info (DT classifier): \033[0m" + str(mean_score_mi))

In [None]:
#creating random forest classifier with 100 estimators
# seeded so that the importance ranking is identical on every run
rfi = RandomForestClassifier(n_estimators=100, random_state=999)

#fitting the data
rfi.fit(data, target)

#indices of the num_features most important features, most important first
indices_rfi = np.argsort(rfi.feature_importances_)[::-1][0:num_features]

In [None]:
#names of the features ranked highest by the random forest (displayed as the cell output)
selected_columns_rfi = data.columns[indices_rfi]
best_features_rfi = selected_columns_rfi.values
best_features_rfi

In [None]:
#importances of the features selected by the random forest
feature_importances_rfi = rfi.feature_importances_[indices_rfi]

#visualizing the selected features and their importances
plt.bar(best_features_rfi, feature_importances_rfi, color='cadetblue')
plt.xlabel("Features")
# the importance belongs on the y axis; a second xlabel call would overwrite "Features"
plt.ylabel("Importance score")
plt.title("Top 5 features selected by Random forest importance")
plt.xticks(fontsize=8, rotation=45)
plt.show()

In [None]:
#cross-validated accuracy of the decision tree restricted to the features
#ranked highest by the random forest
features_rfi = data.iloc[:, indices_rfi]
cv_results_rfi = cross_val_score(dt_clf, features_rfi, target,
                                 scoring=scoring_metric, cv=cross_validation)

#printing the mean score over all folds
mean_score_rfi = cv_results_rfi.mean().round(3)
print("\033[1m Cross validation accuracy score on features selected by rfi (DT classifier): \033[0m" + str(mean_score_rfi))

In [None]:
#feature selection by F-score
fit_fscore = fs.SelectKBest(score_func=fs.f_classif, k=num_features)

#fitting the selector so that its scores_ attribute is populated
fit_fscore.fit(data, target)

#ranking the features: NaN scores are replaced by 0 before sorting, then the
#num_features highest-scoring indices are kept (best first)
ranked_indices = np.argsort(np.nan_to_num(fit_fscore.scores_))[::-1]
indices_fscore = ranked_indices[0:num_features]

#names of the selected features (displayed as the cell output)
best_features_fscore = data.columns[indices_fscore].values
best_features_fscore

In [None]:
#scores of the features selected by F-score
feature_importances_fscore = fit_fscore.scores_[indices_fscore]

#visualizing the selected features and their scores
# the F-score features and scores are plotted here (not the random-forest ones),
# so the chart matches its title
plt.bar(best_features_fscore, feature_importances_fscore, color='cadetblue')
plt.xlabel("Features")
# the score belongs on the y axis; a second xlabel call would overwrite "Features"
plt.ylabel("Importance score")
plt.title("Top 5 features selected by F-score")
plt.xticks(fontsize=8, rotation=45)
plt.show()

In [None]:
#calculating the cross validation accuracy on the features selected by F-score
cv_results_fscore = cross_val_score(estimator=dt_clf,
                                    X=data.iloc[:, indices_fscore],
                                    y=target,
                                    cv=cross_validation,
                                    scoring=scoring_metric)
#printing the mean score; the message names the F-score selection method
print("\033[1m Cross validation accuracy score on features selected by F-score (DT classifier): \033[0m" + str(cv_results_fscore.mean().round(3)))

In [None]:
#collecting the mean cross validation accuracy of every feature selection method
#labels and scores are kept in the same order so they can be plotted against each other
set_of_feature_selection_method = [
    'Full Set of Features',
    'F-Score',
    'Mutual Information',
    'RFI',
]
set_of_scores = [
    fold_scores.mean().round(3)
    for fold_scores in (cross_validation_full, cv_results_fscore, cv_results_mi, cv_results_rfi)
]

In [None]:
#visualizing the cross validation scores given by different feature selection methods
plt.bar(set_of_feature_selection_method, set_of_scores, color='cadetblue')
plt.xlabel("Feature selection method")
# the score belongs on the y axis; a second xlabel call would overwrite the method label
plt.ylabel("Cross validation score")
plt.title("Comparing different feature selection methods")
plt.xticks(fontsize=8, rotation=45)
plt.show()

As can be observed, mutual information and F-score have better scores than the others. But are these scores significantly higher? We will now check this using a paired t-test of significance.

In [None]:
#paired t-tests comparing the full-feature scores with each reduced feature set
#p-values are printed in the order: F-score, mutual information, RFI
for reduced_scores in (cv_results_fscore, cv_results_mi, cv_results_rfi):
    test_result = stats.ttest_rel(cross_validation_full, reduced_scores)
    print(test_result.pvalue.round(3))

If all the p-values are greater than 0.05 (p > 0.05), the differences are not statistically significant. Hence, we cannot say that modelling with only the top 5 features gives us a better result than using the full set of features.

<a id="hyptuning"> </a>
# Hyperparameter tuning and model selection

In [None]:
#splitting the data into 80% train and 20% test with a fixed seed
train_data, test_data, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.20,
    random_state=999,
)

In [None]:
#previewing 10 log-spaced candidate values for var_smoothing, from 1 (10**0) down to 10**-9
np.logspace(0,-9, num=10)

In [None]:
#seeding numpy's global random generator
np.random.seed(999)

#Gaussian NB assumes normally distributed features, so the training data is
#power-transformed before the search
#NOTE(review): the transformer is fit on the whole training split, outside the
#cross validation folds — confirm this is intended
Data_transformed = PowerTransformer().fit_transform(train_data)

#candidate values for var_smoothing: 100 log-spaced points from 1 down to 1e-9
params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}

#default NB classifier wrapped in a grid search over those candidates
nb_classifier = GaussianNB()
gs_NB = GridSearchCV(nb_classifier,
                     params_NB,
                     scoring=scoring_metric,
                     cv=cross_validation,
                     verbose=1)

#running the search (the trailing semicolon hides the estimator repr)
gs_NB.fit(Data_transformed, train_target);


In [None]:
#getting the var_smoothing value selected by the NB grid search (shown as the cell output)
nb_best_params = gs_NB.best_params_
nb_best_params

In [None]:
#getting the best score found by the NB grid search (scored with scoring_metric)
nb_best_score = gs_NB.best_score_
nb_best_score

In [None]:
#defining the decision tree hyperparameter grid
# "auto" is deliberately not listed under max_features: for classifiers it was
# only an alias of "sqrt", and it was deprecated in scikit-learn 1.1 and removed
# in 1.3, where every fit using it fails
parameters = {"splitter": ["best", "random"],
              "max_depth": [1, 3, 5, 7, 9, 11, 12],
              "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              "min_weight_fraction_leaf": [0.1, 0.2, 0.3, 0.4, 0.5],
              "max_features": ["log2", "sqrt", None],
              "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]}

#creating the DT classifier; seeded so that the search result is reproducible
#(the "random" splitter and feature sub-sampling are otherwise different on each run)
dt_clf = DecisionTreeClassifier(random_state=999)

#creating the GridSearch algorithm on DT classifier
dt_grid = GridSearchCV(estimator=dt_clf,
                       param_grid=parameters,
                       scoring=scoring_metric,
                       cv=cross_validation,
                       verbose=1)

#running the exhaustive search on the training split
dt_grid.fit(train_data, train_target)


In [None]:
#getting the best hyperparameter combination found by the DT grid search (shown as the cell output)
dt_best_params = dt_grid.best_params_
dt_best_params

In [None]:
#getting the best score found by the DT grid search (scored with scoring_metric)
dt_best_score = dt_grid.best_score_
dt_best_score

In [None]:
#hyperparameter grid for the support vector classifier
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

#default SV classifier wrapped in a grid search; refit=True retrains the best
#combination once the search is finished
svc_clf = SVC()
svc_grid = GridSearchCV(svc_clf,
                        param_grid,
                        scoring=scoring_metric,
                        cv=cross_validation,
                        refit=True,
                        verbose=1)

#running the search on the training split
svc_grid.fit(train_data, train_target)

In [None]:
#getting the best C / gamma / kernel combination found by the SVC grid search (shown as the cell output)
svc_best_params = svc_grid.best_params_
svc_best_params

In [None]:
#getting the best score found by the SVC grid search (scored with scoring_metric)
svc_best_score = svc_grid.best_score_
svc_best_score

In [None]:
#KNN grid: every neighbourhood size from 1 to 20, combined with three values of p
params_KNN = {
    'n_neighbors': list(range(1, 21)),
    'p': [1, 2, 5],
}

#default KNN classifier wrapped in a grid search; train scores are recorded as well
knn_clf = KNeighborsClassifier()
gs_KNN = GridSearchCV(knn_clf,
                      params_KNN,
                      scoring=scoring_metric,
                      cv=cross_validation,
                      return_train_score=True,
                      verbose=1)  # verbose: the higher, the more messages

#running the search on the training split
gs_KNN.fit(train_data, train_target)

In [None]:
#getting the best n_neighbors / p combination found by the KNN grid search (shown as the cell output)
knn_best_params = gs_KNN.best_params_
knn_best_params

In [None]:
#getting the best score found by the KNN grid search (scored with scoring_metric)
KNN_best_score = gs_KNN.best_score_
KNN_best_score

In [None]:
#defining random forest's parameters
# 'auto' is deliberately not listed under max_features: for classifiers it was
# only an alias of 'sqrt', and it was deprecated in scikit-learn 1.1 and removed
# in 1.3, where every fit using it fails
param_rfc = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [100, 200, 500],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'max_features': ['sqrt', 'log2']
}

#creating the seeded RandomForest classifier
rfc_clf = RandomForestClassifier(random_state=999)

#creating the GridSearch algorithm on RandomForest classifier
cv_rfc = GridSearchCV(estimator=rfc_clf,
                      param_grid=param_rfc,
                      cv=cross_validation,
                      scoring=scoring_metric,
                      verbose=1)

#running the search on the training split
cv_rfc.fit(train_data, train_target)

In [None]:
#Best hyperparameter combination found by the random forest grid search (shown as the cell output)
rfc_best_params = cv_rfc.best_params_
rfc_best_params

In [None]:
#Best score found by the random forest grid search (scored with scoring_metric)
rfc_best_score = cv_rfc.best_score_
rfc_best_score

<a id="modelcomp"> </a>
# Model comparison and selection

In [None]:
#comparing the models through the best score each grid search achieved
#labels and scores are listed in the same order
models = ['Naive Bayes', 'Decision Tree', 'KNN', 'SVM', 'Random Forest']
scores_bf = [nb_best_score, dt_best_score, KNN_best_score, svc_best_score, rfc_best_score]
df = pd.DataFrame({"models": models, "scores": scores_bf})

#bars are ordered from the highest score to the lowest
ranked_models = df.sort_values('scores', ascending=False)["models"]
sns.barplot(data=df, x="models", y="scores", order=ranked_models)
plt.show()

As we can observe from the above barplot, Random forest gives us the best accuracy. Hence, we can use this model on our test data and see how many correct predictions it makes.

<a id="test"> </a>
# Testing the model with the unseen data

In [None]:
#predicting the test data using the best random forest classifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# The forest is rebuilt from the hyperparameters selected by the grid search
# rather than from hard-coded values: the tested model then always matches the
# tuning result, and the former hard-coded max_features='auto' (removed in
# scikit-learn 1.3) can no longer make this cell fail.
best_RF = RandomForestClassifier(**rfc_best_params, random_state=999)

#training on the training split only
model = best_RF.fit(train_data, train_target)

#predicting the held-out test split
predicted = model.predict(test_data)

#confusion matrix computed from the true and the predicted test labels
print(confusion_matrix(test_target, predicted))


In [None]:
#printing the classification report for the test-set predictions
print(classification_report(test_target, predicted))

So we got an overall accuracy of 91% on the test data. During tuning, we evaluated the model with repeated 5-fold cross-validation on the training split and observed an accuracy of around 85%.
So, 
Cross-validation accuracy (training split): 85%
Test accuracy: 91%

Since the test accuracy is not lower than the cross-validation accuracy, we see no sign that the model is overfitting.

<a id="conclusion"> </a>
# Conclusion
1. Placement status is highly influenced by the scores (percentages scored) and by the type of specialisation chosen
2. We could not single out a best subset of features, as the t-tests were not significant. Hence, we keep every feature as a potential contributor to the placement decision
3. Decision tree takes time to run and fit
4. Random forest performed best with following parameters: {'criterion': 'entropy','max_depth': 7, 'max_features': 'auto','n_estimators': 200}
5. There is no overfitting observed

<a id="references"> </a>
# References

SK Part 2: Feature Selection and Ranking | www.featureranking.com SK Part 2: Feature Selection and Ranking | www.featureranking.com. (2021). Retrieved 29 May 2021, from https://www.featureranking.com/tutorials/machine-learning-tutorials/sk-part-2-feature-selection-and-ranking/

SK Part 3: Cross-Validation and Hyperparameter Tuning | www.featureranking.com SK Part 3: Cross-Validation and Hyperparameter Tuning | www.featureranking.com. (2021). Retrieved 29 May 2021, from https://www.featureranking.com/tutorials/machine-learning-tutorials/sk-part-3-cross-validation-and-hyperparameter-tuning/

GridSearchCV, S. GridSearchCV, S. (2020). SVM Hyperparameter Tuning using GridSearchCV - Velocity Business Solutions Limited. Retrieved 29 May 2021, from https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/