In this notebook, we perform a detailed exploratory data analysis (EDA) of the Student Placement dataset with visualizations, and extract insights from the data that will be helpful for fresh candidates. At the end, we build a machine learning model that predicts whether a particular candidate gets placed or not, based on their features.

# Import Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; drop it
# from the next line once nothing in the notebook calls it any more.
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud
from xgboost import XGBClassifier

# Ignore every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category = FutureWarning)

# Apply seaborn's "darkgrid" style to the matplotlib/seaborn figures.
sns.set(style="darkgrid")

# Input data files are available in the read-only "../input/" directory.
# Walk the input directory and print the full path of every file found there.
import os

for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Read the placement CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    '/kaggle/input/engineering-placements-prediction/collegePlace.csv'
)

In [1]:
# Preview the first rows of the freshly loaded dataset.
data.head()

In [1]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

# EDA

In [1]:
# Descriptive statistics with one row per described column, styled for
# scanning: bars on the mean, a green gradient on the standard deviation and a
# "cool" gradient on the median (50%).
summary_stats = data.describe().T
(
    summary_stats.style
    .bar(subset=['mean'], color='#105ff2')
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='cool')
)

In [1]:
# Number of missing values in each column.
data.isna().sum()

In [1]:
# Histogram of student ages; the dashed yellow line marks the mean age.
mean_age = data['Age'].mean()

fig = px.histogram(data, x='Age', title="<b>Average Age of Student</b>")
fig.add_vline(x=mean_age, line_width=2, line_dash="dash", line_color="yellow")
fig.show()

In [1]:
# Same age histogram, coloured by gender; the dashed yellow line is the mean
# age over all students (not per gender).
mean_age = data['Age'].mean()

fig = px.histogram(
    data,
    x='Age',
    color="Gender",
    title="<b>Average Age Gender wise</b>",
)
fig.add_vline(x=mean_age, line_width=2, line_dash="dash", line_color="yellow")
fig.show()

In [1]:
# Number of rows for each distinct value of the Gender column.
data['Gender'].value_counts()

In [1]:
# Donut chart of the gender split; each slice shows its label and percentage
# inside the ring and is outlined in black.
slice_outline = dict(color='#000000', width=1.5)

fig = px.pie(data, names="Gender", title="<b>Counts in Gender</b>", hole=0.5)
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    marker=dict(line=slice_outline),
)
fig.show()

In [1]:
# One sub-frame per gender, selected with boolean masks on the Gender column.
is_male = data['Gender'] == "Male"
is_female = data['Gender'] == "Female"

male = data[is_male]
female = data[is_female]

In [1]:
# Row counts of the per-gender frames.
total_male = len(male)
total_female = len(female)

In [1]:
# Count, per gender, the rows whose PlacedOrNot flag equals 1.
male_placed = male['PlacedOrNot'] == 1
female_placed = female['PlacedOrNot'] == 1

total_male_pass = len(male[male_placed])
total_female_pass = len(female[female_placed])

In [1]:
def _pass_percentage(passed, total):
    """Return ``passed`` as a percentage of ``total``, rounded to 2 decimals."""
    return np.round((passed * 100) / total, 2)


pass_male_percentage = _pass_percentage(total_male_pass, total_male)
pass_female_percentage = _pass_percentage(total_female_pass, total_female)

In [1]:
# Collect the gender statistics as {label: [value]} so the dict can be turned
# into a one-row DataFrame.
overview_labels = [
    "Total Male",
    "Total Female",
    "Total male pass",
    "Total female pass",
    "% of Passed Male",
    "% of Passed Female",
]
overview_values = [
    total_male,
    total_female,
    total_male_pass,
    total_female_pass,
    pass_male_percentage,
    pass_female_percentage,
]

overview = {label: [value] for label, value in zip(overview_labels, overview_values)}

In [1]:
# Show the raw gender-statistics dictionary.
overview

In [1]:
# One-row frame (row label "Detail") built from the statistics dict, shown
# transposed so that every statistic becomes its own row.
gender_wise = pd.DataFrame(data=overview, index=["Detail"])
gender_wise.transpose()

In [1]:
# Number of students per stream, coloured by the PlacedOrNot flag.
fig = px.histogram(
    data,
    x="Stream",
    color="PlacedOrNot",
    title="<b>Counts of Stream</b>",
    pattern_shape_sequence=['.'],
)
fig.show()

In [1]:
# Students whose CGPA is strictly greater than the dataset-wide mean CGPA.
mean_cgpa = data['CGPA'].mean()
has_above_avg_cgpa = data['CGPA'] > mean_cgpa

cgpa_above_avg = data[has_above_avg_cgpa]
cgpa_above_avg

In [1]:
# CGPA distribution of the above-average students, coloured by the
# PlacedOrNot flag; the wide bar gap keeps the individual CGPA bars apart.
fig = px.histogram(
    cgpa_above_avg,
    x='CGPA',
    color='PlacedOrNot',
    title="<b>Above Average CGPA Vs Placement</b>",
    template='plotly',
)
fig.update_layout(bargap=0.7)
fig.show()

In [1]:
# Students whose CGPA is strictly less than the dataset-wide mean CGPA.
mean_cgpa = data['CGPA'].mean()
has_below_avg_cgpa = data['CGPA'] < mean_cgpa

cgpa_below_avg = data[has_below_avg_cgpa]
cgpa_below_avg

In [1]:
# CGPA distribution of the below-average students; bars for the two
# PlacedOrNot values are drawn side by side (barmode='group').
fig = px.histogram(
    cgpa_below_avg,
    x='CGPA',
    color='PlacedOrNot',
    title="<b>Below Average CGPA Vs Placement</b>",
    barmode='group',
)
fig.update_layout(bargap=0.4)
fig.show()

In [1]:
# Per-stream summary: mean age, total internships, mean CGPA and the sum of
# the PlacedOrNot flag. The styled view highlights each column's maximum.
stream_aggregations = {
    'Age': 'mean',
    'Internships': 'sum',
    'CGPA': 'mean',
    'PlacedOrNot': 'sum',
}

stream_wise = data.groupby('Stream').agg(stream_aggregations)
stream_wise.style.highlight_max()

In [1]:
# Grouped bar chart of the per-stream aggregates computed above.
px.bar(
    stream_wise,
    barmode='group',
    title="<b>Stream wise Analyzing</b>",
    template="plotly",
)

In [1]:
# Students whose Internships value is exactly 0.
without_internship = data['Internships'] == 0

no_internship = data[without_internship]
no_internship

In [1]:
# Count of the PlacedOrNot values among students with zero internships.
fig = px.histogram(
    no_internship,
    x="PlacedOrNot",
    color="PlacedOrNot",
    title="<b>No Internship Experience Vs Placement</b>",
)
fig.update_layout(bargap=0.7)
fig.show()

# Preprocessing Data

In [1]:
# One-hot encode the two categorical columns: one indicator column per
# distinct Gender value and per distinct Stream value.
dummy_gender, dummy_stream = (
    pd.get_dummies(data[column]) for column in ('Gender', 'Stream')
)

In [1]:
# Replace the categorical columns with their one-hot indicator columns.
# NOTE: this rebinds `data`, so re-running the cell after it has executed
# fails because "Gender" and "Stream" are no longer present to drop.
without_categoricals = data.drop(columns=["Gender", "Stream"])

data = pd.concat([without_categoricals, dummy_gender, dummy_stream], axis=1)
data.head()

In [1]:
# Fix the column order: all feature columns first, the PlacedOrNot target
# last (later cells rely on the target being the final column).
feature_order = [
    'Age',
    'Male',
    'Female',
    'Electronics And Communication',
    'Computer Science',
    'Information Technology',
    'Mechanical',
    'Electrical',
    'Civil',
    'Internships',
    'CGPA',
    'Hostel',
    'HistoryOfBacklogs',
]

data = data[feature_order + ['PlacedOrNot']]
data.head(3)

# Scaling Features

In [1]:
# Standardise every feature column (everything except the PlacedOrNot target).
# NOTE: the scaler is fit on all rows here, i.e. before any train/test split.
feature_frame = data.drop(columns='PlacedOrNot')

scaler = StandardScaler()
scaler.fit(feature_frame)

scaled_features = scaler.transform(feature_frame)

In [1]:
# Wrap the scaled array in a DataFrame again, reusing the feature column names
# (all columns of `data` except the last one, which is the target).
feature_names = data.columns[:-1]

scaled_features = pd.DataFrame(data=scaled_features, columns=feature_names)
scaled_features.head()

# Visualize Correlation

In [1]:
# Pairwise correlation between all columns of the preprocessed dataset.
corrmat = data.corr()
top_corr_features = corrmat.index

plt.figure(figsize=(25,15))

# Plot the matrix computed above directly: selecting `top_corr_features`
# (every column of `corrmat`) and calling .corr() a second time only
# recomputed the same values.
g = sns.heatmap(corrmat, annot=True, cmap="Accent")

In [1]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features first and only then standardise them, with a
# scaler fit on the training rows alone. Fitting the scaler on the full
# dataset before splitting lets the test rows influence the mean/std used to
# transform the training data (test-set leakage).
features = data.drop('PlacedOrNot', axis=1)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(features,
                                                            data['PlacedOrNot'],
                                                            test_size = 0.25,
                                                            random_state = 0)

train_scaler = StandardScaler().fit(X_train_raw)

# Keep the original row labels and column names so the frames line up with
# y_train / y_test exactly as before.
X_train = pd.DataFrame(train_scaler.transform(X_train_raw),
                       columns = features.columns,
                       index = X_train_raw.index)
X_test = pd.DataFrame(train_scaler.transform(X_test_raw),
                      columns = features.columns,
                      index = X_test_raw.index)

In [1]:
# Report the shape of every split; the blank line separates features from
# targets in the printed output.
split_shapes = [
    ("X_train", X_train.shape, ""),
    ("X_test", X_test.shape, "\n"),
    ("y_train", y_train.shape, ""),
    ("y_test", y_test.shape, ""),
]

for split_name, split_shape, trailer in split_shapes:
    print(f"Shape of {split_name} is: {split_shape}{trailer}")

In [1]:
# Candidate classifiers, keyed by the label used in the score table.
# The tree-based models are randomised, so they are seeded (with the same
# seed as the train/test split) to make the reported scores reproducible.
models = {"DecisionTree":DecisionTreeClassifier(random_state=0),
         "RandomForest":RandomForestClassifier(random_state=0),
         "XgBoost": XGBClassifier(random_state=0),
         "KNeighborsClassifier":KNeighborsClassifier()}

In [1]:
def models_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Data passed to each estimator's ``fit``.
    X_test, y_test
        Data passed to each estimator's ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by its name) with a single ``Score``
        column, sorted by ascending score.
    """

    def _fit_then_score(estimator):
        # Fit first, then evaluate on the held-out data.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    score_by_name = {
        label: _fit_then_score(estimator) for label, estimator in models.items()
    }

    # Build a single 'Score' row, then flip it so each model becomes a row.
    score_table = pd.DataFrame(score_by_name, index=['Score']).T
    return score_table.sort_values(by='Score')

In [1]:
# Fit all candidate models and collect their test-set scores.
model_scores = models_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [1]:
# Show the score table with the highest score highlighted.
model_scores.style.highlight_max()

In [1]:
# `model_scores` keeps the algorithm names in its (unnamed) index and has a
# single "Score" column, so there is no "Algorithms" column for plotly to use.
# Name the index and turn it into a regular column before plotting; without
# this px.bar raises a ValueError for x="Algorithms".
scores_for_plot = model_scores.rename_axis("Algorithms").reset_index()

fig = px.bar(data_frame = scores_for_plot,
             x="Algorithms",
             y="Score",
             color="Algorithms", title = "<b>Models Score</b>", template = 'plotly')

fig.update_layout(bargap=0.5)

fig.show()

In [1]:
# Hyper parameter optimization
#
# Candidate values for each XGBoost hyper-parameter; the randomized search
# draws its parameter combinations from these lists.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [1]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since ``start_time``.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Value returned by an earlier ``timer()`` call. When omitted (or
        otherwise falsy) a new measurement is started.

    Returns
    -------
    datetime.datetime or None
        The current time when starting a measurement; ``None`` after the
        elapsed time has been printed.
    """
    # Any falsy argument starts a new measurement (datetime objects are
    # always truthy, so a real start time never lands here).
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    # divmod on a float returns floats; cast the whole-number parts so the
    # message reads "1 hours 2 minutes" instead of "1.0 hours 2.0 minutes".
    print(f'\nTime taken: {int(hours)} hours {int(minutes)} minutes and {round(seconds, 2)} seconds.')

In [1]:
# Base XGBoost classifier with default settings, used as the estimator for the
# hyper-parameter search.
xgb_classifier = XGBClassifier()

In [1]:
# Randomized search over `params`: 5 sampled parameter combinations, each
# evaluated with 5-fold cross-validation on ROC AUC, using all CPU cores.
# random_state pins which combinations are sampled so that re-running the
# notebook selects the same candidates (same seed as the train/test split).
random_search = RandomizedSearchCV(xgb_classifier,
                                   param_distributions=params,
                                   n_iter=5,
                                   scoring='roc_auc',
                                   n_jobs=-1,
                                   cv=5, verbose=3,
                                   random_state=0)

In [1]:
# Run the hyper-parameter search on the training split and print how long it
# took.
start_time = timer()
random_search.fit(X_train, y_train)
timer(start_time)

In [1]:
# NOTE: despite the variable name, `best_estimator_` is an estimator object
# (the model configured with the best parameters found), not a parameter dict;
# the plain parameter dict is `random_search.best_params_`.
xgb_best_params = random_search.best_estimator_
xgb_best_params

In [1]:
# Parameter combination that achieved the best cross-validated score.
random_search.best_params_

In [1]:
# Use the best estimator found by the search as the final classifier
# (same object, just a clearer name).
classifier = xgb_best_params

In [1]:
# Configure the evaluation metric on the estimator itself: passing
# `eval_metric` to fit() has been deprecated since xgboost 1.6 and is no
# longer accepted by newer releases.
classifier.set_params(eval_metric='logloss')

classifier.fit(X_train, y_train)

In [1]:
# Predict on the held-out split and report the fraction of correct labels.
pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)

print(f"Accuracy: {test_accuracy}")

In [1]:
# Confusion matrix of the tuned classifier on the test split.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; `ConfusionMatrixDisplay.from_estimator` is its replacement and takes
# the same arguments.
ConfusionMatrixDisplay.from_estimator(classifier,
                                      X_test, y_test,
                                      cmap=plt.cm.Blues,
                                      display_labels = ['Not Placed', 'Placed'])
# The seaborn "darkgrid" style would draw grid lines across the matrix cells.
plt.grid(False)
plt.show()

*I hope you like this work. Don't forget to upvote!*