In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

In [None]:
# Load the campus-placement dataset and preview its first rows.
DATA_PATH = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

The salary column has 67 null values. They are filled with the mean salary.

In [None]:
# Fill missing salaries with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['salary']: that chained in-place form works
# on an intermediate Series, raises a FutureWarning on pandas 2.x and leaves
# df unchanged once copy-on-write is enabled.
df['salary'] = df['salary'].fillna(df['salary'].mean())

**Splitting the data into two sets, categorical and numerical, for EDA.**

In [None]:
# Partition the columns by dtype for the EDA below:
# float64 columns are treated as numerical, object columns as categorical.
numerical_feat = df.select_dtypes(include=['float64'])
categorical_feat = df.select_dtypes(include=['object'])

# Numerical Feature Analysis

In [None]:
# Bubble chart of SSC vs HSC percentage; colour encodes the degree percentage,
# marker size the employability-test percentage, and the MBA percentage is
# shown on hover.
fig = px.scatter(
    numerical_feat,
    x='ssc_p',
    y='hsc_p',
    color='degree_p',
    size='etest_p',
    hover_data=['mba_p'],
)
fig.show()

In [None]:
# One scatter per percentage column, coloured by placement status, each with
# an OLS trendline. Only x is passed, so Plotly Express chooses the y axis on
# its own — NOTE(review): confirm this is the intended chart.
status_plots = [
    ('ssc_p', 'SSC Percentage VS Status'),
    ('hsc_p', 'HSC Percentage VS Status'),
    ('degree_p', 'Degree Percentage VS Status'),
    ('etest_p', 'Employability test Percentage VS Status'),
    ('mba_p', 'MBA Percentage VS Status'),
]

for column, title in status_plots:
    fig = px.scatter(
        numerical_feat,
        x=column,
        color=df['status'],
        height=500,
        width=600,
        title=title,
        trendline="ols",
    )
    fig.show()

In [None]:
# Frequency of each distinct salary value (x: salary, y: number of rows).
salary_counts = numerical_feat['salary'].value_counts()
fig = px.scatter(x=salary_counts.index, y=salary_counts)
fig.show()

# Univariate Analysis Of Categorical Features

In [None]:
# Pie chart of the value distribution for every categorical column.
colors = ['mediumturquoise', 'lightgreen', 'seagreen', 'palegreen', 'olive']

# Styling shared by all pies; wedge labels are suppressed (labels=None)
# because the category names are shown in the legend instead.
pie_style = dict(
    wedgeprops={"edgecolor": "k", 'linewidth': 2},
    textprops={'color': 'k'},
    pctdistance=0.7,
    autopct='%.2f%%',
    figsize=(5, 5),
    labels=None,
    subplots=True,
    colors=colors,
)

for col in categorical_feat:
    counts = categorical_feat[col].value_counts()
    plt.figure()
    counts.plot.pie(**pie_style)
    plt.title('{} Distribution'.format(col), fontsize=17, ha='right')
    plt.legend(labels=counts.index, loc='best', bbox_to_anchor=(1, 0.25, 0.5, 0.5))
    plt.show()

In [None]:
# Remove the target from the categorical features so the bivariate plots below
# compare every remaining feature against status.
# `columns=` is used because passing the axis positionally (drop('status', 1))
# raises a TypeError on pandas >= 2.0.
categorical_feat = categorical_feat.drop(columns='status')

# Bivariate Analysis Of Categorical Features

In [None]:
# Count plot per categorical feature, with bars split by placement status.
status_palette = ['salmon', 'lightblue']

for col in categorical_feat.columns:
    sns.countplot(x=categorical_feat[col], hue=df['status'], palette=status_palette)
    plt.show()

# Label Encoding

In [None]:
# Label-encode the categorical columns, since many algorithms cannot handle
# non-numeric values directly.

from sklearn.preprocessing import LabelEncoder

# A single encoder is refit for each column in turn, so after this cell `le`
# holds the mapping of the last column only (status). LabelEncoder numbers the
# classes in sorted order.
le = LabelEncoder()

gender, ssc_b, hsc_b, degree_t, workex, specialisation, status = (
    le.fit_transform(df[name])
    for name in ('gender', 'ssc_b', 'hsc_b', 'degree_t', 'workex', 'specialisation', 'status')
)

In [None]:
# Dropping columns: the row id (sl_no) and the raw string columns, whose
# label-encoded versions are added back in a later cell.
# NOTE(review): hsc_s is dropped here but is never label-encoded above —
# confirm that discarding this feature is intentional.
# `columns=` replaces the positional axis argument (drop([...], 1, ...)),
# which raises a TypeError on pandas >= 2.0.
df = df.drop(columns=['sl_no', 'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t',
                      'workex', 'specialisation', 'status'])

# Feature Scaling
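Standard scaling (`StandardScaler`) is the intended method, but the scaling line in the next cell is currently commented out, so the features are used unscaled.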

In [None]:
from sklearn.preprocessing import StandardScaler
#df[['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']] = StandardScaler().fit_transform(df[['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']])

In [None]:
# Append the label-encoded columns to the dataframe.
# Keyword order fixes the column order; status is added last.
df = df.assign(
    gender=gender,
    ssc_b=ssc_b,
    hsc_b=hsc_b,
    degree_t=degree_t,
    workex=workex,
    specialisation=specialisation,
    status=status,
)
df.head()

# Correlation with Heatmap

In [None]:
# Correlation of every column with the encoded status, sorted ascending.
status_corr = df.corr()[["status"]].sort_values("status")

fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle('Correlation between Status and features', fontsize=20)
ax = sns.heatmap(status_corr, vmax=1, vmin=-1, cmap="YlGnBu", annot=True, ax=ax)
# Flip the y axis so the rows read in the opposite order to the sort.
ax.invert_yaxis()

In [None]:
from sklearn.model_selection import train_test_split

# Features and target are selected by name instead of by position, so the
# split no longer depends on status being the last column.
# salary is left out of the features: its missing values were mean-imputed
# earlier in this notebook, and salary is presumably only known once a student
# has been placed, so keeping it would leak the target into the model —
# NOTE(review): confirm against the dataset description.
x = df.drop(columns=['status', 'salary'])
y = df['status']

# 80/20 split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the feature matrix and the target vector, one per line.
print(x.shape, y.shape, sep='\n')

In [None]:
# Maps model name -> test-set accuracy; filled in by the model cells below.
accuracies = {}

# Classification Models

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with default settings on the training split.
reg = LogisticRegression()
reg.fit(x_train, y_train)

# Predict on the held-out split and record the accuracy.
y_pred = reg.predict(x_test)
log_reg_accuracy = accuracy_score(y_test, y_pred)

accuracies['Logistic Regression'] = log_reg_accuracy
print('Accuracy is: ' + str(log_reg_accuracy))

**Confusion Matrix Of Logistic Regression**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the logistic regression model.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    reg,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.PuRd,
    normalize='true',
)
plt.title('Confusion Matrix Of Campus Placement')
plt.show()

# Kernel SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit an RBF-kernel support vector classifier on the training split.
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(x_train, y_train)

# Predict on the held-out split and record the accuracy.
y_pred = classifier.predict(x_test)
svm_accuracy = accuracy_score(y_test, y_pred)

accuracies['Kernel SVM'] = svm_accuracy
# Message spelling fixed ("Accuray" -> "Accuracy") to match the other model cells.
print('Accuracy is: ' + str(svm_accuracy))

**Confusion Matrix Of Kernel SVM**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the kernel SVM.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    classifier,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.Blues,
    normalize='true',
)
plt.title('Confusion Matrix of Campus Placement')
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit a Gaussian naive Bayes classifier on the training split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

# Predict on the held-out split and record the accuracy.
y_pred = gnb.predict(x_test)
nb_accuracy = accuracy_score(y_test, y_pred)

accuracies['Naive Bayes'] = nb_accuracy
# Message spelling fixed ("Accuray" -> "Accuracy") to match the other model cells.
print('Accuracy is: ' + str(nb_accuracy))

**Confusion Matrix Of Naive Bayes**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the naive Bayes model.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    gnb,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.Purples,
    normalize='true',
)
plt.title('Confusion Matrix of Campus Placement')
plt.show()

# KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

# Predict on the held-out split and record the accuracy.
y_pred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, y_pred)

accuracies['KNeighbors Classifier'] = knn_accuracy
print('Accuracy is: ' + str(knn_accuracy))

**Confusion Matrix Of KNeighbors Classifier**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the k-nearest-neighbours model.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    knn,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.pink,
    normalize='true',
)
plt.title('Confusion Matrix of Campus Placement')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit an entropy-criterion decision tree on the training split.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(x_train, y_train)

# Predict with the decision tree itself. The previous code called
# classifier.predict, i.e. the kernel SVM, so the score stored under
# 'Decision Tree Classification' was really the SVM's accuracy.
y_pred = dtc.predict(x_test)
dtc_accuracy = accuracy_score(y_test, y_pred)

accuracies['Decision Tree Classification'] = dtc_accuracy
print('Accuracy is: ' + str(dtc_accuracy))

**Confusion Matrix Of Decision Tree Classifier**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the decision tree.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    dtc,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.bone,
    normalize='true',
)
plt.title('Confusion Matrix of Campus Placement')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a random forest with two trees on the training split.
rfc = RandomForestClassifier(n_estimators=2, random_state=0)
rfc.fit(x_train, y_train)

# Predict with the random forest itself. The previous code called
# classifier.predict, i.e. the kernel SVM, so the stored score was really the
# SVM's accuracy.
y_pred = rfc.predict(x_test)
rfc_accuracy = accuracy_score(y_test, y_pred)

# Key corrected from 'Random Tree Classification' so the results table names
# the model that was actually trained.
accuracies['Random Forest Classification'] = rfc_accuracy
print('Accuracy is: ' + str(rfc_accuracy))

**Confusion Matrix Of Random Forest Classifier**

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Row-normalised confusion matrix for the random forest.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
# status was label-encoded and LabelEncoder numbers classes in sorted order,
# so class 0 is 'Not Placed' and class 1 is 'Placed'; display_labels must be
# given in that order.
ConfusionMatrixDisplay.from_estimator(
    rfc,
    x_test,
    y_test,
    display_labels=['Not Placed', 'Placed'],
    cmap=plt.cm.copper,
    normalize='true',
)
plt.title('Confusion Matrix of Campus Placement')
plt.show()

In [None]:
# Tabulate the collected scores: one row per model, in insertion order.
score_rows = [(model, score) for model, score in accuracies.items()]
accuracy_df = pd.DataFrame(score_rows, columns=['Model Name', 'Accuracy Score'])
accuracy_df

In [None]:
# Horizontal bar chart comparing the accuracy of every model.
sns.set_color_codes('pastel')
f, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=accuracy_df, x='Accuracy Score', y='Model Name', color='pink', ax=ax)
plt.show()