In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,plot_confusion_matrix,recall_score,precision_score
import seaborn as sns
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [None]:
# Read the campus-placement CSV into a DataFrame and preview its first rows.
placement_csv = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
data = pd.read_csv(placement_csv)
data.head(5)

In [None]:
# Summary statistics for the columns of the loaded frame.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# How many rows fall under each placement status.
data['status'].value_counts()

In [None]:
# Replace missing salaries with 0, then re-check the per-column null counts.
data['salary'] = data['salary'].fillna(0)
data.isna().sum()

**Correlation Heatmap**

In [None]:
# Correlate only the numeric columns: the frame also holds string columns
# (gender, status, ...), which DataFrame.corr() rejects on recent pandas.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

**Salary distribution**

The salary distribution looks bimodal: one mode comes from the salaries of the placed candidates, and the other from the non-placed candidates, whose missing salary was filled with zero.

In [None]:
# Histogram + density estimate of the salary column.
# NOTE(review): distplot is deprecated in newer seaborn releases; switch to
# histplot/displot when the environment is upgraded.
sns.distplot(data['salary'])

**Gender Gap**

In [None]:
# Pass the column by keyword: a positional Series is deprecated in seaborn and
# is interpreted as `data=` on newer releases.
sns.countplot(x='gender', data=data)

**Stream Selection by Candidates**

In [None]:
# Number of candidates per higher-secondary stream (keyword form works across
# seaborn versions, unlike a positional Series).
sns.countplot(x='hsc_s', data=data)

**Specialization taken by Science Students**

In [None]:
# Keep only the candidates whose stream is Science, then count their degree types.
only_sci = data[data.hsc_s == "Science"]
# Keyword arguments instead of a positional Series, which newer seaborn rejects.
sns.countplot(x='degree_t', data=only_sci)

In [None]:
# Specialisation counts for Science-stream candidates whose degree type is "Others".
# Keyword arguments keep this working on seaborn versions that no longer accept
# a positional Series.
sns.countplot(x='specialisation', data=only_sci[only_sci.degree_t == "Others"])

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by specialisation.
sns.pairplot(data=data, hue='specialisation')

In [None]:
# Same pairwise grid, this time coloured by gender.
sns.pairplot(data, hue='gender')

Detailed count of all students separated based on placement status

In [None]:
# Per-status totals of the numeric columns only; without numeric_only the string
# columns would be summed (concatenated) as well on recent pandas.
data.groupby('status').sum(numeric_only=True)

Let us now see which specialization gets more average salary

In [None]:
# Select the salary column before aggregating: averaging the whole frame first
# is wasted work and fails on the string columns in recent pandas.
data.groupby('specialisation')['salary'].mean().sort_values(ascending=False)

We can clearly see that average salary in Finance is more than HR.


We will now see which stream in hsc will get you a better chance of placement as well as salary

In [None]:
# Cross-tabulate stream against placement status. Counting `status` per stream
# only yields the number of students in each stream, regardless of whether
# they were placed, so it cannot answer who gets placed more.
pd.crosstab(data['hsc_s'], data['status'])

Commerce students get placed more and next comes Science followed by Arts

In [None]:
# Mean salary per stream; pick the column first so string columns are never averaged.
data.groupby('hsc_s')['salary'].mean().sort_values(ascending=False)

But Science students earn more salary than anyone else according to this data.

Relationship between degree percentage and salary

In [None]:
# x/y must be keywords: newer seaborn no longer accepts them positionally.
sns.regplot(x='degree_p', y='salary', data=data)

As degree percentage increases, salary generally increases.
Next, we will see whether there is a similar relationship with the secondary and higher-secondary school percentages.

In [None]:
# Salary against secondary-school percentage (keyword x/y for seaborn compatibility).
sns.regplot(x='ssc_p', y='salary', data=data)

Secondary school percentage is also related to salary, but can that be attributed to better stream selection after secondary school?

In [None]:
# Salary against higher-secondary percentage (keyword x/y for seaborn compatibility).
sns.regplot(x='hsc_p', y='salary', data=data)

**Is Central better or Others ???**

In [None]:
# Board vs. placement status. A plain count of `status` per board only gives
# the number of students on each board, not how many of them were placed.
pd.crosstab(data['ssc_b'], data['status'])

Central students get placed more.

In [None]:
# Mean salary per board; select the column before aggregating so the string
# columns are never averaged.
data.groupby('ssc_b')['salary'].mean().sort_values(ascending=False)

But Others earn more on average

**Does work Experience Matter ???**

In [None]:
# Work experience vs. placement status. Counting `status` per group would only
# report group sizes, which says nothing about who actually got placed.
pd.crosstab(data['workex'], data['status'])

People with work experience get placed less and vice versa .

In [None]:
# Mean salary with and without work experience; select the column first so the
# string columns are never averaged.
data.groupby('workex')['salary'].mean().sort_values(ascending=False)

But candidates with work experience earn more :D.

**Box plot to get the Outliers**

In [None]:
# One box per column of the frame, all drawn on a shared axis.
sns.boxplot(data=data)

In [None]:
# Remove the row identifier and the salary column from the modelling frame.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
data = data.drop(columns=['sl_no', 'salary'], errors='ignore')
# Features are every remaining column except the target, excluded by name
# rather than by relying on `status` being the last column.
x = data.drop(columns=['status'])
x.head()

In [None]:
# Target as its own single-column frame. .copy() detaches it from `data`, so
# modifying it later does not trigger a SettingWithCopyWarning.
y = data[['status']].copy()
y.head()

In [None]:
# Encode the status labels as integers. assign() builds a new frame instead of
# writing into what may be a slice of `data`.
y = y.assign(status=LabelEncoder().fit_transform(y.status))
y.head()

In [None]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class ratio in both splits; the fixed seed makes the scores reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42
)


In [None]:
# (rows, columns) of the held-out feature frame.
x_test.shape

In [None]:
# Preview the training features.
x_train.head()

In [None]:
# Columns 0, 5, 7 and 10 of `x` are one-hot encoded, columns 2, 4 and 8 are
# ordinal-encoded, and every other column is passed through unchanged.
# handle_unknown='ignore' lets the one-hot step encode a category that was not
# present in the training split as all zeros instead of raising at predict time.
trans_nom = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), [0, 5, 7, 10]),
    (OrdinalEncoder(), [2, 4, 8]),
    remainder="passthrough",
)

In [None]:
# Sanity-check the encoder output; show only the first rows rather than
# dumping the whole transformed array into the notebook.
trans_nom.fit_transform(x_train)[:5]

In [None]:
# Baseline classifier: logistic regression using the liblinear solver.
model = LogisticRegression(solver='liblinear')

In [None]:
# Render estimators as HTML diagrams when they are displayed.
set_config(display='diagram')
# Chain the column encoder and the classifier. Note: `make_pipeline` here is
# the imblearn version, because that import comes after the sklearn one.
pipe = make_pipeline(trans_nom, model)
pipe

In [None]:
# Distinct values present in the gender column.
data['gender'].unique()

In [None]:
# y_train is a one-column frame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector target.
pipe.fit(x_train, np.ravel(y_train))

In [None]:
# (rows, columns) of the training feature frame.
x_train.shape

In [None]:
# Accuracy (in percent) of the logistic-regression pipeline on the held-out rows.
pred = pipe.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); keep that order so the call stays
# correct if it is later swapped for an asymmetric metric such as recall.
accuracy_score(y_test, pred) * 100

In [None]:
# A single hand-written candidate, in the same column order as `x`.
# Building the frame from a nested list keeps the numeric columns numeric; the
# previous object-array round-trip turned every column into object dtype.
new_data = pd.DataFrame(
    [['M', 67.00, 'Central', 71.0, 'Central', 'Science', 64.33, 'Others', 'Yes', 64.00, 'Mkt&HR', 61.26]],
    columns=x.columns,
)

pipe.predict(new_data)

SVC Model

In [None]:
# Support-vector classifier behind the same column encoder.
model_svc = SVC()
pipe_svc = make_pipeline(trans_nom, model_svc)
# Fit and score the SVC pipeline itself; previously this cell re-fitted `pipe`
# (logistic regression), so the reported number was not the SVC accuracy.
pipe_svc.fit(x_train, np.ravel(y_train))
pred = pipe_svc.predict(x_test)
accuracy_score(y_test, pred) * 100

KNN Model

In [None]:
# 3-nearest-neighbours classifier behind the same column encoder.
model_knn = KNeighborsClassifier(n_neighbors = 3)
pipe_knn = make_pipeline(trans_nom, model_knn)
# Fit and score the KNN pipeline itself; previously this cell re-fitted `pipe`
# (logistic regression), so the reported number was not the KNN accuracy.
pipe_knn.fit(x_train, np.ravel(y_train))
pred = pipe_knn.predict(x_test)
accuracy_score(y_test, pred) * 100

**Confusion Matrix**

Logistic Regression Confusion Matrix

In [None]:
# Confusion matrix of `pipe` on the held-out rows.
plot_confusion_matrix(pipe, x_test, y_test)

**Does the Accuracy Really Give a Good Picture ???**

We got around 92% accuracy. However, that does not paint a great picture of the overall performance on this data, since the number of placed students is far larger than the number of candidates who were not placed, as we can see below.

In [None]:
# Row count per placement status, to show the class imbalance.
data['status'].value_counts()

In [None]:
# Bar chart of the class counts; the column is passed by keyword because newer
# seaborn no longer accepts a positional Series.
sns.countplot(x='status', data=data)

This is a classic case of Imbalanced dataset.
Now next we will move on to balance it.

Installing Imblearn 

In [None]:
# !pip install imblearn

In [None]:
# Class counts of the encoded target before any resampling.
y['status'].value_counts()

OVER-SAMPLING

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows until both classes are the same size.
# The fixed seed makes the resampled set identical on every run.
oversamp = RandomOverSampler(random_state=42)

x_over,y_over = oversamp.fit_resample(x,y)

y_over.status.value_counts()

UNDER-SAMPLING

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until both classes are the same size.
# The fixed seed makes the resampled set identical on every run.
undersamp = RandomUnderSampler(random_state=42)

x_under, y_under = undersamp.fit_resample(x,y)

y_under.status.value_counts()

Logistic Regression Model after balancing dataset

In [None]:
set_config(display='diagram')

# Fixed seed so the synthetic minority samples are the same on every run.
smote = SMOTE(random_state=42)

# Encoder -> SMOTE -> logistic regression.
# NOTE: `model` is the same estimator object used inside `pipe`, so fitting
# this pipeline also refits the classifier that `pipe` holds.
pipe_new_lr = make_pipeline(trans_nom, smote, model)

# Flatten the one-column target frame to 1-D before fitting.
pipe_new_lr.fit(x_train, np.ravel(y_train))
pred = pipe_new_lr.predict(x_test)
# Metrics take (y_true, y_pred).
print("Accuracy score for Logistic Regression : ", accuracy_score(y_test, pred)*100)

pipe_new_lr

SVC model 

In [None]:
# Fixed seed so the synthetic minority samples are the same on every run.
smote = SMOTE(random_state=42)

# Encoder -> SMOTE -> support-vector classifier.
pipe_new_svc = make_pipeline(trans_nom, smote, model_svc)

# Flatten the one-column target frame to 1-D before fitting.
pipe_new_svc.fit(x_train, np.ravel(y_train))
pred = pipe_new_svc.predict(x_test)
# Metrics take (y_true, y_pred).
accuracy_score(y_test, pred)*100

KNN model

In [None]:
# Fixed seed so the synthetic minority samples are the same on every run.
smote = SMOTE(random_state=42)

# Encoder -> SMOTE -> k-nearest-neighbours classifier.
pipe_new_knn = make_pipeline(trans_nom, smote, model_knn)

# Flatten the one-column target frame to 1-D before fitting.
pipe_new_knn.fit(x_train, np.ravel(y_train))
pred = pipe_new_knn.predict(x_test)
# Metrics take (y_true, y_pred).
accuracy_score(y_test, pred)*100

Confusion matrix for best model (Logistic Regression)

In [None]:
# Confusion matrix of the SMOTE + logistic-regression pipeline on the held-out
# rows, drawn with the Blues colour map.
plot_confusion_matrix(pipe_new_lr, x_test, y_test,cmap=plt.cm.Blues)