In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the placement dataset, using the first CSV column as the row index.
placement_csv = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(placement_csv, index_col=0)
df.head()

## Basic EDA

In [None]:
# Print pandas' concise summary of the loaded frame.
df.info()

In [None]:
# Show pandas' descriptive statistics for the loaded frame.
df.describe()

In [None]:
# Draw pandas' default histograms for the frame, then render the figure.
df.hist()
plt.show()

## Q. What is the average salary, and what are the chances of getting a higher salary?

In [None]:
# Median of the salary column (used as the "average" salary in the discussion).
df['salary'].median()

In [None]:
# Salary distribution as a histogram with a KDE curve overlaid.
sns.displot(data=df, x='salary', kde=True)
plt.show()

In [None]:
# Bin edges (in currency units) and the matching human-readable labels.
ranges = [0, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000, 800_000, np.inf]
labels = [
    '0-200k', '200k-300k', '300k-400k', '400k-500k',
    '500k-600k', '600k-700k', '700k-800k', '800k+',
]

# Assign every salary to its bin.
salary_group = pd.cut(df['salary'], bins=ranges, labels=labels)
salary_group.head()

In [None]:
# Count of students per salary bin.
# The Series is passed explicitly as `x`: seaborn takes plot variables by keyword,
# so a bare positional Series is not reliably interpreted as the x variable.
sns.countplot(x=salary_group)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Empirical CDF of salary. With seaborn's default ECDF statistic the y-axis is a
# proportion between 0 and 1, so it is labelled as such rather than as a percent.
sns.displot(x='salary', data=df, kind='ecdf')
plt.grid(True)
plt.xticks(rotation=30)
plt.ylabel('Proportion')
plt.show()

#### Ans: 
* The average (median) salary lies between 200k and 300k, and the ECDF shows that 80 percent of the salaries lie below 300k.
* So the chance of getting a salary higher than 300k is only 20 percent, and the chance of getting more than 400k is less than 10 percent.

## Q. Do percentages have any effect on placements?

In [None]:
# Mean of the four academic percentages, rounded to two decimals.
percent_sum = df['ssc_p'] + df['hsc_p'] + df['degree_p'] + df['mba_p']
df['total_p'] = (percent_sum / 4).round(2)
df['total_p'].head()

In [None]:
# Split the students by placement outcome.
placed = df.loc[df['status'] == 'Placed']
not_placed = df.loc[df['status'] == 'Not Placed']

In [None]:
# ECDF of SSC percentage, one curve per placement status.
sns.displot(data=df, x='ssc_p', hue='status', kind='ecdf')
plt.xlabel('SSC Percentage')
plt.grid(True)
plt.show()

# Median SSC percentage of each group.
ssc_median_placed = placed['ssc_p'].median()
ssc_median_not_placed = not_placed['ssc_p'].median()
print('Median SSC marks of student who got placed is', ssc_median_placed)
print("Median SSC marks of student who didn't got placed is", ssc_median_not_placed)

In [None]:
# ECDF of HSC percentage, one curve per placement status.
sns.displot(x='hsc_p', data=df, hue='status', kind='ecdf')
plt.xlabel('HSC Percentage')
plt.grid(True)
plt.show()
# Report the median, as the messages state (the mean was being printed here).
print('Median HSC marks of student who got placed is',placed['hsc_p'].median())
print("Median HSC marks of student who didn't got placed is",not_placed['hsc_p'].median())

In [None]:
# ECDF of degree percentage, one curve per placement status.
sns.displot(x='degree_p', data=df, kind='ecdf', hue='status')
plt.xlabel('Degree Percentage')
plt.grid(True)
plt.show()
# Printed messages spell "Median" correctly.
print('Median degree marks of student who got placed is',placed['degree_p'].median())
print("Median degree marks of student who didn't got placed is",not_placed['degree_p'].median())

In [None]:
# ECDF of MBA percentage, one curve per placement status.
sns.displot(x='mba_p', data=df, kind='ecdf', hue='status')
# The plotted column is mba_p, so the axis is labelled accordingly.
plt.xlabel('MBA Percentage')
plt.grid(True)
plt.show()
# Round the median to two decimals; the precision belongs inside round(),
# otherwise the value is rounded to an integer and a stray "2" is printed.
print('Median MBA marks of student who got placed is',round(placed['mba_p'].median(), 2))
print("Median MBA marks of student who didn't got placed is",not_placed['mba_p'].median())

##### Ok, here we can see something different. MBA marks don't matter that much for placement, which is a good sign, but to get an MBA from a good college you need to have scored well before.

In [None]:
# ECDF of the combined percentage, one curve per placement status.
sns.displot(x='total_p', data=df, kind='ecdf', hue='status')
plt.xlabel('Total Percentage')
plt.grid(True)
plt.show()
# Printed messages spell "Median" correctly.
print('Median of total marks of student who got placed is',placed['total_p'].median())
print("Median of total marks of student who didn't got placed is",not_placed['total_p'].median())

#### Ans: 
* Yes, percentages do affect placements. The median percentage of students who got placed is slightly higher than that of those who didn't get placed.
* In the total-percentage figure we can see that 80% of the students who didn't get placed have a percentage below 65, while only around 30% of the students who got placed have a percentage below 65.
* This means that about 70% of the placed students have a percentage above 65.

## Q. Which degree has the highest chances of placement, and does the type of degree even matter?

In [None]:
# Placed vs. not-placed counts for each degree type.
sns.countplot(data=df, x='degree_t', hue='status')
plt.xlabel('Degree Type')
plt.show()

##### Ok, so Commerce and Management has the highest number of placements, but that is because it has more students. We want to find which stream has the highest chance of placement.

In [None]:
# Percentage of students placed within each degree type.
placed_per_degree = placed['degree_t'].value_counts()
total_per_degree = df['degree_t'].value_counts()
print((placed_per_degree / total_per_degree) * 100)

In [None]:
# Placement rate per degree type (% of students of each type who were placed).
# Drawn as bars because degree types are unordered categories; a line joining
# them would suggest a trend that does not exist.
((placed.degree_t.value_counts() / df.degree_t.value_counts()) * 100).plot(kind='bar', rot=0)
plt.ylabel('Placement Percentage')
plt.xlabel('Degree Type')
plt.show()

#### Ans: 
* Comm&Mgmt and Sci&Tech have the same chances of placement; the only difference is the number of students.
* Other degrees have lower chances of placement. So, the degree does matter.

## Q. Which degree type gets the highest packages?

In [None]:
# Salary density per degree type. The x-axis carries salary, so it is labelled
# 'Salary' (degree type is shown by the hue legend).
sns.displot(x='salary', hue='degree_t', kind='kde', data=df)
plt.xlabel('Salary')
plt.show()

In [None]:
# Salary ECDF per degree type. The x-axis carries salary, so it is labelled
# 'Salary' (degree type is shown by the hue legend).
sns.displot(x='salary', hue='degree_t', kind='ecdf', data=df)
plt.xlabel('Salary')
plt.grid(True)
plt.xticks(rotation=30)
plt.show()

#### Ans: 
* Sci&Tech has a higher chance of getting a package of more than 400k, i.e. 20%.
* Comm&Mgmt has less than a 10% chance of getting more than 400k, but the placement rate of Comm&Mgmt is high.

## Q. Does specialization have any impact on placement and salary?

In [None]:
# Placed vs. not-placed counts for each MBA specialisation.
sns.countplot(data=df, x='specialisation', hue='status')
plt.xlabel('Specialisation')
plt.show()

In [None]:
# Percentage of students placed within each specialisation.
placed_per_spec = placed['specialisation'].value_counts()
total_per_spec = df['specialisation'].value_counts()
print((placed_per_spec / total_per_spec) * 100)

##### Marketing and Finance has a large number of students, but its placement rate is still around 25% higher than that of Marketing and HR. Let's look at the difference in salary.

In [None]:
# Salary density of placed students, split by specialisation.
sns.displot(data=placed, x='salary', hue='specialisation', kind='kde')
plt.xlabel('Salary')
plt.show()

In [None]:
# Salary ECDF of placed students, split by specialisation.
sns.displot(data=placed, x='salary', hue='specialisation', kind='ecdf')
plt.xlabel('Salary')
plt.grid(True)
plt.xticks(rotation=35)
plt.show()

#### Ans: 
* Mkt&Fin is a clear winner here: its student strength is high, its placement rate is 25% higher than Mkt&HR's, and even its salary range is higher.
* In Mkt&HR, very few students got more than 300k.

## Q. Does work experience matter for placement and salary?

In [None]:
# Student counts per placement status, split by work experience.
sns.countplot(data=df, x='status', hue='workex')
plt.show()

##### Ok, so students with no work experience have a higher chance of not getting placed. But a student might not get placed even with work experience, and a student can get placed even without work experience, so let's dig deeper into it.

In [None]:
# Work-experience breakdown among placed students.
placed['workex'].value_counts()

In [None]:
# Work-experience breakdown among all students.
df['workex'].value_counts()

In [None]:
# Percentage of students placed, with and without work experience (2 decimals).
placed_per_workex = placed['workex'].value_counts()
total_per_workex = df['workex'].value_counts()
print(((placed_per_workex / total_per_workex) * 100).round(2))

##### Companies prefer students with work experience: the chance of getting placed is 86% if a student has work experience. Companies might take students without any work experience, but the chance of that is 60%. So the chance of getting placed increases by 26 percentage points if a student has work experience.

In [None]:
# Salary density split by work experience.
sns.displot(data=df, x='salary', hue='workex', kind='kde')
plt.show()

In [None]:
# Salary ECDF split by work experience.
sns.displot(data=df, x='salary', hue='workex', kind='ecdf')
plt.xlabel('Salaries')
plt.xticks(rotation=30)
plt.grid(True)
plt.show()

##### As you can see, all the high packages are offered to students who had some work experience, and very few students without work experience got more than 300k.

In [None]:
# Ten highest salaries among placed students without work experience.
no_workex_salaries = placed.loc[placed['workex'] == 'No', 'salary']
print(no_workex_salaries.sort_values(ascending=False).iloc[:10])

#### Ans: 
* Clearly, experience matters both for getting placed and for getting a high package.
* A company can hire you without work experience, but the chance of that is 60% and you won't get a high package: only a few students got a package of more than 400k, and the rest are below 300k.
* A student's chance of getting placed increases by about 25% if they have work experience, and all the high packages are offered to students with work experience.

## Pre-Processing

In [None]:
# Reload the raw data and build a separate frame for the classification models.
df = pd.read_csv('/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv', index_col=0)

# Work on a copy: plain assignment would only alias `df`, so every later
# in-place change to class_df would silently modify `df` as well.
class_df = df.copy()

# Percentage columns that get binned into grade letters further down.
percentages = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']

In [None]:
# One distribution plot per percentage column.
for score_col in percentages:
    sns.displot(class_df[score_col])
    plt.show()

In [None]:
# Grade boundaries (in percent) and the letter assigned to each interval.
ranges = [0, 35, 50, 60, 70, 80, 90, np.inf]
labels = ['F', 'E', 'D', 'C', 'B', 'A', 'A+']

# Replace each percentage column with its grade letter, stored as plain objects.
for score_col in percentages:
    grades = pd.cut(class_df[score_col], bins=ranges, labels=labels)
    class_df[score_col] = grades.astype('object')

class_df.head()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Remove the salary column before building the classification features.
class_df = class_df.drop(columns='salary')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

le = LabelEncoder()

# Integer-encode every object-typed column (the target `status` included).
object_columns = [name for name in class_df.columns if class_df[name].dtype == 'object']
for name in object_columns:
    class_df[name] = le.fit_transform(class_df[name])

# Features are everything except the target column.
X = class_df.drop(columns='status')
y = class_df['status']

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from imblearn.over_sampling import SMOTE

# Class balance of the training split before oversampling.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Oversample the training split only; the test split stays untouched.
# y_train is passed as-is: Series.ravel() is deprecated in recent pandas and
# fit_resample accepts the Series directly.
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class balance after oversampling.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

## Decision Tree Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fixed seed so the tuned tree (and everything built on it) is reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid searched exhaustively with 5-fold cross-validation.
tree_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    'min_samples_leaf': [4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
}

grid = GridSearchCV(dt, param_grid=tree_param, cv=5)

grid.fit(X_train_res, y_train_res)

In [None]:
# Best tree found by the grid search.
dt = grid.best_estimator_

# Scale the features, then refit the tuned tree on the resampled training data.
pipeline = make_pipeline(StandardScaler(), dt)

pipeline.fit(X_train_res, y_train_res)

y_pred = pipeline.predict(X_test)

# The metrics take (y_true, y_pred); swapping them exchanges precision and
# recall in the classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Bag 50 copies of the tuned tree. The estimator is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases; the positional form works with both.
bc = BaggingClassifier(dt, n_estimators=50, random_state=1, oob_score=True)

bc.fit(X_train_res, y_train_res)

# Out-of-bag estimate requested via oob_score=True.
print('OOB score:', bc.oob_score_)

# Predict test set labels
y_pred = bc.predict(X_test)

# The metrics take (y_true, y_pred).
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seeds on both the forest and the randomized search so the selected
# model is the same on every run.
rf = RandomForestClassifier(random_state=42)

tree_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
    'min_samples_leaf': [4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150],
}

# Randomized search samples parameter combinations instead of trying them all.
rand = RandomizedSearchCV(rf, tree_param, cv=5, random_state=42)
rand.fit(X_train_res, y_train_res)

In [None]:
# Best forest found by the randomized search.
rf = rand.best_estimator_

# Scale the features, then refit the tuned forest on the resampled training data.
pipeline = make_pipeline(StandardScaler(), rf)

pipeline.fit(X_train_res, y_train_res)

y_pred = pipeline.predict(X_test)

# The metrics take (y_true, y_pred); swapping them exchanges precision and
# recall in the classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Define the classifiers
classifiers = [LogisticRegression(), LinearSVC(),
               SVC(), KNeighborsClassifier()]

# Fit each classifier on the resampled training data and report its test
# accuracy next to the model name, so the scores can be told apart.
for c in classifiers:
    c.fit(X_train_res, y_train_res)
    print(type(c).__name__, c.score(X_test, y_test))

In [None]:
# Train and validation scores for each C, initialized as empty lists.
# Note: .score() returns accuracy, so despite their names these lists hold
# accuracies, not error rates.
train_errs = list()
valid_errs = list()
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Loop over values of C_value
for C_value in C_values:
    # Fit on the resampled training set, i.e. the same data the train score
    # below is computed on.
    lr = LogisticRegression(C=C_value)
    lr.fit(X_train_res, y_train_res)

    # Evaluate accuracies and append to lists
    train_errs.append(lr.score(X_train_res, y_train_res))
    valid_errs.append(lr.score(X_test, y_test))

# Plot accuracy against C on a logarithmic x-axis
plt.semilogx(C_values, train_errs, C_values, valid_errs)
plt.legend(("train", "validation"))
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Final L2-regularised logistic regression, trained on the resampled data.
lr = LogisticRegression(penalty='l2', C=1).fit(X_train_res, y_train_res)

test_accuracy = lr.score(X_test, y_test)
print('Score:', test_accuracy)

## Conclusion:
* The average (median) salary lies between 200k and 300k, and the ECDF shows that 80 percent of the salaries lie below 300k.
* The chance of getting a salary higher than 300k is only 20 percent, and of getting more than 400k is less than 10 percent; it depends on factors like degree type, overall percentage and work experience.
* Students with Science & Tech degrees have good placement and a higher chance of getting a high package compared to other degree types.
* Commerce and Management also has good placement, and the number of seats is high.
* MBA in Finance has a high placement percentage, i.e. around 80%, and its students get high packages.
* Work experience is really important for getting placed and for getting a high package; all the high packages are offered to students with work experience.
* Students can get a job without work experience, but the chances are lower and the packages are also lower.
* So, if a student wants a good package, he/she should have an aggregate of more than 65%; if he/she is doing an MBA, then an MBA in Finance is a great option; and while studying he/she should try to get work experience, which is really important.