In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Data visualization
import seaborn as sb 
from itertools import product
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Here we will import the data and load it to a dataframe
df = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [None]:
# Let's see our top 5 rows
df.head()

In [None]:
# Summary of our dataframe
df.info()

- Here we can see that our dataset contains 215 rows with 15 features associated with it.

In [None]:
# Checking for Null values in our Dataset
df.isnull().sum()

- Salary contains 67 NaN values in the dataset. We will replace all those NaN values in the next step.

In [None]:
# Rows whose Salary value is not present
df[df['salary'].isnull()]

In [None]:
# Let's replace those NaN value with 0
# NOTE(review): the zeros stay in `salary` for everything downstream, including the
# salary plots and the StandardScaler step, so any statistic computed on this column
# mixes real salaries with these placeholder zeros.
df['salary'] = df['salary'].fillna(0)

In [None]:
# Checking all the null values are removed
df.isnull().sum()

### Univariate Exploration:

#### Different Candidate count in Placement Drive:

In [None]:
# Let's check the different data of Gender column
df['gender'].value_counts()

In [None]:
# Exploration of different Candidate: one bar per gender, with the count printed above it
gender_counts = df['gender'].value_counts()
plt.figure(figsize = (14.7, 8.27))
viscolor = sb.color_palette('colorblind')[0]
ax = sb.countplot(data = df, x = 'gender', color = viscolor, order = gender_counts.index)
plt.xlabel('Gender')
plt.ylabel('Count')
# Iterate over the counts themselves instead of `reset_index().iterrows()`:
# the column holding the counts is named 'gender' before pandas 2.0 but 'count'
# from 2.0 on, so reading `v.gender` is not portable across versions.
# The bars are drawn in `gender_counts.index` order, so position i matches bar i.
for i, count in enumerate(gender_counts):
    ax.text(i, count + 0.2, count, color='black')
plt.title('Different Candidate in Placement Drive');

- We can see here that the placement-drive records contain more male candidates than female candidates.

#### Education Specialization:

In [None]:
# Value of count of different Specialization
df['hsc_s'].value_counts()

In [None]:
# Visualization of higher secondary specialization
hsc_s_counts = df['hsc_s'].value_counts()
# Take the labels from the counts so each wedge is always named after the
# category it actually represents; a hard-coded list only matches when
# value_counts() happens to return the categories in that exact order.
cls_name = list(hsc_s_counts.index)
fig, ax = plt.subplots(figsize = (14.7, 8.27))
wedges, text, autotext = ax.pie(hsc_s_counts, labels = cls_name, autopct = '%1.2f%%')
ax.legend(wedges, cls_name, loc = "center left", bbox_to_anchor =(1, 0, 0.5, 1))
ax.set_title("Proportion of Different Specialization in Higher Secondary");

- We can see that most candidates in the placement drive come from a Commerce or Science background. Far fewer come from an Arts background: only 5.12%.

In [None]:
# Value count of Degree specialization
df['degree_t'].value_counts()

In [None]:
# Value count of Postgrad specialization
df['specialisation'].value_counts()

In [None]:
# Pie chart of undergraduate degree types, labelled straight from the counts
degree_counts = df['degree_t'].value_counts()
degree_labels = degree_counts.index
fig, ax = plt.subplots(figsize = (14.7, 8.27))
wedges, text, autotext = ax.pie(degree_counts, labels = degree_labels, autopct = '%1.2f%%')
ax.legend(wedges, degree_labels, loc = "center left", bbox_to_anchor =(1, 0, 0.5, 1))
ax.set_title("Proportion of Different Specialization in Degree");

- Here we can see that most candidates hold a Comm&Mgmt degree, followed by Sci&Tech.

In [None]:
# Pie chart of postgraduate (MBA) specialisations, labelled straight from the counts
spec_counts = df['specialisation'].value_counts()
spec_labels = spec_counts.index
fig, ax = plt.subplots(figsize = (14.7, 8.27))
wedges, text, autotext = ax.pie(spec_counts, labels = spec_labels, autopct = '%1.2f%%')
ax.legend(wedges, spec_labels, loc = "center left", bbox_to_anchor =(1, 0, 0.5, 1))
ax.set_title("Proportion of Different Specialization in Post Graduation");

- Here we can see that 55.81% of candidates belong to the Mkt&Fin postgraduate programme. The rest belong to Mkt&HR.

#### Creating Category of Mark Secured in Different Educational Phase:
- Here we will create 3 category:
    - 85% + 
    - 60% - 85%
    - < 60% 

In [None]:
# Helper that buckets a percentage score into one of the three bands listed above.
# Its result is stored in a new category column for each score.
def checkCateg(perct):
    """Return the score band for a percentage.

    '85% +' for scores of 85 and above, '60% - 85%' for scores from 60 up to
    (but not including) 85, and '< 60%' for everything else.
    """
    if perct >= 85:
        return '85% +'
    if perct >= 60:
        return '60% - 85%'
    return '< 60%'

In [None]:
# Add one '<score>_catg' column per percentage column, holding its score band
for score_col in ('ssc_p', 'hsc_p', 'mba_p', 'degree_p', 'etest_p'):
    df[score_col + '_catg'] = df[score_col].apply(checkCateg)

In [None]:
# Let's visualize what is proportion of different score in HSC and SSC
hsc_catg_counts = df['hsc_p_catg'].value_counts()
ssc_catg_counts = df['ssc_p_catg'].value_counts()
# The legend is built from the SSC wedges, so its labels must follow the SSC
# wedge order. A hard-coded list would mislabel the wedges whenever the
# frequency order of the bands differs from the one assumed.
categ_cls = list(ssc_catg_counts.index)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize =(14.70, 8.27))
wedges, text, autotext = ax1.pie(hsc_catg_counts,
                                 labels = hsc_catg_counts.index,
                                 autopct = '%1.2f%%')
ax1.set_title("Score of Different Candidate in HSC");
plt.tight_layout()
wedges, text, autotext = ax2.pie(ssc_catg_counts,
                                 labels = ssc_catg_counts.index,
                                 autopct = '%1.2f%%')
ax2.set_title("Score of Different Candidate in SSC");
ax2.legend(wedges, categ_cls,
           loc = "upper right", bbox_to_anchor =(1, 0, 0.5, 1));

##### Insights:
- We can see that most candidates in the placement drive fall in the 60%-85% category in both HSC and SSC. Only 4 to 6% of candidates fall in the 85% + category in HSC and SSC.
- We will further visualize the other score and gain some insight.

In [None]:
# Side-by-side pies: score bands for the undergraduate degree (left) and the MBA (right)
categ_cls = ['60% - 85%', '< 60%', '85% +']
degree_catg_counts = df['degree_p_catg'].value_counts()
mba_catg_counts = df['mba_p_catg'].value_counts()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize =(14.70, 8.27))
wedges, text, autotext = ax1.pie(degree_catg_counts,
                                 labels = degree_catg_counts.index,
                                 autopct = '%1.2f%%')
ax1.set_title("Score of Different Candidate in Degree");
plt.tight_layout()
wedges, text, autotext = ax2.pie(mba_catg_counts,
                                 labels = mba_catg_counts.index,
                                 autopct = '%1.2f%%')
ax2.set_title("Score of Different Candidate in MBA");
# One figure-level legend, built from the left-hand (degree) pie's wedges
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right');

- Here we can see a similar trend: most candidates fall in the 60%-85% category.
- In Degree, only 0.93% of candidates fall in the "85% +" category. In this placement drive, no candidate scored "85% +" in the MBA.

In [None]:
# Pie chart of score bands for the employability test (e-test)
etest_catg_counts = df['etest_p_catg'].value_counts()
fig, ax = plt.subplots(figsize =(14.70, 8.27))
wedges, text, autotext = ax.pie(etest_catg_counts,
                                labels = etest_catg_counts.index,
                                autopct = '%1.2f%%')
ax.set_title("Score of Different Candidate ETest")
# Figure-level legend built from the wedge labels
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels,loc = "center", bbox_to_anchor =(0, 0, 0.5, 1));

- In ETest we can observe the same trend as above. But we can see 22.33% candidate belong to 85% + category.

#### Visualization of Work Experience of Candidates in placement drive:

In [None]:
# Visualize of Work Experience: one bar per value, with the count printed above it
workex_counts = df['workex'].value_counts()
plt.figure(figsize = (14.7, 8.27))
ax = sb.countplot(data = df, x = 'workex', color = viscolor, order = workex_counts.index)
plt.xlabel('Work Experience')
plt.ylabel('Count')
# Iterate over the counts themselves instead of `reset_index().iterrows()`:
# the column holding the counts is named 'workex' before pandas 2.0 but 'count'
# from 2.0 on, so reading `v.workex` is not portable across versions.
for i, count in enumerate(workex_counts):
    ax.text(i, count + 0.2, count, color='black')
plt.title('Work Experience of Candidates in Placement Drive');

- We can see that most candidates have no work experience.

#### Candidate Placement Status:

In [None]:
# Visualizing candidate placement status: one bar per status, with the count printed above it
status_counts = df['status'].value_counts()
plt.figure(figsize = (14.7, 8.27))
ax = sb.countplot(data = df, x = 'status', color = viscolor,
            order = status_counts.index)
plt.xlabel('Placement Status')
plt.ylabel('Count')
# Iterate over the counts themselves instead of `reset_index().iterrows()`:
# the column holding the counts is named 'status' before pandas 2.0 but 'count'
# from 2.0 on, so reading `v.status` is not portable across versions.
for i, count in enumerate(status_counts):
    ax.text(i, count + 0.2, count, color='black')
plt.title('Candidate Placement Status');

- Here we can see that more of the candidate are placed.

In the next exploration we will see the relation between two variable and gain some insights.

### Bivariate Exploration:

#### Placement Status of Different Candidate:

In [None]:
# defining a function which will show the frequency of the bar in the countplot
def withhue(ax, feature, nooffeature, huecategories):
    """Annotate every bar of a hue-split count plot with its share of its x-category.

    Parameters
    ----------
    ax : matplotlib Axes holding the bars of a count plot drawn with ``hue``.
    feature : name of the x-axis column. Kept for backward compatibility; the
        totals are now derived from the bars, so it is no longer read.
    nooffeature : number of categories on the x-axis.
    huecategories : number of hue levels.

    Each percentage is the bar's height divided by the summed height of all
    bars in the same x-category. The previous version took the denominator
    from ``df[feature].value_counts()``, which is sorted by frequency, and
    indexed it by bar position; that gives wrong percentages whenever the
    plotted category order differs from the frequency order. It also depended
    on a global ``df``.

    Assumes, as the original indexing did, that ``ax.patches`` lists the bars
    hue level by hue level (bar ``j * nooffeature + i`` is x-category ``i`` of
    hue level ``j``). Finishes by calling ``plt.show()``.
    """
    import math

    bars = list(ax.patches)
    for i in range(nooffeature):
        group = [bars[j * nooffeature + i] for j in range(huecategories)]
        heights = [bar.get_height() for bar in group]
        # Bars for empty x/hue combinations may have a NaN height; ignore them.
        total = sum(h for h in heights if math.isfinite(h))
        if total <= 0:
            continue
        for bar, height in zip(group, heights):
            if not math.isfinite(height):
                continue
            percentage = '{:.1f}%'.format(100 * height / total)
            # The small left shift keeps the original label placement.
            x = bar.get_x() + bar.get_width() / 2 - 0.15
            y = bar.get_y() + height
            ax.annotate(percentage, (x, y), size = 12, ha='center', va='bottom')
    plt.show()

In [None]:
# Placement status split by gender, annotated with within-gender percentages
fig, ax = plt.subplots(figsize = (14.7, 8.27))
sb.countplot(data = df, x = 'gender', hue = 'status', ax = ax)
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.set_title('Placement Status of Different Candidate');
withhue(ax, 'gender', 2, 2)

- Here we can see that more male candidates than female candidates are placed. More male candidates than female candidates are also not placed.

#### Work Experience of Different Candidate:

In [None]:
# Work experience split by gender, annotated with within-gender percentages
fig, ax = plt.subplots(figsize = (14.7, 8.27))
sb.countplot(data = df, x = 'gender', hue = 'workex', ax = ax)
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.set_title('Work Experience of Different Candidate');
withhue(ax, 'gender', 2, 2)

- Proportion of No workexperience is more for both male and female candidate.

#### MBA Specialisation of Different Candidate:

In [None]:
# MBA specialisation split by gender, annotated with within-gender percentages
fig, ax = plt.subplots(figsize = (14.7, 8.27))
sb.countplot(data = df, x = 'gender', hue = 'specialisation', ax = ax)
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.set_title('MBA Specialisation of different Candidate');
withhue(ax, 'gender', 2, 2)

- Male candidates more often have the Mkt&Fin specialisation, whereas female candidates more often have the Mkt&HR specialisation.

#### Salary of Different Candidate with WorkExperience:

In [None]:
# Visualisation of Salary of Different Candidate with WorkExperience
# Only placed candidates have a real salary; the missing salaries of unplaced
# candidates were filled with 0 earlier, and including those zeros would make the
# violins describe placement rate rather than pay. The salary swarm plot in the
# multivariate section applies the same filter.
placed = df.query('status == "Placed"')
plt.figure(figsize = (14.7, 8.27))
sb.violinplot(data = placed, x = 'workex', y = 'salary', color = viscolor, inner = 'quartile')
plt.xlabel('Work Experience')
plt.ylabel('Salary')
plt.title('Salary of Candidate by Work Experience');

- Candidates with work experience receive higher salaries than candidates with no experience.

#### HSC Percentage of different HSC Board:

In [None]:
# Visualization of HSC percentage distribution of different HSC Board
plt.figure(figsize = (14.7, 8.27))
sb.boxplot(data = df, x = 'hsc_b', y = 'hsc_p', color = viscolor)
plt.xlabel('HSC Board')
# Axis label spelling corrected (was 'HSC Percetage')
plt.ylabel('HSC Percentage')
plt.title('HSC Percentage of Different HSC Board');

- Central board candidates have a better median score than candidates from Other boards. However, Other board candidates show a wider score range than Central board candidates.

### Multivariate Exploration:

#### Placement status of different candidate by MBA and Degree:

In [None]:
# Grid of count plots: rows = degree type, columns = placement status,
# bars = gender split by MBA specialisation
facet_spec = dict(x = 'gender', hue = 'specialisation', col = 'status', row = 'degree_t')
g = sb.catplot(kind = 'count', data = df, **facet_spec);
g.set_axis_labels('Gender', 'Count');

- Candidates having Comm&Mgmt degree and Mkt&Fin specialisation are placed in more number than that of Mkt&HR specialisation.

#### Placement Status of Different Degree holders with Specialisation:

In [None]:
# Grid of violin plots of degree percentage per degree type:
# rows = MBA specialisation, columns = placement status
facet_spec = dict(x = 'degree_t', y = 'degree_p', col = 'status', row = 'specialisation')
g = sb.catplot(kind = 'violin', data = df, inner = 'quartile', color = viscolor, **facet_spec);
g.set_axis_labels('Degree', 'Percentage');

- Having higher percentage gives more chance of placement.

#### Salary of Different Candidates with Degree and Specialisation:

In [None]:
# Grid of swarm plots of salary per degree type, for placed candidates only:
# rows = MBA specialisation, columns = work experience
placed_only = df.query('status == "Placed"')
g = sb.catplot(kind = 'swarm', data = placed_only, x = 'degree_t', y = 'salary',
               color = viscolor, row = 'specialisation', col = 'workex');
g.set_axis_labels('Degree', 'Salary');

- Here we can see that, across degree types, candidates with the Mkt&Fin specialisation receive higher salaries than those with Mkt&HR.

#### Job offers by Educational Qualification:

In [None]:
# Grid of count plots: rows = MBA specialisation, columns = degree type,
# bars = higher-secondary stream split by placement status
facet_spec = dict(x = 'hsc_s', hue = 'status', col = 'degree_t', row = 'specialisation')
g = sb.catplot(kind = 'count', data = df, **facet_spec);
g.set_axis_labels('Qualification', 'Count');

- We can see that holders of every kind of degree have received jobs. But those who have a Commerce background, a degree in Comm&Mgmt and an MBA specialisation in Mkt&Fin or Mkt&HR are the most likely to be placed, followed by those with a Science background, a Sci&Tech degree and a Mkt&Fin or Mkt&HR specialisation.

#### Insights:
- By analyzing the data we saw that Commerce-background candidates are placed in the greatest numbers, followed by Science-background candidates.
- Mkt&Fin MBA specialisation candidates are placed in greater numbers, followed by the Mkt&HR specialisation.
- Work experience is a bonus point: candidates with work experience are more likely to get a good salary package.
- Scoring a good percentage throughout one's academic career may help a candidate secure a job.

### Classification of Placement Status & Prediction of Salary using Different Attribute:
We have now gained some insight into the attributes that help a candidate secure a job and a good package. Using these attributes, we will first classify whether a candidate is going to secure a job or not; then we will use regression to predict the package that a candidate might get.

#### Label Encoding & Feature Scaling:
- Several numeric features need to be scaled before they are used in a model, and the categorical variables need to be encoded.
###### Reference:
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [None]:
# Label encoding
# Importing the library
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df[['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation','status']] = df[['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation','status']].apply(le.fit_transform)

In [None]:
# Checking the Encoded values
df.head(10)

In [None]:
# Feature scaling: standardise the percentage columns and the salary in place
from sklearn.preprocessing import StandardScaler

numeric_cols = ['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']
scaler = StandardScaler()
# The scaler is fitted on the full frame, before any train/test split is made
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [None]:
# Checking Scaled features
df.head(10)

In [None]:
# Classification inputs: columns 1..12 are the features, column 13 is the target
X, y = df.iloc[:, 1:13], df.iloc[:, 13]

In [None]:
# Creating Training and Test data for classification
from sklearn.model_selection import train_test_split
# random_state makes the split (and every metric computed from it) reproducible
# from run to run. stratify keeps the class ratio of the target the same in the
# training and the 20% test portion, which matters on a small, imbalanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, shuffle = True, random_state = 0, stratify = y)

In [None]:
# Importing Classifier Libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Per-classifier scores, appended to by get_accuracy() in call order
recall_l = []
precision_l = []
accuracy_l = []

In [None]:
# Defining a Function to get the accuracy of different classifiers
def get_accuracy(y_test, y_pred):
    """Score binary predictions and record the scores in the module-level lists.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, same length as ``y_test``.

    Returns
    -------
    (recall, precision, accuracy) where the positive class is the second label
    of the confusion matrix (row/column index 1).

    Side effects
    ------------
    Appends the three scores to ``recall_l``, ``precision_l`` and ``accuracy_l``.
    Calling it again for the same classifier therefore adds a second entry.

    Raises
    ------
    ValueError if the confusion matrix is not 2x2, i.e. the labels seen are not
    exactly two classes. Indexing such a matrix would either fail with an opaque
    IndexError or silently read the wrong cells.
    """
    matrix = confusion_matrix(y_test, y_pred)
    if matrix.shape != (2, 2):
        raise ValueError(
            'get_accuracy expects exactly two classes, got a confusion matrix of shape {}'.format(matrix.shape))
    # Rows are true labels, columns are predicted labels.
    TN, FP, FN, TP = (int(v) for v in matrix.ravel())
    # Guard the denominators: with no positive samples (recall) or no positive
    # predictions (precision) the ratio is 0/0. Report 0.0 instead of NaN.
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    accuracy = accuracy_score(y_test, y_pred)
    recall_l.append(recall)
    precision_l.append(precision)
    accuracy_l.append(accuracy)
    return recall, precision, accuracy

In [None]:
# Decision Tree Classifier: fit on the training split, score on the test split
df_clf = DecisionTreeClassifier(random_state = 0)
df_clf.fit(X_train, y_train)
y_pred = df_clf.predict(X_test)
recall, precision, accuracy = get_accuracy(y_test, y_pred)
print('================================')
print('Classifier: Decision Tree')
for caption, score in (('Recall    : ', recall),
                       ('Precision : ', precision),
                       ('Accuracy  : ', accuracy)):
    print(caption, score)

In [None]:
# Random Forest Classifier (50 trees, entropy criterion): fit, predict, score
rf_clf = RandomForestClassifier(n_estimators = 50, random_state = 0, criterion = 'entropy')
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
recall, precision, accuracy = get_accuracy(y_test, y_pred)
print('================================')
print('Classifier: Random Forest')
for caption, score in (('Recall    : ', recall),
                       ('Precision : ', precision),
                       ('Accuracy  : ', accuracy)):
    print(caption, score)

In [None]:
# Logistic Regression (lbfgs solver): fit, predict, score
lr = LogisticRegression(solver = 'lbfgs')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
recall, precision, accuracy = get_accuracy(y_test, y_pred)
print('================================')
print('Classifier: Logistic Regression')
for caption, score in (('Recall    : ', recall),
                       ('Precision : ', precision),
                       ('Accuracy  : ', accuracy)):
    print(caption, score)

In [None]:
# Grouped bar chart: recall, precision and accuracy side by side for each classifier
x = ['DecisionTree', 'RandomForest', 'LogisticRegression']
x_axis = np.arange(len(x))
plt.figure(figsize = [14.70, 8.27])
# Each metric gets its own bar, shifted left/centre/right of the tick position
for shift, scores, metric in ((-0.2, recall_l, 'Recall'),
                              (0, precision_l, 'Precision'),
                              (0.2, accuracy_l, 'Accuracy')):
    plt.bar(x_axis + shift, scores, 0.15, label = metric)
plt.xticks(x_axis, x)
plt.legend()
plt.title('Recall, Precision, Accuracy of Classifiers');

In [None]:
# Regression inputs: columns 1..13 are the features, column 14 is the target
X, y = df.iloc[:, 1:14], df.iloc[:, 14]

In [None]:
# Hold out the last 20% of rows as the regression test set (no shuffling)
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size = 0.2, shuffle = False)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Importing Library
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Collects one root-mean-squared-error value per regressor, in the order they are run
rmse = list()

In [None]:
# Random Forest Regression
regressor_rf = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor_rf.fit(X_train, y_train)
# Predictions go into `y_pred`. Previously they were assigned to `y_test`, which
# destroyed the ground truth, and the error was then computed against a stale
# `y_pred` left over from the classification section.
y_pred = regressor_rf.predict(X_test)
# Look the true targets up by index so the score does not depend on `y_test`
# having survived untouched in the kernel.
y_true = y.loc[X_test.index]
# np.sqrt(MSE) instead of `squared = False`, which recent scikit-learn no longer accepts.
rmse_rf = np.sqrt(mean_squared_error(y_true, y_pred))
print('==============================')
print('Regression Model      : RandomForest')
print('RMSE                  : ', rmse_rf)
rmse.append(rmse_rf)

In [None]:
# Decision Tree Regressor
regressor_dt = DecisionTreeRegressor(random_state = 0)
regressor_dt.fit(X_train, y_train)
# Predictions go into `y_pred`. Previously they were assigned to `y_test`, which
# destroyed the ground truth, and the error was then computed against a stale
# `y_pred` from an earlier cell.
y_pred = regressor_dt.predict(X_test)
# Look the true targets up by index so the score does not depend on `y_test`
# having survived untouched in the kernel.
y_true = y.loc[X_test.index]
# np.sqrt(MSE) instead of `squared = False`, which recent scikit-learn no longer accepts.
rmse_dt = np.sqrt(mean_squared_error(y_true, y_pred))
print('==============================')
print('Regression Model      : DecisionTree')
print('RMSE                  : ', rmse_dt)
rmse.append(rmse_dt)

In [None]:
# Linear Regression
regressor_lr = LinearRegression()
regressor_lr.fit(X_train, y_train)
# Predictions go into `y_pred`. Previously they were assigned to `y_test`, which
# destroyed the ground truth, and the error was then computed against a stale
# `y_pred` from an earlier cell.
y_pred = regressor_lr.predict(X_test)
# Look the true targets up by index so the score does not depend on `y_test`
# having survived untouched in the kernel.
y_true = y.loc[X_test.index]
# np.sqrt(MSE) instead of `squared = False`, which recent scikit-learn no longer accepts.
rmse_lr = np.sqrt(mean_squared_error(y_true, y_pred))
print('==============================')
print('Regression Model      : Linear')
print('RMSE                  : ', rmse_lr)
rmse.append(rmse_lr)

In [None]:
# Bar chart comparing the RMSE of the three regressors
x = ['RandomForest', 'DecisionTree', 'LinearRegression']
x_axis = np.arange(len(x))
bar_width = 0.15
plt.figure(figsize = (14.7, 8.27))
plt.bar(x_axis, rmse, bar_width, label = 'RMSE')
plt.xticks(ticks = x_axis, labels = x)
plt.title('RMSE for Regressor');