In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pandas_profiling

#Plotly Libraries

# Run the below code if PLOTLY is not installed
#!pip install plotly

import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
from plotly.subplots import make_subplots
#Run the below code in the Anaconda prompt for pandas-profiling to work
#conda install -c conda-forge pandas-profiling

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Loading data**

In [None]:
# Read the placement CSV and discard the serial-number column in one chained step.
data = (
    pd.read_csv("/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv")
    .drop(columns=["sl_no"])
)
print(data.shape)
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
data.info()

In [None]:
# Summary statistics with the default column selection of describe().
data.describe()

In [None]:
# Summary statistics restricted to object-dtype columns.
data.describe(include="O") #O -> include categorical columns

In [None]:
# Number of distinct values in each column.
data.nunique()

### Missing Value Treatment

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Salary has missing values. Let's replace them with 0, as every row with a null Salary has status 'Not Placed'.
# We cannot drop these rows: the dataset is small, and they may carry valuable information about why a student was not placed.

# Percentage of rows whose salary is missing, rounded to two decimals.
print(round(data['salary'].isnull().sum()/len(data['salary'])*100,2),"% of data in Salary column is NULL!")

In [None]:
# Replace every missing salary with 0.
data['salary'] = data['salary'].fillna(value=0)

# Exploratory Data Analysis
### Interactive report

In [None]:
from pandas_profiling import ProfileReport

# Profile the full DataFrame; the report object is the cell's last expression,
# so the notebook displays it.
ProfileReport(data)

### Multivariate Analysis
- Seaborn Library

In [None]:
print("Salary Distribution as per gender")

plt.figure(figsize=(15,5))
# One density curve per gender code, drawn in the same order as the legend entries.
for gender_code in ("M", "F"):
    sns.kdeplot(data.loc[data.gender == gender_code, "salary"])
plt.legend(["Male", "Female"])
plt.xlabel("Salary(100)")
plt.show()

### Observation:
- Male students got offered more salary than Female students
- More of male students were placed while more female students were not placed.

In [None]:
print("Placement vs Marks at differnt education level")

f, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False, squeeze = True)

# (percentage column, x-axis label) for each panel, in row-major order.
panels = [
    ("ssc_p", "10th Marks"),
    ("hsc_p", "12th Marks"),
    ("degree_p", "Degree_Marks"),
    ("mba_p", "MBA_Marks"),
]

for ax, (column, axis_label) in zip(axes.flat, panels):
    # Two density curves per panel: placed vs not placed students.
    for outcome in ("Placed", "Not Placed"):
        sns.kdeplot(data.loc[data.status == outcome, column], ax=ax, label=outcome)
    # Label and legend are set on the panel's own Axes. plt.xlabel / plt.legend
    # only act on the current Axes, so calling them once per panel would leave
    # the other panels without the intended label and legend.
    ax.set_xlabel(axis_label)
    ax.legend()

# Hide the density tick values on every panel.
plt.setp(axes, yticks=[])
plt.tight_layout()

### Observation:
* SSC
    - Students with good average marks were offered more jobs than top rankers
    - Students scoring less than 50% are not getting job opportunities
* HSC
    - Students getting less than 50% are not selected
    - Students with average score have more chances of getting job offers
* Degree
    - Average students have more chances of getting job offers
    - No student with less than 50% got job offers
* MBA
    - Students with 70% score have got more job offers.

### * For the below graphs Python Plotly is used

In [None]:
# Salary bars per gender, faceted by work experience (rows) and specialisation (columns).
fig = px.bar(
    data,
    x="gender",
    y="salary",
    color="gender",
    facet_row="workex",
    facet_col="specialisation",
    title='Facet view of Student Salary wrt Gender, Specialization in Higher education and previous work experience',
)
fig.show()

### Observation
- Maximum salary offered in 3lakh package
- More job offers are in Marketing & Finance.
- Students with work experience get good offers.
* Male
    - Specialization in Marketing & Finance: irrespective of work experience, they got more job offers with greater CTC.
    - Specialization in Marketing & HR: they got CTC in line with their previous work experience.
* Female
    - Specialization in Marketing & Finance: they got lower CTC than male students.
    - Specialization in Marketing & HR: very few students have work experience.

In [None]:
# Salary against MBA percentage, coloured by specialisation and faceted by
# gender (rows) and work experience (columns).
fig = px.scatter(
    data,
    x="mba_p",
    y="salary",
    color="specialisation",
    facet_row='gender',
    facet_col="workex",
    title='Facet view of Student Salary wrt Gender, MBA%, HighEd Specialization and previous work experience',
)
fig.show()

In [None]:
# Count students per placement outcome.
# The frame is built explicitly from the counts' index and values so that the
# columns are always named 'Status' and 'Count'. The names produced by
# value_counts().to_frame().reset_index() differ between pandas versions, and a
# rename keyed on 'index'/'status' only works on the older naming.
status_counts = data['status'].value_counts()
status = pd.DataFrame({'Status': status_counts.index, 'Count': status_counts.values})
status

In [None]:
# Donut chart of the placement-status counts.
status_donut = go.Pie(labels=status['Status'], values=status['Count'], hole=0.6)
fig = go.Figure([status_donut])
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value+percent',
    textfont_size=15,
    insidetextorientation='radial',
)
fig.update_layout(title="Placement staus", title_x=0.5)
fig.show()

In [None]:
# Every selected column is also a grouping key, so the grouped frame has one
# row per distinct combination of these five columns.
sunburst_columns = ['gender','status','specialisation','salary','degree_t']
sunburst = data[sunburst_columns].groupby(sunburst_columns).agg('max').reset_index()


In [None]:
# Sunburst sized by salary, nested gender -> status -> specialisation -> degree type.
fig = px.sunburst(
    sunburst,
    path=['gender','status','specialisation','degree_t'],
    values='salary',
    title="Salary Distribution by Gender, Placement Status, HigherEd Specialization, Degree Subject",
)
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
print("Tabuler Format - Job offers as per SSC & HSC")
# NOTE(review): count() tallies every non-null status in a group, whichever
# value it holds — confirm whether only "Placed" rows were meant to be counted.
school_counts = data.groupby(["ssc_b", "hsc_b", "hsc_s"])["status"].count()
Table_10_12 = school_counts.to_frame().style.background_gradient(cmap="bone_r")
Table_10_12

In [None]:
print("Tabuler Format - Job offers as per Degree, Work Experience & MBA Specialsation")
# NOTE(review): count() tallies every non-null status in a group, whichever
# value it holds — confirm whether only "Placed" rows were meant to be counted.
degree_counts = data.groupby(["degree_t", "workex", "specialisation"])["status"].count()
Table_deg_workex_mba = degree_counts.to_frame().style.background_gradient(cmap="bone_r")
Table_deg_workex_mba

---

### Splitting dataset into Predictor and Target Dataset:

In [None]:
### Placed --> 1, Not Placed --> 0
## Splitting dataset into X dataset (Predictor variables) & Y dataset (Target variable):

# The target is encoded into Y without overwriting data['status']. Overwriting
# the column would make a second run of this cell compare integers with
# "Placed" and produce an all-zero target, and would break a re-run of the
# earlier cells that filter on the "Placed" / "Not Placed" strings.
Y = (data['status'] == "Placed").astype(int)

# 'salary' is removed together with the target: missing salaries were filled
# with 0 for students who were not placed, so the column reveals the outcome
# and lets any model separate the classes trivially (target leakage).
X = data.drop(['status', 'salary'], axis=1)

In [None]:
# X dataset - creating dummies
# pd.get_dummies is applied to the whole predictor frame; the shapes of X and Y
# are then printed and the first rows of the encoded frame are displayed.
X = pd.get_dummies(X)
print("The Dimension of X (Predictor Dataset):",X.shape)
print("The Dimension of Y (Target Dataset):",Y.shape)
X.head()

### Scaling X dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every predictor column with MinMaxScaler, then rebuild a DataFrame
# that carries the original column names.
# NOTE(review): the scaler is fitted on all rows of X, before the train/test
# split — confirm whether it should be fitted on the training rows only.
scale = MinMaxScaler()
scaled = scale.fit_transform(X)
X = pd.DataFrame(scaled,columns = X.columns)
X.head()

### Splitting dataset into train & test:

In [None]:
# Split x and y into training and testing set (70%-30% ratio and a random state of 200)

# `ms` is the alias under which sklearn.model_selection is used here.
import sklearn.model_selection as ms
x_train, x_test, y_train, y_test= ms.train_test_split(X,Y, test_size=0.3, random_state=200)

In [None]:
# Report the shape of each split; the Y-* entries are the labels of the
# corresponding X-* rows.
for caption, split in (
    ("X-Train :", x_train),
    ("Y-Train :", y_train),
    ("X-Test  :", x_test),
    ("Y-Test  :", y_test),
):
    print(caption, split.shape)

___

## Feature Selection

* Random Forest is a good algorithm for feature selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score ,confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve
import sklearn.metrics as metrics

In [None]:
#Using Random Forest Algorithm
# random_state is fixed (same seed value as the train/test split) so the fitted
# forest, and with it the reported metrics and feature importances, are the
# same on every run.
RF = RandomForestClassifier(n_estimators=100, random_state=200)
RF.fit(x_train, y_train)
# Hard class predictions for the held-out rows.
y_pred = RF.predict(x_test)

In [None]:
print("Random Forest Model Results:\n")
# Convert to a percentage before rounding. Rounding first and multiplying by
# 100 afterwards can print floating-point noise (e.g. 56.99999999999999).
rf_accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print("Accuracy Score:", rf_accuracy_pct, "%")
print("***************************************************\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("***************************************************\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("***************************************************")

In [None]:
# Confusion matrix of the Random Forest predictions, drawn as an annotated heatmap.
CM = pd.DataFrame(confusion_matrix(y_test, y_pred))

rf_heatmap_ax = sns.heatmap(
    CM,
    annot=True,
    annot_kws={"size": 15},
    cmap="cividis_r",
    linewidths=0.9,
)
rf_heatmap_ax.set_title('Confusion matrix for RF', y=1.1, fontdict={'fontsize': 20})
rf_heatmap_ax.set_ylabel('Actual label')
rf_heatmap_ax.set_xlabel('Predicted label')

In [None]:
## ROC curve for RF:
# The curve and the AUC are computed from the predicted probability of class 1.
# Hard 0/1 predictions give roc_curve a single usable threshold, so the "curve"
# collapses to one point and the AUC does not reflect the model's ranking.
rf_scores = RF.predict_proba(x_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, rf_scores)
auc = metrics.roc_auc_score(y_test, rf_scores)

plt.figure(figsize=(10,5))
# Newer Matplotlib releases ship the seaborn style sheet as 'seaborn-v0_8';
# fall back to the old name where the new one is not available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(fpr,tpr,label="AUC ="+str(auc))
# Diagonal reference line.
plt.plot([0,1],[0,1],"r--")
plt.title("ROC for RF model", fontdict = {'fontsize': 20})
# fpr is plotted on x and tpr on y, so the axis labels follow that order.
plt.xlabel("False positive_rate")
plt.ylabel("True positive_rate")
plt.legend(loc= 4, fontsize = "x-large")

In [None]:
# Feature Selection
print("**Dataframe showing Feature Importance in descending order**")
# Pair each training column with the importance the fitted forest assigned to it.
best_features = pd.DataFrame(
    {
        'Features': x_train.columns,
        'Importance': RF.feature_importances_,
    }
)
best_features.sort_values(by='Importance', ascending=False)

### Observation for RandomForest Prediction Model: 
- Since the accuracy and AUC of the Random Forest model are 100%, there is a chance of overfitting. Hence it might not be a good prediction model
- Also as per feature importance, almost all the variables are good fit for prediction model.

___

### Prediction Model - K- Nearest Neighbour(KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
## Getting values of k
# Fit one KNN classifier per candidate k and record its misclassification rate.
# NOTE(review): the error is measured on x_test / y_test, so k is being chosen
# on the test set — confirm whether a validation split or cross-validation on
# the training data was intended.
k_values = range(1, 20)
error_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_pred_kn = knn.predict(x_test)
    error_rate.append(np.mean(y_pred_kn != y_test))

## Plotting values of k

plt.figure(figsize=(15,5))
# Newer Matplotlib releases ship the seaborn style sheet as 'seaborn-v0_8';
# fall back to the old name where the new one is not available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(k_values, error_rate, marker ='o', label= "k-value", linestyle="dashed" )
plt.title(label= "Error rate of all the values of K", fontdict = {'fontsize': 20})
plt.legend(fontsize = "xx-large")
plt.show()

### Observation:
- Errors are least at k=5 as after 5, the errors are increasing

In [None]:
# Refit KNN with k fixed at 5 and predict hard class labels for the test rows.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
y_pred_kn = knn.predict(x_test)

In [None]:
print("KNN Model Results:\n")
# Convert to a percentage before rounding. Rounding first and multiplying by
# 100 afterwards can print floating-point noise in the last digits.
knn_accuracy_pct = round(accuracy_score(y_test, y_pred_kn) * 100, 2)
print("Accuracy Score:", knn_accuracy_pct, "%")
print("***************************************************\n")
print("Classification Report:\n", classification_report(y_test, y_pred_kn))
print("***************************************************\n")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_kn))
print("***************************************************")

In [None]:
# Confusion matrix of the KNN predictions, drawn as an annotated heatmap.
CM_knn = pd.DataFrame(confusion_matrix(y_test, y_pred_kn))

knn_heatmap_ax = sns.heatmap(
    CM_knn,
    annot=True,
    annot_kws={"size": 15},
    cmap="cividis_r",
    linewidths=0.9,
)
knn_heatmap_ax.set_title('Confusion matrix for KNN', y=1.1, fontdict={'fontsize': 20})
knn_heatmap_ax.set_ylabel('Actual label')
knn_heatmap_ax.set_xlabel('Predicted label')

In [None]:
## ROC curve for KNN:
# The curve and the AUC are computed from the predicted probability of class 1.
# Hard 0/1 predictions give roc_curve a single usable threshold, so the "curve"
# collapses to one point and the AUC does not reflect the model's ranking.
knn_scores = knn.predict_proba(x_test)[:, 1]
fpr1, tpr1, _ = metrics.roc_curve(y_test, knn_scores)
auc1 = metrics.roc_auc_score(y_test, knn_scores)

plt.figure(figsize=(10,5))
# Newer Matplotlib releases ship the seaborn style sheet as 'seaborn-v0_8';
# fall back to the old name where the new one is not available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.plot(fpr1, tpr1, label="AUC ="+str(auc1))
# Diagonal reference line.
plt.plot([0,1],[0,1],"r--")
plt.title("ROC for KNN model", fontdict = {'fontsize': 20})
# fpr1 is plotted on x and tpr1 on y, so the axis labels follow that order.
plt.xlabel("False positive_rate")
plt.ylabel("True positive_rate")
plt.legend(loc= 4, fontsize = "x-large")

### Observation for KNN Prediction Model:
* KNN is an average model as the accuracy is 76.92% and AUC is 68.2%

___

# Summary:
* Educational percentage are statistically significant for a candidate to get campus placement.
* Students with good average marks throughout their educational period had more chances to get job offers.
* Past work experience helps in getting better jobs in the final placement after the Master's.
* No gender discrimination while hiring.
* Better packages were provided to Male Candidates.
* There is a significant spike in Male students as we go towards higher education.
* Students with Marketing and Finance Specialisation and previous work experience got the highest CTC.

___