In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**DATA DESCRIPTION**

1.sl_no ----> Serial Number

2.gender ---> Gender- Male='M',Female='F'

3.ssc_p ---> Secondary Education percentage- 10th Grade

4.ssc_b ---> Board of Education- Central/ Others

5.hsc_p ---> Higher Secondary Education percentage- 12th Grade

6.hsc_b ---> Board of Education- Central/ Others

7.hsc_s ---> Specialization in Higher Secondary Education

8.degree_p ---> Degree Percentage

9.degree_t ---> Under Graduation(Degree type)- Field of degree education

10.workex ---> Work Experience

11.etest_p ---> Employability test percentage ( conducted by college)

12.specialisation ---> Post Graduation(MBA)- Specialization

13.mba_p ---> MBA percentage

14.status ---> Status of placement- Placed/Not placed

15.salary ---> Salary offered by corporate to candidates

**IMPORTING DATASET**

In [None]:
# Never truncate columns when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Read the campus-placement CSV from the Kaggle input mount.
DATA_PATH = "/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Dropping "sl_no" column as it is of no use in prediction

In [None]:
# The serial number is only a row identifier, so it is removed before modelling.
df = df.drop(columns=["sl_no"])

In [None]:
# Summarise the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Count missing values per column.
df.isnull().sum()

We will replace all the NaN values with 0: this is a placement dataset, and the NaN values appear only in the "salary" column, for candidates who were not placed.

In [None]:
# Missing salaries become 0; no other column is touched.
df = df.fillna({"salary": 0})

In [None]:
# Re-inspect the first rows after the salary fill.
df.head()

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Re-check the per-column missing-value counts after the salary fill.
df.isnull().sum()

Checking the different unique values in the categorical features.

In [None]:
# Categorical (string-valued) columns; reused for inspection and label encoding.
cat_cols = [
    "gender",
    "ssc_b",
    "hsc_b",
    "hsc_s",
    "degree_t",
    "workex",
    "specialisation",
    "status",
]

In [None]:
# Print the distinct labels found in each categorical column.
for col in cat_cols:
    print(col, df[col].unique())

**DATA VISUALISATION**

In [None]:
# Salary against employability-test and MBA percentages.
# x/y are passed by keyword: seaborn >= 0.12 makes them keyword-only, so the
# old positional form raises a TypeError there.
sns.relplot(x="etest_p", y="salary", data=df, hue="degree_t", style="degree_t")
sns.relplot(x="mba_p", y="salary", data=df, hue="specialisation")

In [None]:
# Salary vs. MBA percentage as lines, one panel per specialisation, coloured by work experience.
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.relplot(x="mba_p", y="salary", data=df, kind="line", col="specialisation", hue="workex")

Here, we can see that when the "specialisation" is "Mkt&Fin" then the salary of most of the candidates is high.

In [None]:
# Swarm plot of salary by work experience, restricted to rows with salary > 10000
# (this excludes the rows whose missing salary was filled with 0).
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.catplot(x="workex", y="salary", data=df.query("salary>10000"), kind="swarm")

Candidates with work experience tend to receive a slightly better salary.

In [None]:
# Point plot of degree percentage per degree type, coloured by work experience,
# with one panel per placement status.
sns.catplot(
    data=df,
    x="degree_t",
    y="degree_p",
    hue="workex",
    col="status",
    kind="point",
)

Candidates with a high percentage in their degree college have a better chance of placement than candidates with a low percentage.

In [None]:
# Count plots for the three "field of study" columns, side by side.
# The column is passed as x= with data=df: in seaborn >= 0.12 the first
# positional parameter is `data`, so a bare Series is no longer read as x.
count_panels = [
    ("hsc_s", "Specialization in Higher Secondary Education"),
    ("degree_t", "Under Graduation(Degree type)- Field of degree education"),
    ("specialisation", "Post Graduation(MBA)- Specialization"),
]

fig, axes = plt.subplots(1, len(count_panels), figsize=(20, 10))
for ax, (column, title) in zip(axes, count_panels):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(title)

As we can see, most of the candidates chose the commerce stream in higher secondary school (12th) and also in degree college.

And in MBA colleges the widely selected specialisation field is "Mkt&Fin"

In [None]:
# Strip plots of each percentage column against placement status.
# The five panels were copy-pasted; they are now driven from one list so the
# column and its title stay together. Titles are unchanged.
score_panels = [
    ("ssc_p", "10th percentage"),
    ("hsc_p", "12th percentage"),
    ("degree_p", "degree percentage"),
    ("etest_p", " Employability test percentage"),
    ("mba_p", "MBA percentage"),
]

fig, axes = plt.subplots(1, len(score_panels), figsize=(15, 7))
for ax, (column, title) in zip(axes, score_panels):
    sns.stripplot(x="status", y=column, data=df, ax=ax)
    ax.set_title(title)

We can clearly see that the density of blue dots increases as the percentage increases, so a higher percentage means a better chance of getting placed.

In [None]:
# Salary distribution per gender, split by higher-secondary specialisation.
plt.figure(figsize=(7, 7))
sns.boxplot(
    data=df,
    x="gender",
    y="salary",
    hue="hsc_s",
    palette=["blue", "green", "red"],
)

Salaries for both genders are almost the same, but the science stream has the upper hand, as the highest salary belongs to a science candidate.

In [None]:
plt.figure(figsize=(10,10))
# Correlation heatmap of the numeric columns.
# The frame still holds string-valued categorical columns at this point, and
# pandas >= 2.0 raises on them in DataFrame.corr() instead of silently dropping
# them, so the numeric columns are selected explicitly.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

In [None]:
# Hexbin joint distribution of 10th-grade percentage and salary.
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.jointplot(x="ssc_p", y="salary", data=df, kind="hex")

In [None]:
# Joint distribution of 12th-grade percentage and salary (default plot kind).
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.jointplot(x="hsc_p", y="salary", data=df)

In [None]:
# Hexbin joint distribution of degree percentage and salary.
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.jointplot(x="degree_p", y="salary", data=df, kind="hex")

In [None]:
# Hexbin joint distribution of MBA percentage and salary.
# x/y are passed by keyword because seaborn >= 0.12 rejects them positionally.
sns.jointplot(x="mba_p", y="salary", data=df, kind="hex")

As we saw from the above visualisations we see that salary of the people is mostly in the range of 20k to 40k. And candidates getting placed are also having percentage in the range of 50-75. As we see that salary is increasing with increase in percentage but there are some exceptional candidates who are having a very high salary and not high percentage.

In [None]:
# Pairwise relationships between the frame's columns, coloured by placement status.
sns.pairplot(df,hue="status")

**Both "status" and "salary" can be predicted, so we will try both approaches: classification for "status" and regression for "salary".**

**REGRESSION APPROACH**

**Label Encoding the categorical values**

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is refit for every column, so after the loop `le` only
# holds the classes of the last column in cat_cols.
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

**Feature Scaling**


Some of the features have much larger values than the others, so we will apply a standard scaler to bring all of them into a similar range. This should improve our predictions.

In [None]:
from sklearn.preprocessing import StandardScaler

# Percentage columns to standardise. The list is defined once and reused, so
# the fitted columns and the column labels cannot drift apart.
scaling = ["ssc_p", "hsc_p", "degree_p", "etest_p", "mba_p"]
sc = StandardScaler()

# NOTE(review): the scaler is fit on the full frame before the train/test
# split, so test rows influence the scaling statistics. Consider fitting on
# the training split only.
# The scaled frame carries df's own index, so re-attaching it is row-aligned
# even if the index is not the default 0..n-1.
scaled = pd.DataFrame(sc.fit_transform(df[scaling]), columns=scaling, index=df.index)

# Swap the raw columns for the scaled ones. As before, the scaled columns end
# up after the remaining columns.
df = df.drop(columns=scaling).join(scaled)

In [None]:
# Inspect the first rows after encoding and scaling.
df.head()

**Splitting the dataset into dependent and independent variable**

In [None]:
# Regression target is the salary; every remaining column is used as a feature.
y = df["salary"]
x = df.drop(columns=["salary"])

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for repeatable splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

**Training the dataset on Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ten-tree random forest with a fixed seed, fitted on the training split.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)
regressor.fit(x_train, y_train)

**Checking the R² score on the test set**

In [None]:
# Score the fitted forest on the held-out split.
# NOTE: for scikit-learn regressors `.score` is documented as R^2, not classification accuracy.
regressor.score(x_test,y_test)

**K-Fold cross validation**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split.
# The estimator is a regressor, so the per-fold score is R^2 (requested
# explicitly here), not accuracy. The printed label now says so. The variable
# name `accuracies` is kept for compatibility with the rest of the notebook.
accuracies = cross_val_score(estimator = regressor, X = x_train, y = y_train, cv = 10, scoring = "r2")
print("Mean R^2: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

**Training the dataset on Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the same training split.
regressor_2 = LinearRegression()
regressor_2.fit(X=x_train, y=y_train)

**Checking the R² score on the test set**

In [None]:
# Score the linear model on the held-out split.
# NOTE: for scikit-learn regressors `.score` is documented as R^2, not classification accuracy.
regressor_2.score(x_test,y_test)

**K-Fold Cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model on the training split.
# The estimator is a regressor, so the per-fold score is R^2 (requested
# explicitly here), not accuracy. The printed label now says so. The variable
# name `accuracies` is kept for compatibility with the rest of the notebook.
accuracies = cross_val_score(estimator = regressor_2, X = x_train, y = y_train, cv = 10, scoring = "r2")
print("Mean R^2: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

**R² SCORES**


**Random Forest Regression**
 : R² on the held-out test set - 76%
, mean R² over 10-fold cross validation on the training set - 66%




**Linear Regression**
 : R² on the held-out test set - 88%
, mean R² over 10-fold cross validation on the training set - 74%


**CLASSIFICATION APPROACH**

The "salary" column is of no use to us for classification, so we drop it.

In [None]:
# Salary is not used as a feature when classifying placement status.
df = df.drop(columns=["salary"])

In [None]:
# Inspect the first rows of the frame used for classification.
df.head()

**Splitting the dataset into dependent and independent variable**

In [None]:
# Classification target is the placement status; every remaining column is a feature.
Y = df["status"]
X = df.drop(columns=["status"])

**Splitting the dataset into Training and Test set**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed for repeatable splits.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

**Training the dataset on Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
lr = LogisticRegression()
lr.fit(X=X_train, y=Y_train)

**Checking the accuracy with the confusion matrix and accuracy score**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out split, print the confusion matrix, then show the accuracy.
Y_pred = lr.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

**K-Fold cross validation**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated score of the logistic model on the training split.
accuracies = cross_val_score(lr, X_train, Y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

**Training the dataset on Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forty-tree random forest using the entropy split criterion and a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=40,
    criterion="entropy",
    random_state=0,
)
rfc.fit(X_train, Y_train)

**Checking the accuracy with the confusion matrix and accuracy score**

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out split, print the confusion matrix, then show the accuracy.
Y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

**K-Fold cross Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated score of the random forest on the training split.
accuracies = cross_val_score(rfc, X_train, Y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

**ACCURACY**


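**Logistic Regression**
 : accuracy on the held-out test set - 81%
, mean accuracy over 10-fold cross validation on the training set - 88.9%




**Random Forest Classification**
 : accuracy on the held-out test set - 72%
, mean accuracy over 10-fold cross validation on the training set - 88.89%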
