# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped with the Kaggle input datasets.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Read Data

In [None]:
# Load the campus-placement dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(DATA_PATH)

# Data Inspection

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, non-null counts and dtypes in one summary.
df.info()

# Checking Null Values

In [None]:
# Missing values per column, with the worst-affected columns listed first.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False)

Null values are present in the `salary` column.

In [None]:
# Which placement statuses occur among the rows that have no salary?
df.loc[df['salary'].isna(), 'status'].unique()

Only 'Not Placed' students have a null salary, so we will replace the null values with 0.

In [None]:
# Students who were not placed have no salary; record it as 0.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df.salary`: that chained in-place call acts
# on an intermediate Series and leaves `df` unchanged when pandas
# Copy-on-Write is active.
df['salary'] = df['salary'].fillna(0)

In [None]:
# Re-check the per-column null counts after filling the salary column.
df.isna().sum().sort_values(ascending=False)

In [None]:
# Preview the data again now that missing salaries have been filled.
df.head()

No null values remain in the dataset.

# Checking Outliers

In [None]:
# Column dtypes: object columns are categorical, the rest are numeric.
df.dtypes

In [None]:
# Box plot of every numeric column, used to eyeball outliers.
numeric_cols = [c for c in df.columns if df[c].dtypes != 'O']
n_grid_cols = 4
# Ceiling division so the grid grows with the number of numeric columns
# instead of overflowing a fixed 2x4 layout.
n_grid_rows = max(1, -(-len(numeric_cols) // n_grid_cols))
plt.figure(figsize=(20, 5 * n_grid_rows))
for i, x in enumerate(numeric_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    # Pass the column by keyword: the meaning of seaborn's first positional
    # argument changed between releases (x in old versions, data in new
    # ones), so `x=` keeps a horizontal box plot everywhere.
    sns.boxplot(x=df[x])
plt.show()

There are a few outliers in the dataset, but we will not drop or replace them because these values are real.

# Data Analysis

## 1. Does Gender affect salary?

In [None]:
# Compare salaries by gender among placed students only. Students who were
# not placed carry a placeholder salary of 0 (filled in earlier), and
# including them would mix the placement rate into the salary comparison.
placed_students = df.loc[df['status'] == 'Placed']
gender_salary = placed_students.groupby('gender').agg({'salary': ['median', 'mean']})
gender_salary

### Salary is higher for males than for females

## 2. Does placement status differ by gender?

In [None]:
# Number of placed students per gender, kept as a one-column DataFrame.
placed_only = df.loc[df['status'].eq('Placed')]
placed_count = placed_only.groupby('gender')[['status']].count()
placed_count

100 Males placed & 48 Females placed

In [None]:
# Placement rate within each gender: placed students of a gender divided by
# all students of that gender. Dividing by the total number of placed
# students instead would only show each gender's share of the placed group,
# which mostly reflects how many students of each gender are in the data.
gender_totals = df.groupby('gender')['status'].count()
placed_count['percentage'] = 100 * placed_count['status'] / gender_totals
placed_count

### Percentage of male placed students is higher.

In [None]:
# Bar chart of the percentage column per gender.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the two-positional-argument form raises a TypeError.
sns.barplot(x=placed_count.index, y=placed_count['percentage'])
plt.show()

## 3. Do percentages matter for placement?

In [None]:
# Overlay the SSC % distributions of placed and not-placed students.
# Each histogram is labelled and a legend is drawn so the two groups can be
# told apart.
plt.hist(df[df.status=='Placed'].ssc_p, label='Placed')
plt.hist(df[df.status=='Not Placed'].ssc_p, alpha=0.9, label='Not Placed')
plt.xlabel('SSC %')
plt.ylabel('Number of students')
plt.legend()
plt.show()

In [None]:
# Overlay the HSC % distributions of placed and not-placed students.
# Each histogram is labelled and a legend is drawn so the two groups can be
# told apart.
plt.hist(df[df.status=='Placed'].hsc_p, label='Placed')
plt.hist(df[df.status=='Not Placed'].hsc_p, alpha=0.9, label='Not Placed')
plt.xlabel('HSC %')
plt.ylabel('Number of students')
plt.legend()
plt.show()

In [None]:
# Overlay the degree % distributions of placed and not-placed students.
# Each histogram is labelled and a legend is drawn so the two groups can be
# told apart.
plt.hist(df[df.status=='Placed'].degree_p, label='Placed')
plt.hist(df[df.status=='Not Placed'].degree_p, alpha=0.9, label='Not Placed')
plt.xlabel('Degree %')
plt.ylabel('Number of students')
plt.legend()
plt.show()

### Students with higher percentages are more likely to be placed.

In [None]:
# Mean and standard deviation of the three academic scores, per placement status.
score_cols = ['ssc_p', 'hsc_p', 'degree_p']
df.groupby('status')[score_cols].agg(['mean', 'std'])

### Average SSC % for placed students is 71% (±8%)
### Average HSC % for placed students is 69% (±9%)
### Average Degree % for placed students is 68% (±6%)

## 4. Which degree specialization is most in demand among recruiters?

In [None]:
# Number of placed students per degree type, most common first.
df.loc[df['status'] == 'Placed', 'degree_t'].value_counts()

In [None]:
# Bar chart of placed students per degree type.
# The counts are computed once and reused for both axes instead of filtering
# and counting the frame twice.
placed_by_degree = df.loc[df['status'] == 'Placed', 'degree_t'].value_counts()
sns.barplot(x=placed_by_degree.index, y=placed_by_degree.values)
plt.xlabel('Degree type')
plt.ylabel('Number of placed students')
plt.show()

### Comm&Mgmt is the most in demand

In [None]:
# Quick reminder of the available columns before the next question.
df.head()

## 5. Does salary differ by work experience?

In [None]:
# Mean salary by work experience among placed students only. Students who
# were not placed carry a placeholder salary of 0 (filled in earlier), which
# would otherwise drag the averages down in proportion to each group's
# placement rate.
placed_students = df.loc[df['status'] == 'Placed']
sal = placed_students.groupby('workex').agg({'salary': ['mean']})
sal

### Students with work experience earn a higher salary than freshers

# Create Target and Predictor Variables

In [None]:
# Target: the placement status column. Predictors start as an independent
# copy of the full frame, so later edits to X leave df untouched.
Y = df['status']
X = df.copy()

In [None]:
# Remove the row id, the target, and salary from the predictors.
# X is rebuilt from df rather than dropped in place, so re-running this cell
# does not raise a KeyError for columns that are already gone.
X = df.drop(columns=['sl_no', 'salary', 'status'])

# Checking Skewness

In [None]:
# Distribution (density histogram plus KDE curve) of every numeric predictor,
# used to judge skewness.
numeric_cols = [c for c in X.columns if X[c].dtypes != 'O']
plt.figure(figsize=(20,10))
for i, x in enumerate(numeric_cols, start=1):
    plt.subplot(2,3,i)
    # `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
    # scale is its documented replacement.
    sns.histplot(X[x], kde=True, stat='density')
plt.show()

Log Transformation

In [None]:
# Log-transform every numeric predictor to reduce skewness.
# Numeric columns are identified from the untouched `df`, and the logs are
# taken from `df` as well, so re-running this cell does not apply the log
# twice or pick up columns that were label-encoded later.
num_cols = [c for c in X.columns if df[c].dtypes != 'O']
# log is undefined for values <= 0; fail here with a clear message rather
# than letting -inf/NaN flow into the models.
if (df[num_cols] <= 0).any().any():
    raise ValueError('Log transform requires strictly positive values in: %s' % num_cols)
X[num_cols] = np.log(df[num_cols])

In [None]:
# Re-plot the numeric predictor distributions after the log transform.
numeric_cols = [c for c in X.columns if X[c].dtypes != 'O']
plt.figure(figsize=(20,10))
for i, x in enumerate(numeric_cols, start=1):
    plt.subplot(2,3,i)
    # `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
    # scale is its documented replacement.
    sns.histplot(X[x], kde=True, stat='density')
plt.show()

# Label Encoding

In [None]:
# Names of the object-dtype (categorical) predictor columns, in column order.
col = [name for name in X.columns if X[name].dtypes == 'O']
col

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes. One encoder object is
# reused and refitted per column, so it only remembers the last column's classes.
le = LabelEncoder()
for cat_name in col:
    X[cat_name] = le.fit(X[cat_name]).transform(X[cat_name])

In [None]:
# Encode the target as 0 (Not Placed) / 1 (Placed).
# The mapping is applied to the original `df.status` strings rather than to
# `Y` itself: mapping an already-encoded `Y` a second time would turn every
# value into NaN.
Y = df['status'].map({'Not Placed':0,'Placed':1})

In [None]:
# Class balance of the encoded target.
Y.value_counts()

The dataset is imbalanced, so we will use an oversampling technique (SMOTE) on the training data.

# Split Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 25% of the rows for testing; stratifying on Y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.25, random_state=10, stratify=Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [None]:
# Class balance inside the training split.
Y_train.value_counts()

In [None]:
# Put the training predictors and their target back into a single frame
# (rows are matched on the shared index).
training_set = X_train.join(Y_train)

In [None]:
# Training rows of each class: 0 = Not Placed, 1 = Placed.
status_col = training_set['status']
class0 = training_set.loc[status_col.eq(0)]
class1 = training_set.loc[status_col.eq(1)]

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class in the training split only.
# SMOTENC is used instead of plain SMOTE because several predictors are
# label-encoded categories: plain SMOTE interpolates between neighbours and
# would create fractional category codes, whereas SMOTENC assigns synthetic
# rows a category taken from their neighbours.
from imblearn.over_sampling import SMOTENC

# Positions of the label-encoded categorical columns (`col` holds their names).
categorical_idx = [X_train.columns.get_loc(c) for c in col]
stom = SMOTENC(categorical_features=categorical_idx, random_state=42)
stom_x_train, stom_y_train = stom.fit_resample(X_train, Y_train)

In [None]:
# Make sure the resampled training data has labelled columns.
stom_x_train = pd.DataFrame(stom_x_train, columns = X_train.columns)
# Keep the target one-dimensional: scikit-learn estimators expect a 1-D label
# vector, and a one-column DataFrame triggers a shape-conversion warning on fit.
# np.ravel accepts an array, a Series or a one-column DataFrame, so this cell
# is safe to re-run.
stom_y_train = pd.Series(np.ravel(stom_y_train), name='status')

In [None]:
# Class balance of the training target after oversampling.
stom_y_train.value_counts()

Data is balanced now.

# Scaling

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the oversampled training predictors only,
# then apply the same transformation to both the training and the test data.
ss = StandardScaler()
ss.fit(stom_x_train)
X_train_ss, X_test_ss = ss.transform(stom_x_train), ss.transform(X_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression model on the balanced, scaled training data.
lr = LogisticRegression()
# np.ravel flattens the target to the 1-D label vector scikit-learn expects
# (stom_y_train is a one-column DataFrame at this point).
lr.fit(X_train_ss, np.ravel(stom_y_train))
Y_pred = lr.predict(X_test_ss)
# Evaluation on the untouched test split
print(classification_report(Y_test, Y_pred))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the balanced, scaled training data.
# random_state is fixed so the reported metrics are reproducible between
# runs, and the model gets its own name instead of reusing `lr`.
dt_clf = DecisionTreeClassifier(random_state=42)
# np.ravel flattens the target to the 1-D label vector scikit-learn expects
# (stom_y_train is a one-column DataFrame at this point).
dt_clf.fit(X_train_ss, np.ravel(stom_y_train))
Y_pred = dt_clf.predict(X_test_ss)
# Evaluation on the untouched test split
print(classification_report(Y_test, Y_pred))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the balanced, scaled training data.
# random_state is fixed so the reported metrics are reproducible between
# runs, and the model gets its own name instead of reusing `lr`.
rf_clf = RandomForestClassifier(random_state=42)
# np.ravel flattens the target to the 1-D label vector scikit-learn expects
# (stom_y_train is a one-column DataFrame at this point).
rf_clf.fit(X_train_ss, np.ravel(stom_y_train))
Y_pred = rf_clf.predict(X_test_ss)
# Evaluation on the untouched test split
print(classification_report(Y_test, Y_pred))

Thanks a lot, do upvote if you liked it.