# Campus Placement Prediction with Logistic Regression

Import the libraries that will be used:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Read in and inspect the data.

In [None]:
# Load the placement dataset from the relative input directory into a DataFrame.
df = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

In [None]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

No missing data besides 'salary' for those who were not placed. 

Rename all features for readability and remove 'sl_no'.

In [None]:
# 'sl_no' is dropped first; the remaining terse column codes are then
# replaced with descriptive labels used throughout the rest of the notebook.
df = (
    df.drop(columns='sl_no')
      .rename(columns={
          'gender': 'Gender',
          'ssc_p': 'Secondary Ed %',
          'ssc_b': 'Secondary Board',
          'hsc_p': 'Higher Secondary Ed %',
          'hsc_b': 'Higher Secondary Board',
          'hsc_s': 'Higher Secondary Specialization',
          'degree_p': 'Degree %',
          'degree_t': 'Under Grad Field',
          'workex': 'Work Exp',
          'etest_p': 'Employability Test %',
          'specialisation': 'Post Grad Specialization',
          'mba_p': 'MBA %',
          'status': 'Placement Status',
          'salary': 'Salary',
      })
)

## Exploratory Data Analysis

Explore features in the data and the relationship to 'Placement Status':

### Placement Status

In [None]:
# Class balance of the target: number of students per placement outcome.
sns.countplot(data=df, x='Placement Status')

In [None]:
# Share of rows labelled 'Placed', as a percentage of all rows.
int(df['Placement Status'].eq('Placed').sum()) / len(df) * 100

About 69% of the data are for those who were placed. 

### Gender

In [None]:
# Placement outcome counts, split by gender.
sns.countplot(data=df, x='Placement Status', hue='Gender')

It appears that men are more likely than women to be placed. Let's look closer. First, define a function that displays the 'Placement Status' percentages for each unique value of a feature.

In [None]:
def feature_classification_percent(dataframe, feature, goalfeature, positive_goal_value):
    """Show, for each unique value of ``feature``, how often the goal is positive.

    The displayed table is indexed by the unique values of ``feature``
    (index name ``'Values'``) and has three columns:

    * ``Total Amount`` - rows with that value and a non-missing ``goalfeature``.
    * ``Positive Amount`` - rows with that value whose ``goalfeature`` equals
      ``positive_goal_value``.
    * ``Positive Percentage`` - ``Positive Amount / Total Amount * 100``,
      rounded to one decimal and formatted like ``'71.9%'``.

    Rows are ordered from the highest to the lowest percentage; ties keep the
    order in which the values first appear in ``dataframe``. Rows whose
    ``feature`` value is missing are left out, because they have no
    meaningful group to report on.

    Args:
        dataframe: pandas DataFrame holding both columns.
        feature: Name of the column whose unique values are summarised.
        goalfeature: Name of the target column.
        positive_goal_value: Value of ``goalfeature`` counted as positive.

    Returns:
        None. The table is rendered with ``display`` when that name exists
        (IPython/Jupyter sessions) and with ``print`` otherwise.
    """
    # Only rows with a known goal contribute to the totals.
    known = dataframe[dataframe[goalfeature].notna()]

    # Positional arrays keep the grouping independent of the frame's index.
    flags = pd.DataFrame({
        'Values': known[feature].to_numpy(),
        'positive': (known[goalfeature] == positive_goal_value).to_numpy(),
    })

    # One grouped pass instead of re-filtering the frame per unique value;
    # sort=False keeps the values in order of first appearance.
    grouped = flags.groupby('Values', sort=False)['positive']
    totals = grouped.size()
    positives = grouped.sum().astype(int).reindex(totals.index)

    feature_dataframe = pd.DataFrame(
        {'Total Amount': totals, 'Positive Amount': positives},
        index=totals.index,
    ).rename_axis('Values')

    percent = feature_dataframe['Positive Amount'] / feature_dataframe['Total Amount'] * 100

    # Rank on the numeric percentage. Sorting the formatted strings would be
    # lexicographic, e.g. '100.0%' would be placed below '90.0%'.
    ranking = percent.sort_values(ascending=False, kind='mergesort').index

    feature_dataframe['Positive Percentage'] = percent.round(1).astype(str) + '%'
    feature_dataframe = feature_dataframe.loc[ranking]

    # ``display`` is only available inside IPython/Jupyter sessions.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        show(feature_dataframe)

In [None]:
# Placement rate for each gender.
feature_classification_percent(
    dataframe=df,
    feature='Gender',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

71.9% of men and 63.2% of women are placed. There are more men that were placed in the data and they were more likely to be placed.

### Secondary Ed %

In [None]:
# Distribution of secondary-school percentage per placement outcome.
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
sns.boxplot(x='Secondary Ed %', y='Placement Status', data=df)

As 'Secondary Ed %' increases, the likelihood of being placed increases.

### Secondary Board

In [None]:
# Placement outcome counts, split by secondary education board.
sns.countplot(data=df, x='Placement Status', hue='Secondary Board')

In [None]:
# Placement rate for each secondary education board.
feature_classification_percent(
    dataframe=df,
    feature='Secondary Board',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

'Secondary Board' doesn't appear to have much impact on 'Placement Status'

### Higher Secondary Ed %

In [None]:
# Distribution of higher-secondary percentage per placement outcome.
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
sns.boxplot(x='Higher Secondary Ed %', y='Placement Status', data=df)

A higher 'Higher Secondary Ed %' appears to increase the likelihood of being placed.

### Higher Secondary Board

In [None]:
# Placement outcome counts, split by higher secondary education board.
sns.countplot(data=df, x='Placement Status', hue='Higher Secondary Board')

In [None]:
# Placement rate for each higher secondary education board.
feature_classification_percent(
    dataframe=df,
    feature='Higher Secondary Board',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

Again, the 'Higher Secondary Board' doesn't seem to influence 'Placement Status'.

### Higher Secondary Specialization

In [None]:
# Students per higher secondary specialization, split by placement outcome.
sns.countplot(data=df, x='Higher Secondary Specialization', hue='Placement Status')

In [None]:
# Placement rate for each higher secondary specialization.
feature_classification_percent(
    dataframe=df,
    feature='Higher Secondary Specialization',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

Commerce and Science specializations have the same chance of being placed, but Arts is less likely. 

### Degree %

In [None]:
# Distribution of undergraduate degree percentage per placement outcome.
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
sns.boxplot(x='Degree %', y='Placement Status', data=df)

As 'Degree %' increases, the likelihood of being placed also increases.

### Under Grad Degrees

In [None]:
# Placement outcome counts, split by undergraduate field.
sns.countplot(data=df, x='Placement Status', hue='Under Grad Field')

In [None]:
# Placement rate for each undergraduate field.
feature_classification_percent(
    dataframe=df,
    feature='Under Grad Field',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

### Work Experience

In [None]:
# Placement outcome counts, split by whether the student has work experience.
sns.countplot(data=df, x='Placement Status', hue='Work Exp')

'Work Exp' appears to influence 'Placement Status': most students with work experience were placed, while those without it are split between placed and not placed.

### Employability Test

In [None]:
# Distribution of employability test percentage per placement outcome.
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
sns.boxplot(x='Employability Test %', y='Placement Status', data=df)

There is a lot of overlap, but it may still influence 'Placement Status'.

### Post Grad Specialization

In [None]:
# Placement outcome counts, split by post-graduate specialization.
sns.countplot(data=df, x='Placement Status', hue='Post Grad Specialization')

In [None]:
# Placement rate for each post-graduate specialization.
feature_classification_percent(
    dataframe=df,
    feature='Post Grad Specialization',
    goalfeature='Placement Status',
    positive_goal_value='Placed',
)

Those with 'Mkt&Fin' were more likely to be placed. This is also true for 'Mkt&HR' but far less so.

### MBA %

In [None]:
# Distribution of MBA percentage per placement outcome.
# x/y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them from 0.12 onwards.
sns.boxplot(x='MBA %', y='Placement Status', data=df)

A lot of overlap for this feature as well.

## Data Cleaning

Before converting the object-typed features to numeric values and creating dummy variables, I will drop both school board features because they don't seem to correlate with placement.

In [None]:
# Remove both school-board columns before encoding the remaining features.
df = df.drop(columns=['Secondary Board', 'Higher Secondary Board'])

Use map function to assign numerical values to features:

In [None]:
# Encode the two-valued columns as 0/1 integers.
# Series.map yields NaN for any value that is not a key of the mapping, so an
# unexpected label would surface as a missing value rather than an error.
df['Gender'] = df['Gender'].map({'M':0,'F':1})
df['Work Exp'] = df['Work Exp'].map({'No':0,'Yes':1})
# The target: 1 means the student was placed.
df['Placement Status'] = df['Placement Status'].map({'Not Placed':0,'Placed':1})

Get dummies for the other features that have more than 2 unique values:

In [None]:
# One-hot encode the remaining non-numeric columns; each becomes a set of
# '<column>_<value>' indicator columns, while numeric columns pass through.
df = pd.get_dummies(df)

## Outliers

In [None]:
# Horizontal box plot of salaries to spot outliers.
# The series is passed explicitly as x: a lone positional argument is read as
# x (with a warning) by seaborn 0.11 but as `data` by seaborn 0.12+.
sns.boxplot(x=df['Salary'])

Remove any data with 'Salary' above 400000.

In [None]:
# Drop rows whose salary exceeds 400000. Rows with a missing salary compare
# False against the threshold and are therefore kept.
df = df.drop(index=df.index[df['Salary'] > 400000])

With these outliers removed, let's drop 'Salary': it is only recorded for students who were placed, so it can't be used to predict 'Placement Status'.

In [None]:
# Salary is not a usable predictor, so remove the column.
df = df.drop(columns='Salary')

## Correlation Plots

In [None]:
# Pairwise correlation of every remaining column, drawn on a wide figure
# with the coefficient printed in each cell.
plt.figure(figsize=(14,8))
sns.heatmap(df.corr(),annot=True)

Remove more features that show signs of multicollinearity (>0.7):

- Higher Secondary Specialization (remove science)
- Under Grad Field (remove Sci&Tech)
- Post Grad Specialization (remove Mkt&HR)

In [None]:
# Drop one indicator column from each one-hot encoded feature.
df = df.drop(columns=[
    'Higher Secondary Specialization_Science',
    'Under Grad Field_Sci&Tech',
    'Post Grad Specialization_Mkt&HR',
])

In [None]:
# Re-draw the annotated correlation matrix for the columns that are left.
plt.figure(figsize=(14,8))
sns.heatmap(df.corr(),annot=True)

Drop more features that have low correlation with 'Placement Status':

In [None]:
# Remove the columns judged to have low correlation with 'Placement Status'.
df = df.drop(columns=[
    'Gender',
    'Under Grad Field_Comm&Mgmt',
    'Under Grad Field_Others',
    'MBA %',
    'Higher Secondary Specialization_Arts',
    'Higher Secondary Specialization_Commerce',
])

In [None]:
# Annotated correlation matrix of the final feature set and the target.
plt.figure(figsize=(14,8))
sns.heatmap(df.corr(),annot=True)

## Predict Placement Status

In [None]:
# Target vector: the encoded placement outcome.
y = df['Placement Status']
# Feature matrix: every column except the target.
X = df.drop(columns='Placement Status')

### Scale the Data

Use scikit-learn's MinMaxScaler to scale the feature data.

In [None]:
# Learn each feature's min/max from every row of X, then rescale X with them.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

### Train/Test Split

Split the data into training and test sets. Use 30% of the data as the test set.

In [None]:
# Split the raw features first so the held-out rows cannot influence
# preprocessing. random_state keeps the 70/30 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only and reuse those min/max values for
# the test rows. Fitting on all of X would leak test-set statistics into
# training and make the evaluation optimistic.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Logistic Regression Model

Train a logistic regression model to predict 'Placement Status' of the test set. 

In [None]:
# fit() returns the estimator itself, so construction and training chain.
logmodel = LogisticRegression().fit(X_train, y_train)
# Predicted placement outcome for each held-out row.
predictions = logmodel.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(y_true=y_test, y_pred=predictions))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))