# Engineering Placement prediction

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
import time
%matplotlib inline
warnings.filterwarnings('ignore')

## Importing and initialising models

In [None]:
# Metrics used later to score the fitted models.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Estimator classes and tuning utilities.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier

# One default-configured instance per model family.
mlp = MLPClassifier()
bc = BaggingClassifier()
gbc = GradientBoostingClassifier()
ada = AdaBoostClassifier()
# NOTE: this rebinds `xgb`, replacing the xgboost module alias imported above
# with the classifier instance.
xgb = XGBClassifier()
lr = LogisticRegression()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors=1)
svc = SVC()

# Grid search over an RBF-kernel SVM.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000, 2000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# Stacking: random forest + SVM base learners, logistic regression on top.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('svr', SVC(random_state=42)),
]
stc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Display name and variable name of every model, in registration order.
classifier, imported_as = map(list, zip(*[
    ('Multi Layer Perceptron', 'mlp'),
    ('Bagging', 'bc'),
    ('Gradient Boosting', 'gbc'),
    ('Ada Boost', 'ada'),
    ('XG Boost', 'xgb'),
    ('Logistic Regression', 'lr'),
    ('Random Forest', 'rfc'),
    ('k Nearest Neighbours', 'knn'),
    ('Support Vector Machine', 'svc'),
    ('SVM tuning grid', 'grid'),
    ('Stacked (RFR & SVM)', 'stc'),
]))

classifiers = pd.DataFrame({'Classifier': classifier, 'Imported as': imported_as})
print('All Models Imported\nModels stored in dataframe called classifiers')

## Creating class

In [None]:
class Modelling:
    """Fit several classifiers on one train/test split and rank them.

    Typical use: construct, call ``fit()``, then ``results()`` for the ranked
    table or any ``best_*`` accessor for the top-ranked model. Models are
    ranked by test accuracy (descending), ties broken by runtime (ascending).

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_test, Y_test : held-out features and labels used for scoring.
    models : iterable of estimators exposing ``fit`` and ``predict``.
    """

    # Largest neighbour count tried when tuning a k-NN model (k = 1..199).
    KNN_MAX_K = 199

    def __init__(self, X_train, Y_train, X_test, Y_test, models):
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_test = Y_test
        self.models = models
        # Filled in by fit() and results() respectively.
        self.models_output = None
        self.models_output_cleaned = None
        self.best = None

    def _tune_knn(self, model):
        """Set ``n_neighbors`` on *model* to the best-scoring k on the test split.

        The estimator is tuned in place so that the object stored in
        ``self.models`` is the one that ends up fitted and ranked, and so that
        its other hyper-parameters are preserved.

        NOTE(review): k is chosen against the test split, as the original code
        intended, so the reported k-NN accuracy is optimistically biased.
        """
        # k may not exceed the number of training samples.
        max_k = min(self.KNN_MAX_K, len(self.X_train))
        best_k, best_score = None, -1.0
        for k in range(1, max_k + 1):
            model.set_params(n_neighbors=k)
            model.fit(self.X_train, self.Y_train)
            score = accuracy_score(self.Y_test, model.predict(self.X_test))
            # Strict comparison: on ties the smallest k wins.
            if score > best_score:
                best_k, best_score = k, score
        if best_k is not None:
            model.set_params(n_neighbors=best_k)

    def fit(self):
        """Fit every model, recording test accuracy and wall-clock runtime.

        k-NN estimators are tuned first (see ``_tune_knn``); the tuning time is
        included in their runtime. Results are stored in ``self.models_output``.
        """
        model_acc = []
        model_time = []
        for model in self.models:
            start = time.perf_counter()
            # Detect k-NN by type; comparing the estimator to the string 'knn'
            # is never true.
            if isinstance(model, KNeighborsClassifier):
                self._tune_knn(model)
            model.fit(self.X_train, self.Y_train)
            model_acc.append(accuracy_score(self.Y_test, model.predict(self.X_test)))
            model_time.append(time.perf_counter() - start)
            print(model, 'has been fit')
        self.models_output = pd.DataFrame(
            {'Models': list(self.models), 'Accuracy': model_acc, 'Runtime (s)': model_time}
        )
        # Invalidate any ranking derived from a previous fit.
        self.models_output_cleaned = None
        self.best = None

    def results(self):
        """Return the ranked results table and remember the best model.

        The returned frame has the estimator class name in ``Models`` and the
        accuracy as a percentage rounded to three decimals.

        Raises
        ------
        AttributeError
            If ``fit()`` has not been called yet.
        """
        fitted = getattr(self, 'models_output', None)
        if fitted is None:
            raise AttributeError('no fitted models available - call fit() before results()')
        ranked = fitted.sort_values(
            by=['Accuracy', 'Runtime (s)'], ascending=[False, True]
        ).reset_index(drop=True)
        self.best = ranked['Models'].iloc[0]
        # Keep only the class name, i.e. the text before the first "(" of the repr.
        ranked['Models'] = ranked['Models'].astype(str).str.split('(', n=1).str[0]
        # Scale before rounding so values like 56.99999999999999 do not appear.
        ranked['Accuracy'] = (ranked['Accuracy'] * 100).round(3)
        self.models_output_cleaned = ranked
        return ranked

    def _ensure_results(self):
        """Compute the ranking on demand so accessors work without results()."""
        if getattr(self, 'models_output_cleaned', None) is None:
            self.results()

    def best_model(self, type):
        """Return the best model (``type='model'``) or its name (``type='name'``).

        The parameter name shadows the builtin but is kept because callers pass
        it by keyword.

        Raises
        ------
        ValueError
            If *type* is neither ``'model'`` nor ``'name'``.
        """
        self._ensure_results()
        if type == 'model':
            return self.best
        if type == 'name':
            return self.models_output_cleaned['Models'].iloc[0]
        raise ValueError("type must be 'model' or 'name', got %r" % (type,))

    def best_model_accuracy(self):
        """Return the best model's test accuracy as a percentage."""
        self._ensure_results()
        return self.models_output_cleaned['Accuracy'].iloc[0]

    def best_model_runtime(self):
        """Return the best model's fit-and-score runtime in seconds (3 decimals)."""
        self._ensure_results()
        return round(self.models_output_cleaned['Runtime (s)'].iloc[0], 3)

    def best_model_predict(self, X_test):
        """Predict labels for *X_test* with the best model."""
        self._ensure_results()
        return self.best.predict(X_test)

    def best_model_clmatrix(self):
        """Return the classification report of the best model on the test split."""
        self._ensure_results()
        return classification_report(self.Y_test, self.best.predict(self.X_test))

    def best_confusion(self):
        """Return the confusion matrix of the best model on the test split."""
        self._ensure_results()
        return confusion_matrix(self.Y_test, self.best.predict(self.X_test))

## Importing data

In [None]:
# Load the placement dataset CSV into a DataFrame.
data = pd.read_csv('../input/engineering-placements-prediction/collegePlace.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

# EDA

In [None]:
# Apply one seaborn look (grid style, scaling context, colour palette) to the plots that follow.
sns.set_style('darkgrid')
sns.set_context('notebook')
sns.set_palette('rainbow')

In [None]:
# Print a per-column summary of the DataFrame.
data.info()

In [None]:
# Count missing values in each column.
data.isnull().sum()

## Number of placed vs not placed

In [None]:
# Bar chart of how many rows fall into each value of the target column.
sns.countplot(data=data,x='PlacedOrNot')

## Age Group of participants

In [None]:
# Histogram of the Age column with a kernel density estimate overlaid.
sns.histplot(data=data,x='Age',kde=True)

## Role of CGPA

In [None]:
# Compare the CGPA distribution between the two target classes.
sns.boxplot(data=data,y='CGPA',x='PlacedOrNot')

In [None]:
# Mean of PlacedOrNot for each distinct CGPA value. The target column is
# selected before aggregating: calling .mean() on the whole frame also averages
# the non-numeric columns (Stream and Gender are still text at this point),
# which raises TypeError on pandas >= 2.0.
placement_by_cgpa = (
    data.groupby('CGPA')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(10, 5))
sns.barplot(data=placement_by_cgpa, x='CGPA', y='Placement Possibility')

## Gender-wise placement possibility

In [None]:
# Mean of PlacedOrNot per gender. The target column is selected before
# aggregating: calling .mean() on the whole frame also averages the non-numeric
# columns (Stream is still text at this point), which raises TypeError on
# pandas >= 2.0.
placement_by_gender = (
    data.groupby('Gender')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(5, 5))
sns.barplot(data=placement_by_gender, x='Gender', y='Placement Possibility')

## Role of Internships

In [None]:
# Mean of PlacedOrNot per number of internships. The target column is selected
# before aggregating: calling .mean() on the whole frame also averages the
# non-numeric columns (Stream and Gender are still text at this point), which
# raises TypeError on pandas >= 2.0.
placement_by_internships = (
    data.groupby('Internships')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(10, 5))
sns.barplot(data=placement_by_internships, x='Internships', y='Placement Possibility')

## Role of Stream

In [None]:
# Restrict to placed students, then show the per-stream counts as a chart and a table.
placed_students = data[data['PlacedOrNot'] == 1]

plt.figure(figsize=(20, 5))
sns.countplot(data=placed_students, x='Stream')

placements_per_stream = (
    placed_students.groupby('Stream')['PlacedOrNot']
    .count()
    .rename('Number of Placements')
    .to_frame()
)
display(placements_per_stream)

In [None]:
# Mean of PlacedOrNot per stream. The target column is selected before
# aggregating: calling .mean() on the whole frame also averages the non-numeric
# columns (Gender is still text at this point), which raises TypeError on
# pandas >= 2.0.
placement_by_stream = (
    data.groupby('Stream')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(20, 5))
sns.barplot(data=placement_by_stream, x='Stream', y='Placement Possibility')

## Role of Backlogs

In [None]:
# Row counts for every (backlog history, placement outcome) combination.
backlog_counts = (
    data.groupby(['HistoryOfBacklogs', 'PlacedOrNot'])['Internships']
    .count()
    .rename('Number of Participants')
    .to_frame()
)
display(backlog_counts)

In [None]:
# Mean of PlacedOrNot by backlog history. The target column is selected before
# aggregating: calling .mean() on the whole frame also averages the non-numeric
# columns (Stream and Gender are still text at this point), which raises
# TypeError on pandas >= 2.0.
placement_by_backlogs = (
    data.groupby('HistoryOfBacklogs')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(5, 5))
sns.barplot(data=placement_by_backlogs, x='HistoryOfBacklogs', y='Placement Possibility')

## Role of Hostels

In [None]:
# Row counts for each value of the Hostel column.
hostel_counts = (
    data.groupby(['Hostel'])['Internships']
    .count()
    .rename('Number of Participants')
    .to_frame()
)
display(hostel_counts)

In [None]:
# Mean of PlacedOrNot by hostel status. The target column is selected before
# aggregating: calling .mean() on the whole frame also averages the non-numeric
# columns (Stream and Gender are still text at this point), which raises
# TypeError on pandas >= 2.0.
placement_by_hostel = (
    data.groupby('Hostel')['PlacedOrNot']
    .mean()
    .rename('Placement Possibility')
    .reset_index()
)
plt.figure(figsize=(5, 5))
sns.barplot(data=placement_by_hostel, x='Hostel', y='Placement Possibility')

# Data PreProcessing

## Label Encoding

In [None]:

from sklearn.preprocessing import LabelEncoder

# Replace the text categories of Stream and Gender with integer codes, keeping
# a before/after lookup table per column so the mapping can be displayed.
label_encoder = LabelEncoder()
dfs = []
for column in ('Stream', 'Gender'):
    categories = data[column].unique()
    mapping = pd.DataFrame({
        'Before Encoding': categories,
        'After Encoding': label_encoder.fit_transform(categories),
    })
    dfs.append([mapping.sort_values(by=['After Encoding']), column])
    data[column] = label_encoder.fit_transform(data[column])

# Show each lookup table under its column name.
for mapping, column in dfs:
    print(column)
    display(mapping)
    print('\n')

In [None]:
# Preview the first rows again, now with Stream and Gender label-encoded.
data.head()

## Feature and Target split

In [None]:
# Target is the PlacedOrNot column; every remaining column is a feature.
Y = data['PlacedOrNot']
X = data.drop(columns='PlacedOrNot')

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified on Y - consider stratify=Y if the classes are imbalanced.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=100)

# Data modelling

In [None]:
# Show the table of available models and the variable each one is bound to.
classifiers

In [None]:
# Estimators to compare. `grid` (SVM grid search) and `stc` (stacked model) are
# initialised earlier in the notebook but are not part of this comparison.
models_to_test = [bc,gbc,ada,xgb,rfc,knn,mlp,svc,lr]

In [None]:
# Fit and score every candidate model on the train/test split.
classification = Modelling(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
    models=models_to_test,
)
classification.fit()

In [None]:
# Show the models ranked by accuracy (then runtime); this call also records the best model.
classification.results()

In [None]:
# Gather the headline figures for the top-ranked model, then print them.
best_name = classification.best_model(type='name')
best_accuracy = classification.best_model_accuracy()
best_runtime = classification.best_model_runtime()
best_report = classification.best_model_clmatrix()

print('BestModel is:', best_name)
print('Accuracy of model:', best_accuracy)
print('Training Runtime in seconds', best_runtime)
print('Classification Matrix:\n')
print(best_report)

In [None]:
# Confusion matrix of the best model, drawn as a heatmap whose cells are
# annotated with the outcome name and its share of all test predictions.
cf_matrix = classification.best_confusion()

group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

cell_shares = cf_matrix.flatten() / np.sum(cf_matrix)
group_percentages = [f"{share:.2%}" for share in cell_shares]

labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')