In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

Load the data

In [None]:
# Location of the placement CSV, relative to the notebook's working directory.
DATA_PATH = '../input/engineering-placements-prediction/collegePlace.csv'
df_student = pd.read_csv(DATA_PATH)

EDA and checking the quality of data

In [None]:
# Preview the first five rows of the raw table.
df_student.head(n=5)

In [None]:
# Number of rows (students) in the table.
df_student.shape[0]

In [None]:
# Distinct values of the 'Stream' column.
set(df_student['Stream'].tolist())

In [None]:
# True if any cell in the table is missing; expected False (no nans in the data).
df_student.isna().to_numpy().any()

In [None]:
# Share of each gender among the students, drawn as a pie chart with percentages.
values = df_student['Gender'].value_counts()
categories = values.index
plt.figure(figsize=(8, 8))
plt.pie(values, labels=categories, autopct='%1.1f%%')
# Title typo fixed ("Engernieering" -> "Engineering") and the trailing space removed.
# NOTE(review): the "more than 4 times" claim is hard-coded; re-check it if the data changes.
plt.title('There are more than 4 times more Males than Females applying for Engineering courses')
plt.show()

In [None]:
# Count the students in each stream (ascending by count) and draw the counts as bars.
df_barplot = (
    df_student['Stream']
    .value_counts()
    .sort_values()
    .rename_axis('Stream')
    .reset_index(name='number of students')
)

fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df_barplot, x='Stream', y='number of students', ax=ax)
ax.set_title('CS and IT streams dominate the selection processes', fontsize=16)
# Slant the stream names so the long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Stream', fontsize=14)
ax.set_ylabel('number of students', fontsize=14)
plt.show()

In [None]:
# Distinct values of the 'Hostel' column, in order of first appearance.
pd.unique(df_student['Hostel'])

In [None]:
# Distinct values of the 'CGPA' column, in order of first appearance.
pd.unique(df_student['CGPA'])

In [None]:
# Number of students per CGPA value, ordered by CGPA, with a fresh 0..n-1 row index.
df_barplot = (
    df_student['CGPA']
    .value_counts()
    .sort_index()
    .rename_axis('CGPA')
    .reset_index(name='number of students')
)

fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=df_barplot, x='CGPA', y='number of students', ax=ax)

# Write each count 5 units above its bar; the row position is used as the x coordinate.
for position, count in enumerate(df_barplot['number of students']):
    ax.text(position, count + 5, count, color='black', ha="center")

ax.set_title('Distribution of CGPA seems to be close to normal', fontsize=16)
ax.set_xlabel('CGPA', fontsize=14)
ax.set_ylabel('number of students', fontsize=14)
plt.show()

In [None]:
# Class balance of the target: how many students were placed (1) vs. not placed (0).
df_pieplot = (
    df_student['PlacedOrNot']
    .value_counts()
    .rename_axis('Placed or not')
    .reset_index(name='number of students')
)
# Assign the relabelled column back instead of calling replace(inplace=True) on a
# column selection: the in-place form acts on a temporary object, triggers a
# chained-assignment warning and leaves df_pieplot unchanged under pandas Copy-on-Write.
df_pieplot['Placed or not'] = df_pieplot['Placed or not'].replace({1: 'Placed', 0: 'Not placed'})

plt.figure(figsize=(8, 8))
plt.pie(
    df_pieplot['number of students'],
    labels=df_pieplot['Placed or not'],
    autopct='%1.1f%%',
    textprops={'fontsize': 14},
)
# The former plt.xticks(...) call was dropped: a pie chart has no x tick labels to rotate.
plt.title('The dataset is a little bit unbalanced', fontsize=16)
plt.show()

## Let's see how different features correlate with the target value

In [None]:

# Pairwise correlation between the numeric columns, target included.
# 'Gender' and 'Stream' still hold strings at this point, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so restrict the frame to numeric dtypes first.
numeric_columns = df_student.select_dtypes(include='number')

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_columns.corr(), cmap='Greens', annot=True, annot_kws={'size': 14})
# The 'PREDICTED' / 'ACTUAL' axis labels were removed: both axes of a correlation
# matrix list the same features, and the heatmap already shows the column names as ticks.
plt.title('TARGET VALUE IS CORRELATED WITH CGPA AND N OF INTERNSHIPS', fontsize=16)
plt.show()


## Prepare dataset for model  
 I will use One-Hot encoding. You can read about it here:  
 https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/

In [None]:
# One-hot encode 'Gender', then drop the original column and the 'Male' indicator
# so that only the remaining indicator column is kept.
gender_dummies = pd.get_dummies(df_student['Gender'])
df_student = pd.concat([df_student, gender_dummies], axis=1).drop(columns=['Gender', 'Male'])

In [None]:
# One-hot encode 'Stream' (one indicator column per stream) and drop the original column.
stream_dummies = pd.get_dummies(df_student['Stream'])
df_student = pd.concat([df_student, stream_dummies], axis=1).drop(columns='Stream')

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

## Standardize data  
  
Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).  
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html


In [None]:
# Separate the target column from the feature columns.
y = df_student['PlacedOrNot']
X = df_student.drop('PlacedOrNot', axis=1)

# Fit the scaler on the training rows only, so the validation rows do not leak into
# the mean/variance estimates. train_test_split chooses rows from the number of samples
# and random_state alone, so splitting the row positions here with the same
# test_size/random_state as the validation split selects exactly the same training rows.
# NOTE: keep these two values in sync with the train_test_split call that follows.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=21)

scaler = preprocessing.StandardScaler().fit(X.iloc[train_rows])
# Every row is transformed with the training statistics; X becomes a NumPy array.
X = scaler.transform(X)


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)


Start with logistic regression as a baseline

In [None]:
# Baseline model: logistic regression with default settings, fitted on the training
# split and scored on the held-out split.
lg = LogisticRegression().fit(X_train, y_train)
lg.score(X_test, y_test)

## XGBoost Guide  
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
# Gradient-boosted tree classifier for the binary placed / not-placed target.
# Changes from the previous version of this cell:
#   * eval_metric is set on the estimator; passing it to fit() is deprecated and
#     rejected by recent xgboost releases.
#   * random_state replaces the deprecated `seed` alias (same value, same seeding).
#   * use_label_encoder was dropped; it is deprecated and ignored by recent releases.
xgb1 = xgb.XGBClassifier(
    learning_rate=0.3,
    n_estimators=190,
    max_depth=6,
    objective='binary:logistic',
    # NOTE(review): 0.47 presumably offsets the class imbalance - confirm how it was chosen.
    scale_pos_weight=0.47,
    eval_metric='error',
    random_state=27,
)
xgb1.fit(X_train, y_train.to_numpy())



In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Predict on the held-out split and report accuracy as a percentage.
preds = xgb1.predict(X_test)
accuracy = accuracy_score(y_test, preds)

# Use the '%' format spec instead of round(x, 3) * 100: the multiplication can
# reintroduce float noise (e.g. 56.99999999999999) into the printed value.
print(f'XGBoost has {accuracy:.1%} of accuracy on the validation dataset')




In [None]:
# Row-normalised confusion matrix: each row (actual class) sums to 1.
# confusion_matrix orders rows/columns by the `labels` argument, here 0 then 1, and the
# target encodes 0 as 'Not placed' and 1 as 'Placed'. The names must follow that order;
# the previous ['Placed', 'Not placed'] labelling was swapped.
class_names = ['Not placed', 'Placed']
cm = confusion_matrix(y_test, preds, labels=[0, 1])
cm = cm / cm.sum(axis=1, keepdims=True)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)

In [None]:
# Draw the confusion-matrix frame as an annotated heatmap (rows: actual, columns: predicted).
# NOTE(review): the title states a fixed conclusion; check it against the plotted values.
axis_label_style = {'fontsize': 16, 'labelpad': 10}

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(df_cm, cmap='Greens', annot=True, annot_kws={'size': 14}, ax=ax)
plt.setp(ax.get_xticklabels(), fontsize=14)
plt.setp(ax.get_yticklabels(), fontsize=14)
ax.set_xlabel('PREDICTED', **axis_label_style)
ax.set_ylabel('ACTUAL', **axis_label_style)
ax.set_title('MODEL IS CONFUSED MORE WITH NOT PLACED', fontsize=16)
plt.show()
