In [None]:
# Notebook purpose. As the only expression in this cell, the string below is
# echoed as the cell's output when the cell is run.
''' This project is to predict whether a Student gets placed in the campus interview or not.'''

# About Data


This data set consists of placement data for students on our campus. It includes secondary and higher secondary school percentages and specializations. It also includes degree specialization, degree type, work experience, and the salary offered to the placed students.

Source: https://www.kaggle.com/benroshan/factors-affecting-campus-placement

### Attribute Information:

sl_no
Serial Number

gender
Gender- Male='M',Female='F'

ssc_p
Secondary Education percentage- 10th Grade

ssc_b
Board of Education- Central/ Others

hsc_p
Higher Secondary Education percentage- 12th Grade

hsc_b
Board of Education- Central/ Others

hsc_s
Specialization in Higher Secondary Education

degree_p
Degree Percentage

degree_t
Under Graduation(Degree type)- Field of degree education

workex
Work Experience

etest_p
Employability test percentage ( conducted by college)

specialisation
Post Graduation(MBA)- Specialization

mba_p
MBA percentage

status
Status of placement- Placed/Not placed

salary
Salary offered by corporate to candidates

# **Import Libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost
import lightgbm
from sklearn.metrics import confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")
 
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

### Importing the df

In [None]:
# Location of the placement CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'

# Read the whole file into the DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

### Run this to Profile data

```python
import pandas_profiling as pp

profile = pp.ProfileReport(df, title="Campus Recruitment Profile", html={"style": {"full_width": True}}, sort=None)
profile
```

In [None]:
# Store every non-numerical feature column with the generic object dtype.
object_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
for col in object_cols:
    df[col] = df[col].astype('object')

# Names of all non-numeric columns, leaving out the target column ('status').
non_numeric = df.select_dtypes(exclude='number')
categorical_columns = non_numeric.drop('status', axis=1).columns

print(categorical_columns)

In [None]:
# First consider only the numerical columns for feature selection.
# Columns are selected by name rather than by position so the selection does
# not silently change if the CSV column order ever does.
# 'salary' is deliberately kept last (index 5): the imputation step that
# follows addresses it by that position.
numeric_feature_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']
X = df[numeric_feature_cols].values

# Target: placement status of each student.
Y = df['status'].values

In [None]:
# Show only the first few rows; dumping the whole matrix adds noise, not insight.
print(X[:5])

In [None]:
# Show only the first few target values instead of the whole array.
print(Y[:10])

In [None]:
# Number of rows (students) in the data set.
df.shape[0]

#### Checking for null values

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# So the salary column contains null values.

### Taking care of missing data in Salary column

In [None]:
from sklearn.impute import SimpleImputer

# Index 5 is the last column of the numeric feature matrix built above.
salary_col = [5]

# Replace missing entries in that column with the mean of its observed values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, salary_col] = imputer.fit_transform(X[:, salary_col])

In [None]:
# Show only the first few rows of the imputed matrix instead of the whole array.
print(X[:5])

### Feature Selection of Numerical Values

In [None]:
plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")

# Same column positions that were used to build X; kept as a DataFrame only so
# the importances can be labelled with the column names.
x = df.iloc[:, [2, 4, 7, 10, 12, 14]]

from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees is a randomised ensemble: fix the seed so the importances (and
# the ranking read off the chart) are the same on every Restart & Run All.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, Y)
print(model.feature_importances_)

# Horizontal bar chart of every numeric feature, ranked by importance.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
importance_ax = feat_importances.nlargest(len(feat_importances)).plot(kind='barh')
importance_ax.set_title("Extra-trees importance of numerical features")
importance_ax.set_xlabel("Feature importance")
plt.show()

In [None]:
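# So we can conclude that 'salary' and 'ssc_p' are the two most relevant numerical features for predicting a student's placement status.
# NOTE: salary is only offered to placed students (it is null otherwise), so its high importance likely reflects target leakage rather than genuine predictive power.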

### Feature Selection of Categorical Data

In [None]:
# Import the function
#from scipy.stats import chi2_contingency
#Testing the relationship
#chi_res = chi2_contingency(pd.crosstab(df['status'], df['gender']))
#print('Chi2 Statistic: {}, p-value: {}'.format(chi_res[0], chi_res[1]))

In [None]:
from scipy.stats import chi2_contingency

# For every categorical feature, run a chi-square test of independence against
# the target and record the decision at the 0.05 level (index 1 of the result
# is the p-value).
chi2_check = [
    'Reject Null Hypothesis'
    if chi2_contingency(pd.crosstab(df['status'], df[col]))[1] < 0.05
    else 'Fail to Reject Null Hypothesis'
    for col in categorical_columns
]

# Two-column summary table: feature name next to its test decision.
res = pd.DataFrame(data=[categorical_columns, chi2_check]).T
res.columns = ['Column', 'Hypothesis']
print(res)

In [None]:
# We use a significance level of 0.05.
# If the p-value is greater than 0.05, we fail to reject the null hypothesis:
# the chi-square test of independence finds no significant evidence of a relationship between the feature and the dependent variable.
# If the p-value is less than 0.05, we reject the null hypothesis:
# the test indicates that there is a relationship between the feature and the dependent variable.

In [None]:
# So we conclude that 'workex' and 'specialisation' are two important features for predicting status.

In [None]:
# Final feature matrix after numerical and categorical feature selection.
# Later cells label-encode positions 1 and 2 and scale positions 0 and 3.
# NOTE(review): the last column is salary, which the data description says is
# only offered to placed students - using it as a feature likely leaks the
# target. Confirm before trusting the accuracies reported below.
X = df.iloc[:, [2, 9, 11, 14]].values

# Fill the missing values of the last column (index 3) with the column mean.
last_col = [3]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, last_col] = imputer.fit_transform(X[:, last_col])

### Encoding Categorical Values

#### Finding the categories

In [None]:
# List the distinct values of the two selected categorical features and of the target.
for col in ('workex', 'specialisation', 'status'):
    print(df[col].unique())

### Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per encoded column, so each keeps its own fitted class mapping.
le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Columns 1 and 2 of X are the categorical features; Y is the target.
X[:, 1] = le1.fit_transform(X[:, 1])
X[:, 2] = le2.fit_transform(X[:, 2])
Y = le3.fit_transform(Y)

In [None]:
# Look at the first sample after label encoding.
first_row = X[0]
print(first_row)

In [None]:
# Show only the first few encoded target values instead of the whole array.
print(Y[:10])

### Splitting the df into training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Only columns 0 and 3 are standardised; columns 1 and 2 hold label-encoded
# categories and are left untouched.
scaled_cols = [0, 3]

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
sc = StandardScaler()
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

### Applying classification models on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, each paired with the label used in the printed report
# and in the comparison chart. Keeping label and estimator side by side stops
# the two lists derived below from drifting out of sync.
# The randomised estimators that were previously unseeded (decision tree, MLP,
# random forest) now get random_state=1, like the other seeded models, so the
# reported accuracies are reproducible from run to run.
named_models = [
    ("CatBoostClassifier", CatBoostClassifier(verbose= False)),
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
    ("Decision Tree", DecisionTreeClassifier(random_state=1)),
    ("Neural Network", MLPClassifier(random_state=1)),
    ("Random Forest", RandomForestClassifier(random_state=1)),
    ("XGBoost", XGBClassifier()),
    ("LGBMClassifier", lightgbm.LGBMClassifier(max_depth=2, random_state=4)),
    ("XGBRFClassifier", xgboost.XGBRFClassifier(max_depth=3, random_state=1)),
    ("GradientBoosting", GradientBoostingClassifier(max_depth=2, random_state=1)),
    ("GaussianNB", GaussianNB()),
    ("KNeighborsClassifier", KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')),
]

# Parallel lists consumed by the training loop and the accuracy table.
names = [label for label, _estimator in named_models]
models = [estimator for _label, estimator in named_models]

# Fit each candidate on the training split, report its confusion matrix and
# accuracy on the test split, and collect the accuracies in model order.
separator = '=' * 50
accuracy = []
for model, name in zip(models, names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print('Confusion matrix of ', name)
    print(confusion_matrix(y_test, y_pred))

    ac = accuracy_score(y_test, y_pred)
    print('Accuracy score is ', ac)
    accuracy.append(ac)

    print(separator)

# Leaderboard: one row per model, best accuracy first, index renumbered from 0.
Accuracy_list = (
    pd.DataFrame(list(zip(names, accuracy)), columns=['Model', 'Accuracy'])
    .sort_values('Accuracy', ascending=False, ignore_index=True)
)

# Bar chart comparing the test accuracy of every model, best first.
plt.rcParams['figure.figsize'] = 20, 6
sns.set_style("darkgrid")
ax = sns.barplot(x = 'Model', y = 'Accuracy', data = Accuracy_list, palette = "rocket", saturation = 1.5)
ax.set_xlabel("Model", fontsize = 20)
ax.set_ylabel("Accuracy", fontsize = 20)
ax.set_title("Accuracy of different Models", fontsize = 20)
plt.xticks(fontsize = 11, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)

# Write each bar's accuracy as a percentage just above the bar.
# Distinct names are used for the bar geometry so this loop does not overwrite
# the notebook-level `x` (the feature DataFrame from the feature-importance cell).
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_left, bar_bottom = bar.get_xy()
    ax.annotate(
        f'{bar_height:.2%}',
        (bar_left + bar_width/2, bar_bottom + bar_height*1.02),
        ha='center',
        fontsize = 'x-large',
    )
plt.show()