In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#from pandas_profiling import ProfileReport
#from autoviz.AutoViz_Class import AutoViz_Class as AVC

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [None]:
# Read the campus placement CSV from the Kaggle input directory and preview the first rows.
raw_data = pd.read_csv(
    '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
)
raw_data.head()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns as well.
raw_data.describe(include='all')

In [None]:
# Number of missing values in each column.
raw_data.isna().sum()

#To generate a quick indepth EDA analysis

profile = ProfileReport(raw_data, title = 'Pandas Profiling Report')
profile

#To get visualizations about the data. output is a variation of different charts eg scatter plots, histograms etc
avc = AVC()
data_viz = avc.AutoViz('C:\\Users\\Fabulous\\Downloads\\Data\\Placement_Data_Full_Class.csv')
data_viz

In [None]:
# Number of rows for each distinct value of the target column `status`.
raw_data['status'].value_counts()

In [None]:
# Replace missing salaries with 0, then confirm that no missing values remain.
raw_data['salary'] = raw_data['salary'].fillna(value=0)
raw_data['salary'].isna().sum()

In [None]:
# Skewness of the salary column (missing salaries were already replaced with 0 above).
raw_data['salary'].skew()

The skewness lies between -0.5 and 0.5, so the distribution of salary is approximately symmetric

In [None]:
# Pass the series explicitly as `x`: the first positional argument of sns.boxplot
# meant `x` in older seaborn releases and `data` in newer ones, so a positional
# call changes orientation (or warns) depending on the installed version.
sns.boxplot(x=raw_data['salary'])

The boxplot shows that a lot of individuals were offered salaries in the range of 0k - 200k, with 0k representing those who were not placed with companies. 

## Exploratory Data Analysis

In [None]:
# Count of students per placement status.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=raw_data, x='status', ax=ax)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Placement Rate', fontsize=18)
plt.show()

A lot more people were placed with companies. The ratio of the placement rate (being placed compared to not being placed) is about 2:1.

In [None]:
# Placement status counts, split by work experience.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=raw_data, x='status', hue='workex', ax=ax)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Placement Rate based on Work Experience', fontsize=18)
plt.show()

Work experience alone does not determine placement: individuals both with and without work experience were placed with companies. However, individuals without work experience had a higher chance of not being placed compared to those with work experience. Generally, a lot of other factors are often considered in addition to work experience, for example cultural fit, performance at the interview stage, etc.

In [None]:
# Placement status counts, split by gender.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=raw_data, x='status', hue='gender', ax=ax)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Placement Rate by Gender', fontsize=18)
plt.show()

Men had a much higher placement rate with companies than women.

In [None]:
# Salary distribution for each gender (salary is 0 where it was originally missing).
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=raw_data, x='gender', y='salary', ax=ax)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('Salary', fontsize=14)
ax.set_title('Salary distribution based on Gender', fontsize=18)
plt.show()

Men were offered higher salaries than women. This is a prevalent issue in our society. Men are generally offered higher salaries than women.

In [None]:
# Placement status counts, split by undergraduate degree field.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=raw_data, x='status', hue='degree_t', ax=ax)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Placement Rate by Field of Degree', fontsize=18)
plt.show()

Individuals with a degree in the field of communications & management had a significantly higher chance of getting placed compared to individuals with a degree in other fields (Science & Technology and Others). They also had a higher salary distribution, as seen in the chart below.
Individuals with a degree in the 'Others' field had the lowest chance of getting placed as well as the lowest salary distribution.

In [None]:
# Salary distribution for each undergraduate degree field.
# The x-axis shows `degree_t`, so it is labelled as the degree field (it was
# previously mislabelled 'Gender'), and the title typo ("by on") is corrected
# to match the wording of the other salary plots.
fig = plt.figure(figsize=(8,6))
sns.violinplot(data=raw_data, x='degree_t', y='salary')
plt.xlabel('Field of Degree', fontsize=14)
plt.ylabel('Salary', fontsize=14)
plt.title('Salary distribution based on Field of Degree', fontsize=18)
plt.show()

In [None]:
# Placement status counts, split by MBA specialisation.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=raw_data, x='status', hue='specialisation', ax=ax)
ax.set_xlabel('Status', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Placement Rate by MBA Specialisation', fontsize=18)
plt.show()

People with an MBA specialisation in Marketing & Finance had a significantly higher placement rate than people with an MBA specialisation in Marketing & HR.
People with an MBA specialisation in Marketing & HR also had a higher chance of not being placed with companies compared to those with an MBA specialisation in Marketing & Finance. If, during this time period, companies had a much higher demand for people with a degree in Marketing & Finance, it could lead to such individuals having a higher placement rate than those with a degree in Marketing & HR. These individuals also had a much higher salary distribution than those with a specialisation in Marketing & HR.

In [None]:
# Salary distribution for each MBA specialisation.
# The x-axis shows `specialisation`, so it is labelled accordingly (it was
# previously mislabelled 'Gender').
fig = plt.figure(figsize=(8,6))
sns.violinplot(data=raw_data, x='specialisation', y='salary')
plt.xlabel('MBA Specialisation', fontsize=14)
plt.ylabel('Salary', fontsize=14)
plt.title('Salary distribution based on MBA specialisation', fontsize=18)
plt.show()

In [None]:
# Work on an independent (deep) copy so the modelling steps leave raw_data untouched.
placement_data = raw_data.copy(deep=True)
placement_data.head()

In [None]:
# Features: everything except the row id, the target, and salary.
# Target: the placement status column.
X = placement_data.drop(columns=['sl_no', 'status', 'salary'])
y = placement_data['status']

In [None]:
# Machine learning models take only numbers, so every string-valued column has to be
# converted to numbers.
#
# ColumnTransformer + OneHotEncoder is used rather than LabelEncoder + OneHotEncoder:
# both give the same result, but this route needs only two lines of code.
#
# OneHotEncoder sorts the categories of each column in ascending order and emits one
# indicator column per category in that order (e.g. for gender, female becomes 1, 0 and
# male becomes 0, 1). In the transformed matrix the encoded columns come first, followed
# by the columns that were not encoded (remainder='passthrough').

# Columns are selected by name instead of by position ([0, 2, 4, 5, 7, 8, 10]): the
# selection is self-documenting, and re-running this cell on the already-encoded array
# fails loudly instead of silently one-hot encoding the wrong columns.
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

# sparse_threshold=0 guarantees a dense result; wrapping a sparse matrix in np.array()
# would otherwise produce a 0-d object array rather than a 2-D feature matrix.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_cols)],
                       remainder='passthrough', sparse_threshold=0)
X = np.array(ct.fit_transform(X))

In [None]:
# Show the first row of the encoded feature matrix.
print(X[:1])

In [None]:
# Fit the label encoder on the target, then replace y with its integer codes.
lab_enc = LabelEncoder().fit(y)
y = lab_enc.transform(y)

In [None]:
# Show the first five encoded target values.
print(y[:5])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Compare several classifiers on the held-out test set.
#
# Each estimator is keyed by the label shown in the results table, so a score can never
# drift out of step with the model that produced it (previously the labels were a
# separate hand-maintained list that had to match the model list by position).
# Estimators with randomised training get a fixed random_state so the table is
# reproducible from one run to the next.
SEED = 42

named_models = {
    'Logistic Reg': LogisticRegression(max_iter=1500),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(kernel='linear'),
    'KSVM': SVC(kernel='rbf'),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(random_state=SEED),
    'LGBM': LGBMClassifier(random_state=SEED),
    'Extra Trees': ExtraTreesClassifier(random_state=SEED),
}

# Kept so that any cell expecting the plain list of estimators still works.
models = list(named_models.values())

# Fit every model on the training data and score its test-set predictions.
scores = {}
for name, estimator in named_models.items():
    y_pred = estimator.fit(X_train, y_train).predict(X_test)
    scores[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

# Same layout as before: one row per metric, one column per model.
class_metrics = pd.DataFrame(scores)

# Display one row per model, best accuracy first.
class_metrics.transpose().sort_values(by='Accuracy', ascending=False)

In addition to accuracy, the F1-score, precision and recall can also be used to evaluate a classification model.
Precision measures the ability of the model not to label negative samples as positive (of everything predicted positive, the fraction that is truly positive).
Recall is the ability of the model to find all the positive samples (of everything truly positive, the fraction the model found).
F1-score is the harmonic mean of precision and recall. The closer to 1 these values are, the better.

The logistic regression and SVM models have the same values across all four metrics. They also have high accuracy, F1, precision and recall scores. I will be using logistic regression on the dataset.

In [None]:
# Build and fit the chosen model; fit() returns the estimator itself, which is then displayed.
log_classifier = LogisticRegression(max_iter=1500).fit(X_train, y_train)
log_classifier

In [None]:
# Predict labels for the held-out test features with the fitted logistic regression.
log_pred = log_classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predictions) followed by overall accuracy.
log_cm = confusion_matrix(y_true=y_test, y_pred=log_pred)
print(log_cm)
accuracy_score(y_true=y_test, y_pred=log_pred)

In [None]:
# Per-class precision, recall and F1 for the logistic regression predictions.
log_report = classification_report(y_true=y_test, y_pred=log_pred)
print(log_report)

## Cross Validation

In [None]:
# 10-fold cross-validated accuracy of the logistic regression on the training set.
accuracies = cross_val_score(log_classifier, X_train, y_train, cv=10)

mean_accuracy_pct = accuracies.mean()*100
print(f"Accuracy: {mean_accuracy_pct:.2f} %")

The cross-validation accuracy is close to the test-set accuracy of the logistic regression model. This suggests that the model is neither overfitting nor underfitting the data.

In [None]:
# Show the hyperparameters the logistic regression was configured with.
log_classifier.get_params()

## Examining Feature Importance

In [None]:
# Coefficients of the fitted logistic regression, one per encoded feature column.
importance = log_classifier.coef_[0]

# Print each coefficient alongside its column index in the encoded feature matrix.
for idx, coef in enumerate(importance):
    print('Feature: %0d, Score:%.5f' % (idx, coef))

# Bar chart of the coefficients, in the same index order as printed above.
plt.bar(range(len(importance)), importance)
plt.show()

The positive scores indicate a feature that predicts class 1 while a negative score indicates a feature that predicts class 0.

In [None]:
# First row of the un-encoded data, for comparison with the encoded row shown next.
placement_data.head(1)

In [None]:
# First row of the encoded feature matrix (one-hot columns first, passthrough columns last).
print(X[:1])

1. The sl_no and salary columns were dropped for the machine learning models and one hot encoding was carried out on categorical columns. The ColumnTransformer places the one-hot encoded columns first (categories of each column in ascending order) and the remaining numeric columns last, so the feature indices map as follows:
2. Features 0 and 1 - the two gender categories (F, M)
3. Features 2 and 3 - the two ssc_b categories (Central, Others)
4. Features 4 and 5 - the two hsc_b categories (Central, Others)
5. Features 6, 7 and 8 - the three hsc_s categories (Arts, Commerce, Science)
6. Features 9, 10 and 11 - the three categories of degree_t (Comm & Mgmt, Others, Sci & Tech)
7. Features 12 and 13 - the two categories of workex (No, Yes)
8. Features 14 and 15 - the two categories of specialisation (Mkt & Fin, Mkt & HR)
9. Feature 16 - ssc_p
10. Feature 17 - hsc_p
11. Feature 18 - degree_p
12. Feature 19 - etest_p
13. Feature 20 - mba_p
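The variables that contribute most heavily to the model are work experience (workex), employability test percentage (etest_p), MBA specialisation (specialisation) and MBA percentage (mba_p). Note that this reading depends on matching each coefficient index to the right column: the ColumnTransformer outputs the one-hot encoded columns first and the untouched numeric columns last, so the index-to-column mapping should be verified against the transformer's output before relying on this conclusion.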