In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from datetime import datetime #To Know the Running Time
from sklearn.model_selection import train_test_split # Data Splitting
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler # Data Standadization
from sklearn.preprocessing import MinMaxScaler #Min-Max Data Normalization
from imblearn.over_sampling import ADASYN # Oversampling Data with ADASYN
from sklearn.linear_model import LogisticRegression # Logistic regression algorithm

In [None]:
# Load the placement dataset CSV into a DataFrame
DATA_PATH = '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw frame
df.head(n=5)

In [None]:
# Drop the serial-number and salary columns; the result is the working frame df1
df1 = df.drop(columns=['sl_no', 'salary'])

In [None]:
# Count how often each value occurs in every categorical (object-dtype) column
cat_cols = df1.select_dtypes(include=object).columns.tolist()
(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts()
    # Name the count column explicitly. The Series returned by value_counts()
    # is unnamed on older pandas (column 0 after wrapping in a DataFrame) but
    # named 'count' on pandas >= 2.0, so renaming column 0 is not portable.
    .to_frame(name='counts')
    # 'column' is an index level, 'counts' is the data column
    .sort_values(by=['column', 'counts'])
)

In [None]:
# Convert categorical data to integer codes.
# The codes are looked up from the raw frame `df` (df1 shares its index), so
# re-running this cell always produces the same df1.
CATEGORY_CODES = {
    'degree_t': {'Others': 1, 'Sci&Tech': 2, 'Comm&Mgmt': 3},
    'gender': {'F': 1, 'M': 2},
    'hsc_b': {'Central': 1, 'Others': 2},
    # Each stream gets its own code; 'Commerce' previously shared code 2 with
    # 'Science', which merged two distinct categories.
    'hsc_s': {'Arts': 1, 'Science': 2, 'Commerce': 3},
    'specialisation': {'Mkt&HR': 1, 'Mkt&Fin': 2},
    'ssc_b': {'Others': 1, 'Central': 2},
    'workex': {'Yes': 1, 'No': 2},
    'status': {'Not Placed': 0, 'Placed': 1},
}
for col, codes in CATEGORY_CODES.items():
    df1[col] = df[col].map(codes)

# .map() yields NaN for any value missing from the table; fail early instead of
# passing NaN on to the model.
unmapped = df1[list(CATEGORY_CODES)].isna().sum()
assert not unmapped.any(), 'Unmapped category values in: {}'.format(unmapped[unmapped > 0].index.tolist())

In [None]:
# Preview the first five rows of the working frame
df1.head(n=5)

In [None]:
# Print the dtype of each column in df1
column_dtypes = df1.dtypes
print(column_dtypes)

**Building the Classification Model**

In [None]:
#Data Split: hold out 20% of the rows for testing.
#A fixed random_state makes the split (and every metric computed from it)
#reproducible across runs; it matches the seed already used for ADASYN below.
x = df1.drop('status', axis = 1).values
y = df1['status'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 27)

#Oversampling Data (training portion only; the test set is left untouched)
ada = ADASYN(sampling_strategy='auto', random_state=27)
x_train, y_train = ada.fit_resample(x_train, y_train)

#Data Standardization (zero mean, unit variance), fitted on the training data only.
#A Min-Max step before this one is unnecessary: standardizing a feature gives
#the same result whether or not it was first rescaled to [0, 1], because both
#are per-feature affine transforms.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Time the fit-and-predict cycle of the logistic regression classifier
start = datetime.now()

#Logistic Regression: fit on the training data, then predict the test set
lr = LogisticRegression()
lr_yhat = lr.fit(x_train, y_train).predict(x_test)

end = datetime.now()
time_taken = end - start
print('Time: ',time_taken)

In [None]:
import itertools # advanced tools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

#Confusion Matrix
#Defining the plot function
def plot_confusion_matrix(cm, classes, title, normalize = False, cmap = plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D matrix of counts; indexed as cm[i, j] with i on the y axis
        ('True label') and j on the x axis ('Predicted label').
    classes : sequence of str
        Tick labels, used for both axes.
    title : str
        Model name; the plot title becomes 'Confusion Matrix of <title>'.
    normalize : bool, default False
        If True, every row is divided by its row sum and cells are printed
        with two decimals. Rows whose sum is zero are shown as zeros.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to plt.imshow.

    Returns
    -------
    None
        The function only draws; call plt.show() / plt.savefig() afterwards.
    """
    title = 'Confusion Matrix of {}'.format(title)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard against a class with no true samples: a plain division would
        # produce NaN cells and a NaN colour threshold.
        cm = np.divide(cm.astype(float), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels as well
    plt.tight_layout()

# Figure size for the confusion-matrix plot
plt.rcParams.update({'figure.figsize': (6, 6)})

# Confusion matrix of the logistic regression predictions on the test set
lr_matrix = confusion_matrix(y_test, lr_yhat, labels = [0, 1])

# Draw it, save the figure to disk, then display it
class_labels = ['Negative(0)','Positive(1)']
lr_cm_plot = plot_confusion_matrix(lr_matrix, class_labels, 'Logistic Regression', normalize = False)
plt.savefig('lr_cm_plot.png')
plt.show()

In [None]:
# get importance: one fitted logistic-regression coefficient per input feature
importance = lr.coef_[0]

# Take the feature names from the frame the model matrix was built from, so the
# labels cannot drift out of order or out of sync with the columns.
feature_names = df1.drop('status', axis = 1).columns.tolist()
assert len(feature_names) == len(importance), 'feature names and coefficients differ in length'

feature_importance = pd.DataFrame({"feature": feature_names, "importance": importance})
feature_importance = feature_importance.sort_values(by = ["importance"], ascending=True)

# (redundant here: LogisticRegression is already imported in the first cell)
from sklearn.linear_model import LogisticRegression
ax = feature_importance.plot.barh(x='feature', y='importance')
ax.set(title='Logistic Regression Coefficients', xlabel='Coefficient')
plt.show()

**Placement Rate by Specialisation**

In [None]:
#Drop Unnecessary Column
df2 = df.drop(['sl_no','salary'], axis=1)
#Drop the Unemployed
df2 = df2[df2.status != 'Not Placed']
#Placement rate per specialisation = placed students / all students (a fraction in [0, 1])
mkthr = round((df2.specialisation == 'Mkt&HR').sum()/(df.specialisation == 'Mkt&HR').sum(), 2)
mktfin = round((df2.specialisation == 'Mkt&Fin').sum()/(df.specialisation == 'Mkt&Fin').sum(), 2)
#Print the rates as percentages, as the header announces (e.g. 0.56 -> 56%)
print('Percentage Placed of Each Degree')
print('Marketing & HR Specialization - Employment Rate {:.0%}'.format(mkthr))
print('Marketing & Financial - Employment Rate {:.0%}'.format(mktfin))
#Bar Chart
x = ['Marketing & HR', 'Marketing & Fin']
energy = [mkthr, mktfin]
x_pos = [i for i, _ in enumerate(x)]
plt.bar(x_pos, energy)
plt.xticks(x_pos, x)
plt.ylabel('Employment rate (fraction placed)')
plt.title('Employment Rate by Specialisation')
plt.show()