# Exploratory Data Analysis, Statistical analysis and Machine Learning Classification on Campus Recruitment
**I've added my comments and inferences under the code snippets**

***Content***:

* Importing the Necessary Libraries
* Exploring continuous and categorical data separately, and also their interaction.
* Anova test
* Correlation
* PCA and its principal components visualization with Variance Explained 
* Using SMOTE to balance target variable
* Training Logistic Regression and SVM models with RandomizedSearchCV and GridSearchCV
* Plotting ROC and Confusion matrix
* Conclusion

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy.stats import loguniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix


sns.set_style("darkgrid")



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the campus-placement CSV from the Kaggle input directory into a DataFrame.
data_student = pd.read_csv('/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [None]:
# Preview the first few rows of the dataset.
data_student.head()

In [None]:
# Print column names, dtypes and non-null counts.
data_student.info()

**Checking target variable distribution**

In [None]:
# Bar chart of the target column, one coloured bar per placement status.
fig = px.histogram(
    data_student,
    x='status',
    color="status",
    barmode='group',
    width=800,
    height=600,
)
fig.show()

**Plotting continuous variables density plots**
> The density plots suggest that all continuous variables are roughly normally distributed.

In [None]:
# Density of each percentage score, split by placement status, one panel per column.
continuous_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
fig, axes = plt.subplots(1, len(continuous_cols), figsize=(30, 5), sharey=True)
fig.suptitle('Continuous Variables distribution')
for ax, col in zip(axes, continuous_cols):
    # common_norm=False normalizes each status group independently, so the
    # smaller class is not scaled down relative to the larger one.
    sns.kdeplot(
        ax=ax,
        data=data_student, x=col, hue="status",
        fill=True, common_norm=False, palette="turbo_r",
        alpha=.5, linewidth=0,
    )

**Plotting categorical variables countplots**

In [None]:
# Count of students per category level, split by placement status, one panel per column.
categorical_cols = ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation"]
fig, axes = plt.subplots(1, len(categorical_cols), figsize=(30, 5), sharey=True)
# This figure shows the categorical columns, so the title says so.
fig.suptitle('Categorical Variables distribution')
for ax, col in zip(axes, categorical_cols):
    sns.countplot(ax=ax, x=col, hue="status", data=data_student)


In [None]:
# Remove the salary column in place, then show the missing-value count per remaining column.
data_student.drop(columns=['salary'], inplace=True)
data_student.isna().sum()

In [None]:
# Keep the five score columns plus the target, then reshape to long format:
# one row per (status, feature name, value).
conti_data = data_student.loc[:, ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'status']]
data = conti_data.melt(
    id_vars="status",
    var_name="features",
    value_name='value',
)

In [None]:
# Swarm plot of every score value, grouped by feature and coloured by status.
plt.figure(figsize=(15, 8))
sns.swarmplot(data=data, x="features", y="value", hue='status')

In [None]:
fig = go.Figure()

# One half-violin per status: (status value, scalegroup, side of the violin, line colour).
half_violins = [
    ('Placed', 'Yes', 'negative', 'blue'),
    ('Not Placed', 'No', 'positive', 'orange'),
]
for status_value, scale_group, side, colour in half_violins:
    subset = data[data['status'] == status_value]
    fig.add_trace(
        go.Violin(
            x=subset['features'],
            y=subset['value'],
            legendgroup=status_value,
            scalegroup=scale_group,
            name=status_value,
            side=side,
            line_color=colour,
        )
    )

fig.update_traces(meanline_visible=True)
fig.update_layout(
    template='ggplot2',
    title_text=" ",
    width=1300,
    height=650,
    plot_bgcolor='blanchedalmond',
    paper_bgcolor='blanchedalmond',
    violingap=0.2,
    violingroupgap=0.3,
    violinmode='overlay',
)
fig.show()

In [None]:
# Numeric colour code per row: 0 for 'Placed', 1 for anything else.
tex = (data_student['status'] != 'Placed').astype(int)

# Label each SPLOM dimension with its column name so the axes are unambiguous
# (the earlier free-text labels did not match the columns they were attached to).
score_columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
fig = go.Figure(data=go.Splom(
                  dimensions=[dict(label=col, values=data_student[col])
                              for col in score_columns],
                  marker=dict(color=tex,
                              size=5,
                              colorscale='Bluered',
                              line=dict(width=0.5,
                                        color='rgb(230,230,230)')),
                  text=list(data_student.status),
                  # hide the diagonal panels (a variable plotted against itself)
                  diagonal=dict(visible=False)))

title = "Scatterplot Matrix (SPLOM) for Placement Dataset"
fig.update_layout(title=title,
                  dragmode='select',
                  width=1200,
                  height=1000,
                  hovermode='closest')

fig.show()

In [None]:
# Remove the sl_no column in place.
data_student.drop(columns='sl_no', inplace=True)

**Continuous and Categorical variables interaction**

In [None]:
# Grid of split violins: one row per score column (each with its own palette),
# one column per categorical variable, halves split by placement status.
categorical_cols = ["gender", "ssc_b", "hsc_b", "hsc_s", "degree_t", "workex", "specialisation"]
score_palettes = [
    ("ssc_p", "YlOrBr"),
    ("hsc_p", "Blues"),
    ("degree_p", "Set2"),
    ("etest_p", "rocket"),
    ("mba_p", "mako"),
]

fig, axes = plt.subplots(len(score_palettes), len(categorical_cols), figsize=(45, 20), sharey=True)

for row_axes, (score_col, palette) in zip(axes, score_palettes):
    for ax, cat_col in zip(row_axes, categorical_cols):
        sns.violinplot(ax=ax, x=cat_col, y=score_col,
                       hue="status", palette=palette,
                       data=data_student, split=True,
                       scale="count", inner="quartile")

In [None]:
# Encode the target as integers: Placed -> 0, Not Placed -> 1.
# Assign the result back to the column instead of calling an inplace method on
# `data_student.status`: that attribute access may return a temporary object
# (pandas Copy-on-Write), in which case the DataFrame would stay unchanged.
# astype(int) keeps the column numeric for the model and correlation cells;
# re-running this cell is harmless because 0/1 match no key in the mapping.
data_student['status'] = (
    data_student['status'].replace({'Placed': 0, 'Not Placed': 1}).astype(int)
)
# Replace any infinite values with 0.
data_student.replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
# Work on an independent copy so later edits to train_data do not touch data_student.
train_data = data_student.copy()

**Test of variance between the 2 populations using Anova**

In [None]:
# Fit an ordinary-least-squares model of the encoded status on all twelve predictors.
model = ols('status ~gender+ssc_p+ssc_b+hsc_p+hsc_b+hsc_s+degree_p+degree_t+etest_p+workex+mba_p+specialisation', data=data_student).fit()
# Build the ANOVA table for the fitted model (typ=2 requests the type 2 table).
table = anova_lm(model, typ=2)

In [None]:
# model = ols('status ~gender+ssc_p+ssc_b+hsc_p+hsc_b+hsc_s+degree_p+degree_t+etest_p+workex+mba_p+specialisation*', data=data_student).fit()
# table = anova_lm(model, typ=1)

In [None]:
# Show the ANOVA rows ordered by ascending 'PR(>F)' (up to 50 rows).
table.sort_values('PR(>F)').head(50)

In [None]:
# Pairwise correlation heatmap of the numeric columns.
f, ax = plt.subplots(figsize=(18, 18))
# data_student still contains string-valued categorical columns here;
# newer pandas raises on them in corr(), so select the numeric columns explicitly.
numeric_corr = data_student.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=2, fmt='.1f', ax=ax, cmap="Dark2_r")

In [None]:
# One-hot encode each categorical column, using a short prefix per source column.
dummy_prefixes = {
    "gender": "gender",
    "ssc_b": "ssp",
    "hsc_b": "hsp",
    "hsc_s": "hss",
    "degree_t": "degree",
    "workex": "wrkex",
    "specialisation": "major",
}
train_data = pd.get_dummies(
    train_data,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
)
# Split off the target: pop() returns the column and removes it from the frame in place.
train_targets = train_data.pop('status')


In [None]:
# Standardize every feature column with StandardScaler.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(train_data)
# NOTE(review): the PCA cells in this notebook call fit_transform(train_data),
# not scaled_data, so this standardized matrix is currently unused — confirm
# which input PCA was meant to receive.
scaled_data.shape


In [None]:
# Fit PCA at several component counts and record, for each count, the
# per-component and the cumulative explained-variance ratios (rounded to 2 dp).
n_comp =[5,8,11,15]
pca_exp = []
variance_exp_cumsum = []
for comp in n_comp:
    pca = PCA(n_components=comp)
    # fitted on train_data (the unscaled dummy-encoded frame)
    data_pca = pca.fit_transform(train_data)
    data_pca = pd.DataFrame(data_pca)
    pca_exp.append({"i": comp,'explained_variance_ratio': pca.explained_variance_ratio_.round(2)})
    variance_exp_cumsum.append({'i': comp,'variance_exp_cumsum':pca.explained_variance_ratio_.cumsum().round(2)})
# After the loop, `pca` and `data_pca` hold the last fit (n_components=15).
# pca = PCA(n_components=2000)

In [None]:
# Cumulative explained-variance ratio of the most recently fitted PCA.
variance_exp_cumsum = pca.explained_variance_ratio_.cumsum().round(2)
fig, axes = plt.subplots(1, 1, figsize=(16, 7), dpi=100)
# Plot against 1..k so the x axis reads as a number of components; plotting the
# array alone would start at 0 and shift every point one component to the left.
component_counts = np.arange(1, len(variance_exp_cumsum) + 1)
axes.plot(component_counts, variance_exp_cumsum, color='firebrick')
axes.set_xticks(component_counts)
axes.set_title('Screeplot of Variance Explained %', fontsize=22)
axes.set_xlabel('# of PCs', fontsize=16)
axes.set_ylabel('Cumulative explained variance ratio', fontsize=16)
plt.show()

**Since the variance explained stays the same after 5 PCs, we choose 5 PCs**

In [None]:
# Refit PCA with the chosen five components and keep the projected data as a DataFrame.
pca = PCA(n_components=5)
data_pca = pd.DataFrame(pca.fit_transform(train_data))

In [None]:
from scipy.spatial import ConvexHull

def encircle(x,y, ax=None, **kw):
    """Draw the convex hull of the points (x, y) as a polygon patch.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points; they are stacked column-wise into
        two-column rows before the hull is computed.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    **kw
        Forwarded to ``plt.Polygon`` (for example ``ec``, ``fc``, ``alpha``,
        ``linewidth``).
    """
    # Compare against None explicitly instead of relying on the object's truthiness.
    if ax is None:
        ax = plt.gca()
    points = np.c_[x, y]
    hull = ConvexHull(points)
    # hull.vertices indexes the input points that lie on the hull boundary.
    outline = plt.Polygon(points[hull.vertices, :], **kw)
    ax.add_patch(outline)

In [None]:
# Boolean masks selecting the rows of each class (0 = Placed, 1 = Not Placed).
rows_0 = train_targets.eq(0)
rows_1 = train_targets.eq(1)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(16, 12))

# Columns 0 and 1 of data_pca are the first two principal components, which is
# what the title and axis labels below describe.
placed_mask = rows_0.tolist()
not_placed_mask = rows_1.tolist()
placed_pc1, placed_pc2 = data_pca.loc[placed_mask, 0], data_pca.loc[placed_mask, 1]
not_placed_pc1, not_placed_pc2 = data_pca.loc[not_placed_mask, 0], data_pca.loc[not_placed_mask, 1]

# Plot
ax.scatter(placed_pc1, placed_pc2, c='blue', edgecolor='k', s=120, label='Placed')
ax.scatter(not_placed_pc1, not_placed_pc2, c='red', edgecolor='k', s=120, label='Not Placed')

# Encircle the boundaries
encircle(placed_pc1, placed_pc2, ax=ax, ec="blue", fc="none", linewidth=2.5)
encircle(not_placed_pc1, not_placed_pc2, ax=ax, ec="firebrick", fc="none", linewidth=2.5)

# Shading
encircle(not_placed_pc1, not_placed_pc2, ax=ax, ec="k", fc="firebrick", alpha=0.05)
encircle(placed_pc1, placed_pc2, ax=ax, ec="k", fc="blue", alpha=0.05)

# Labels
ax.set_title("Placed or Not Placed: Scatterplot of First Two PCA directions", fontsize=22)
ax.set_xlabel("1st Principal Component", fontsize=22)
ax.set_ylabel("2nd Principal Component", fontsize=22)
ax.legend(loc='best', title='Recruitment Status', fontsize=16)
plt.show()

In [None]:
# Attach the labels positionally (ignoring train_targets' own index) as a 'target' column.
data_pca['target'] = train_targets.to_numpy()

In [None]:
# Share of total variance captured by the fitted components, as a percentage.
total_var = 100 * pca.explained_variance_ratio_.sum()

fig = px.scatter_3d(
    data_pca,
    x=0,
    y=1,
    z=2,
    color=data_pca['target'],
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
)

# The three scene axes share every setting except their background colour.
scene_axes = {
    axis_name: dict(
        backgroundcolor=background,
        gridcolor="white",
        showbackground=True,
        zerolinecolor="white",
    )
    for axis_name, background in (
        ('xaxis', "rgb(200, 200, 230)"),
        ('yaxis', "rgb(230, 200,230)"),
        ('zaxis', "rgb(230, 230,200)"),
    )
}
fig.update_layout(
    scene=dict(bgcolor='white', **scene_axes),
    plot_bgcolor='white',
    template='simple_white',
    height=600,
    width=800,
)

fig.show()

In [None]:
# Class frequencies of the full target column.
counter = Counter(data_pca['target'])
print(counter)

In [None]:
# Hold out a third of the rows for testing.
# - Features are selected by dropping 'target' by name rather than by position.
# - stratify keeps the class ratio the same in both splits; the target is
#   imbalanced, so an unstratified split can skew the test-set class mix.
X_train, X_test, y_train, y_test = train_test_split(
    data_pca.drop(columns='target'),
    data_pca.target,
    test_size=0.33,
    random_state=42,
    stratify=data_pca.target,
)

In [None]:
# Class frequencies in each split.
ctrain, ctest = Counter(y_train), Counter(y_test)
print(f"Count of train targets {ctrain}")
print(f"Count of test targets {ctest}")

**Using SMOTE for balancing target variable distribution**

In [None]:
# Over-sample the minority class with SMOTE, then randomly under-sample the
# majority class. Both samplers are seeded so the resampled training set, and
# every score computed from it, is reproducible between runs.
over = SMOTE(sampling_strategy=0.5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
# NOTE(review): both steps target the same 0.5 minority/majority ratio, so the
# under-sampling step presumably removes few or no rows — confirm this is the
# intended amount of balancing.
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset (training split only; the test split is left untouched)
X_train, y_train = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

**Training Logistic model with RandomizedSearchCV and looking at classification report, confusion matrix and ROC curve**

In [None]:
# define model
model = LogisticRegression()
# define evaluation: 10-fold stratified CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
# Only solver/penalty pairs that the solver can actually fit are listed, so no
# search iteration is spent on a combination that fails:
#   - newton-cg and lbfgs handle an l2 penalty or no penalty
#   - liblinear handles l1 or l2
# 'elasticnet' is left out because none of these three solvers supports it.
# NOTE(review): newer scikit-learn releases spell the unpenalised option as
# None instead of 'none' — confirm against the installed version.
c_values = loguniform(1e-5, 100)
space = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['none', 'l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
]
# define search
search = RandomizedSearchCV(model, space, n_iter=500, scoring='f1', n_jobs=-1, cv=cv, random_state=1)
# execute search
result = search.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)


In [None]:
# Predict labels for the held-out test split with the fitted search object.
predicted_categories = result.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, predicted_categories))

In [None]:
# Confusion matrix of the logistic model on the test split.
plt.figure(figsize=(15, 8))
conf_mx = confusion_matrix(y_test, predicted_categories)
# The cells are integer counts, so format them as integers ('d').
ax = sns.heatmap(conf_mx, annot=True, linewidths=.5, fmt='d')
# confusion_matrix puts true labels on the rows and predictions on the columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
# ROC curve of the tuned logistic model on the test split.
# Create the Axes explicitly and pass it in: without an ax the ROC helper opens
# its own default-sized figure and leaves the one sized here empty.
fig, ax = plt.subplots(figsize=(20, 12))
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    # API available in newer scikit-learn releases
    metrics.RocCurveDisplay.from_estimator(result, X_test, y_test, ax=ax)
else:
    # older scikit-learn releases only provide the function form
    metrics.plot_roc_curve(result, X_test, y_test, ax=ax)
plt.show()

**Training SVM model with GridSearchCV and looking at classification report, confusion matrix and ROC curve**

In [None]:
# Candidate hyperparameter values for the SVC grid search
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Exhaustive search scored on recall; refit=True retrains the best setting on all of X_train.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='recall',
    refit=True,
    verbose=3,
)

# run the search on the training data
grid.fit(X_train, y_train)

In [None]:

# Show the winning hyperparameters, then the refit estimator built from them.
print(grid.best_params_, grid.best_estimator_, sep='\n')

In [None]:

# Predict the test split with the tuned SVM.
grid_predictions = grid.predict(X_test)

# Per-class precision, recall and F1 for those predictions.
print(classification_report(y_true=y_test, y_pred=grid_predictions))

In [None]:
# Confusion matrix of the tuned SVM on the test split.
plt.figure(figsize=(15, 8))
conf_mx_svm = confusion_matrix(y_test, grid_predictions)
# The cells are integer counts, so format them as integers ('d').
ax = sns.heatmap(conf_mx_svm, annot=True, linewidths=.5, fmt='d')
# confusion_matrix puts true labels on the rows and predictions on the columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
# ROC curve of the tuned SVM on the test split.
# Create the Axes explicitly and pass it in: without an ax the ROC helper opens
# its own default-sized figure and leaves the one sized here empty.
fig, ax = plt.subplots(figsize=(20, 12))
if hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    # API available in newer scikit-learn releases
    metrics.RocCurveDisplay.from_estimator(grid, X_test, y_test, ax=ax)
else:
    # older scikit-learn releases only provide the function form
    metrics.plot_roc_curve(grid, X_test, y_test, ax=ax)
plt.show()