# Import Libraries and Data

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,f1_score,accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import random

In [None]:
df = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
#Drop sl_no & salary columns
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['sl_no','salary'],errors='ignore')

# Data Visualization

### Label (Placed or Not)

In [None]:
sns.histplot(df['status'])

 ==> Unbalanced Data

## Overall Visualization (quantitative features)

### Pair Plot (With hue = Status)

In [None]:
sns.pairplot(df,hue="status")

 ==> It appears that ssc_p, hsc_p & degree_p have significant impact on response (Status).
 
==> Obviously some features such as ssc_p & hsc_p are highly correlated to others, let's quickly check the value of these correlations.

### Correlation Heatmap

In [None]:
#Encoding label (Placed/Not Placed)
# drop_first=True keeps a single indicator column for the two status values.
dum1= pd.get_dummies(df['status'],drop_first=True)
# Drop any indicator left over from a previous run of this cell before
# concatenating; otherwise re-running would append a duplicate column.
df = pd.concat([df.drop(columns=dum1.columns,errors='ignore'),dum1],axis=1)

In [None]:
plt.figure(figsize=(12,6))
# Correlate numeric/boolean columns only: recent pandas versions raise when
# DataFrame.corr() meets string columns instead of silently skipping them.
sns.heatmap(df.select_dtypes(include=['number','bool']).corr(),annot=True)

### Variance Inflation Factor

In [None]:
#Variance Inflation Factor(For grades)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
N = df[['ssc_p','hsc_p','degree_p','etest_p','mba_p']]
# variance_inflation_factor regresses each column on all the others exactly as
# given; without an intercept column the result is the uncentered VIF, which is
# inflated for variables whose mean is far from zero. Add a constant
# and report the VIF of the real features only (column 0 is the constant).
vif_design = add_constant(N,has_constant='add')
VIF = pd.DataFrame()
VIF['Features']=N.columns
VIF['VIF value']= [variance_inflation_factor(vif_design.values,i) for i in range(1,vif_design.shape[1])]
VIF.plot(kind='bar',x='Features',y='VIF value')

### Pair Plot (With hue=gender)

In [None]:
sns.pairplot(df,hue="gender",diag_kind='hist',diag_kws={'multiple':'dodge'})

 ==> Generally, feature values for females and males follow the same trend, except for MBA grades (mba_p) and ssc_p, where females tend to do better (according to the shape of the distributions and their mean values). Let's zoom in on that gender effect...

## Categorical data

### Gender

In [None]:
#Pie Plot
# One pie per gender showing how its students split across placement status.
for slot, (gender_code, caption) in enumerate([('F','Female'),('M','Male')],start=1):
    plt.subplot(1,2,slot)
    status_counts = df[df['gender']==gender_code].groupby('status').count()['gender']
    status_counts.plot.pie(autopct="%.1f%%")
    plt.title(caption)

 ==> Males have a higher 'Placed' rate than females

In [None]:
#Box Plots (label,all degrees,hue=gender)
grade_columns = ['ssc_p','hsc_p','degree_p','etest_p','mba_p']
plt.figure(figsize=(24,6))
# One panel per grade column: placement status on the x axis, split by gender.
for panel, grade in enumerate(grade_columns,start=1):
    plt.subplot(1,len(grade_columns),panel)
    sns.boxplot(x='status',y=grade,hue='gender',data=df)
    

==> According to the previous plots, placed students have higher grades than others, also females have better grades than males except for etest_p (Employability test percentage)

### Board of education

In [None]:
plt.figure(figsize=(20,6))
# The column is passed as x= (keyword): in recent seaborn releases the first
# positional parameter is `data`, so a positional column name clashes with data=df.
#Secondary education
plt.subplot(1,2,1)
sns.kdeplot(x='ssc_p',hue='ssc_b',data=df)
plt.title('Secondary education percentage')

#Higher secondary education

plt.subplot(1,2,2)
sns.kdeplot(x='hsc_p',hue='hsc_b',data=df)
plt.title('Higher secondary education')


 ==> Distribution of grades varies between central board and others.

### Specialization

In [None]:
df['specialisation'].unique()

In [None]:
# Name the column via x= so that hue='gender' and data=df resolve against the same
# frame; passing a Series positionally conflicts with data= in recent seaborn.
sns.countplot(x='specialisation',hue='gender',data=df)

In [None]:
#Pie Plot
plt.figure(figsize=(14,6))
# (specialisation, gender, panel title) for each cell of the 2x2 grid, in plot order.
pie_panels = [
    ('Mkt&HR','M','Mkt&HR Males'),
    ('Mkt&Fin','M','Mkt&Fin Males'),
    ('Mkt&HR','F','Mkt&HR Females'),
    ('Mkt&Fin','F','Mkt&Fin Females'),
]
for slot, (spec, gender_code, caption) in enumerate(pie_panels,start=1):
    plt.subplot(2,2,slot)
    subgroup = df[(df['specialisation']==spec)&(df['gender']==gender_code)]
    # Count rows per placement status within the subgroup (mba_p is just the counted column).
    subgroup.groupby(['status']).count()['mba_p'].plot.pie(autopct="%.1f%%")
    plt.title(caption)

 ==> Mkt&Fin is a better choice than Mkt&HR for both females and males in terms of employability rate

### Work experience

In [None]:
#Count Plot
sns.countplot(x='workex',hue='status',data=df)

==> It's clear that work experience plays a major role in increasing chances of employment

In [None]:
#3d Plot (Work experience,Etest,Specialization,status)
import plotly.express as px
px.scatter_3d(df,x='etest_p',y='specialisation',z='workex',color='status')

==> This 3d plot gives insights about the positive effect of work experience and employability test percentage on placement status in both specialization MRKT&HR and MRKT&FIN. 

# Preprocessing Data

In [None]:
df.head()

In [None]:
#Missing Data (None)
df.isnull().sum()

In [None]:
df.info()

In [None]:
#Getting categorical data for encoding
# A column is treated as categorical when its value at index label 1 is a Python str.
Cat_Col = [column for column in df.columns if type(df[column].loc[1])== str]
Cat_Col

In [None]:
#Since we already encoded status let's drop it from our list
# Filter by name rather than pop(): pop() removes whatever happens to be last
# and, if this cell is run twice, silently discards a real feature column.
Cat_Col = [column for column in Cat_Col if column != 'status']
Cat_Col

In [None]:
# Build the dummy columns for every categorical feature (first level dropped each
# time) and append them to df in a single concat, in Cat_Col order.
encoded_parts = [pd.get_dummies(df[column],drop_first=True) for column in Cat_Col]
df = pd.concat([df]+encoded_parts,axis=1)

In [None]:
#Replacing dummy variable Others for degree_t by Comm&Mgmt for interpretability manners
# The assignment below renames ALL 23 columns by position, so it depends on the
# exact column order produced by the previous cells.
# NOTE(review): the slot named 'Comm&Mgmt' presumably holds the degree_t 'Others'
# dummy at this point (per the comment above) - confirm against df.columns.
df.columns = ['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p','degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p', 'status','Placed', 'M', 'Others_ssc', 'Others_hsc', 'Commerce', 'Science', 'Comm&Mgmt','Sci&Tech', 'Yes', 'Mkt&HR']
# (x+1)%2 maps 0 -> 1 and 1 -> 0, so the column becomes 1 exactly when both
# summed indicators are 0.
# NOTE(review): not idempotent - re-running this cell applies the transformation
# to the already-transformed column.
df['Comm&Mgmt'] = (df['Comm&Mgmt']+df['Sci&Tech']).apply(lambda x:(x+1)%2)

# Logistic Regression

In [None]:
#Splitting features and labels and dropping categorical Data which is already encoded
X = df.drop(['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation','status','Placed'],axis=1)
y= df['Placed']

In [None]:
#Standard Scaler
scaler = StandardScaler()


Let's use repeated random train/test splits (referred to as "k-folds" below) in order to get stable estimates of accuracy and F1-score. Because we have a limited amount of data, we can afford a large number of repetitions.

In [None]:
#Defining a k-fold function :

def kfold_logistic(X,y,nb_fold,regularization_type='none',regularization_coefficient =1,solver_max_iter=500,display=True,fold_size=0.3):
    """Estimate logistic-regression accuracy and F1 over repeated random splits.

    Despite the name this is not a partitioning k-fold: each of the ``nb_fold``
    rounds draws an independent random train/test split, standardises the
    features using statistics of the training part only, fits a logistic
    regression and scores it on both parts.

    Parameters
    ----------
    X, y : features and binary labels, passed to ``train_test_split``.
    nb_fold : number of random splits to average over (must be >= 1).
    regularization_type : 'none', 'l1' or 'l2'; forwarded as ``penalty``.
    regularization_coefficient : regularization strength; the model is built
        with ``C = 1/regularization_coefficient``, so it must be non-zero.
    solver_max_iter : maximum number of solver iterations.
    display : when truthy, print the averaged test and train metrics.
    fold_size : ``test_size`` of every split.

    Returns
    -------
    list
        ``[mean_test_accuracy, mean_train_accuracy, mean_test_f1score,
        mean_train_f1score]``.

    Raises
    ------
    ValueError
        If ``regularization_type`` is not supported or ``nb_fold`` < 1.
    """
    solver_by_penalty = {'none': 'lbfgs', 'l2': 'lbfgs', 'l1': 'liblinear'}
    if regularization_type not in solver_by_penalty:
        raise ValueError(
            "regularization_type must be one of 'none', 'l1' or 'l2', got %r" % (regularization_type,))
    if nb_fold < 1:
        raise ValueError("nb_fold must be at least 1, got %r" % (nb_fold,))
    solver = solver_by_penalty[regularization_type]

    # random.seed() only seeds Python's own generator; scikit-learn draws its
    # splits from NumPy. A dedicated RandomState makes every call reproducible
    # (and evaluates all configurations on the same sequence of splits) without
    # touching the global NumPy state.
    split_rng = np.random.RandomState(1)
    scaler = StandardScaler()

    test_accuracies = []
    test_f1scores = []
    train_accuracies = []
    train_f1scores = []

    for _ in range(nb_fold):
        #Train/Test Split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=fold_size, random_state=split_rng)

        #Scale Data (statistics come from the training part only, to avoid leakage)
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        #Defining the model
        # NOTE(review): recent scikit-learn releases spell the unpenalised model
        # penalty=None instead of the string 'none' - confirm against the installed version.
        logreg = LogisticRegression(max_iter=solver_max_iter,penalty=regularization_type,solver=solver,C=1/regularization_coefficient)
        logreg.fit(X_train_scaled,y_train)

        #Scores on the held-out part
        predictions = logreg.predict(X_test_scaled)
        test_accuracies.append(accuracy_score(y_test,predictions))
        test_f1scores.append(f1_score(y_test,predictions))

        #Scores on the training part
        predictions_train = logreg.predict(X_train_scaled)
        train_accuracies.append(accuracy_score(y_train,predictions_train))
        train_f1scores.append(f1_score(y_train,predictions_train))

    mean_test_accuracy = sum(test_accuracies)/nb_fold
    mean_test_f1score = sum(test_f1scores)/nb_fold
    mean_train_accuracy = sum(train_accuracies)/nb_fold
    mean_train_f1score = sum(train_f1scores)/nb_fold

    if display:
        print('Test predictions report')
        print('Mean Test Accuracy =',mean_test_accuracy)
        print('Mean Test f1 score =',mean_test_f1score)
        print('\n')

        print('Train predictions report')
        print('Mean Train Accuracy =',mean_train_accuracy)
        print('Mean Train f1 score =',mean_train_f1score)

    #The function returns a list containing mean accuracy and mean f1 score for test and train sets
    return [mean_test_accuracy,mean_train_accuracy,mean_test_f1score,mean_train_f1score]

In [None]:
kfold_logistic(X,y,nb_fold=100,solver_max_iter=500,display=True)

==> Since test accuracy is quite close to train accuracy, let's first try to improve train performance

### Adding interaction features

In [None]:
#Splitting features and labels and dropping categorical Data that is already encoded
X = df.drop(['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation','status','Placed'],axis=1)
y= df['Placed']

In [None]:
X.columns

In [None]:
#Interaction terms
# Snapshot the base feature names first so that the products added below are not
# themselves combined again; one 'a*b' column is created per unordered pair.
base_features = list(X.columns)
for position, left in enumerate(base_features[:-1]):
    for right in base_features[position+1:]:
        X[left+'*'+right] = X[left]*X[right]



In [None]:
X.info()

==> Now we have 105 features

In [None]:
#Scaler 
scaler = StandardScaler()

In [None]:
kfold_logistic(X,y,nb_fold=100)

==> We overfitted the data. Next, we will try some regularization.

### Regularization general function

In [None]:
def regularization(X,y,factor_min,factor_max,factor_nb,nb_fold,search_type = 'grid_search',regularization_type='l1',plot_display=True) :
    """Pick a regularization factor by maximising the mean test accuracy.

    Every candidate factor is scored with ``kfold_logistic`` and the one with
    the highest mean test accuracy is returned.

    Parameters
    ----------
    X, y : features and labels, forwarded to ``kfold_logistic``.
    factor_min, factor_max : minimum/maximum regularization factor to try.
        A ``factor_min`` of 0 is replaced by 1e-8 because the factor is
        inverted (C = 1/factor) when the model is built.
    factor_nb : number of regularization factors to try (must be >= 1).
    nb_fold : number of random splits per candidate.
    search_type : 'grid_search' (evenly spaced factors) or
        'random_logarithmic_search' (factors drawn uniformly in log10 space).
    regularization_type : penalty forwarded to ``kfold_logistic``.
    plot_display : when truthy, plot test (red) and train (blue) accuracy per
        factor, mark the best one and print a short summary.

    Returns
    -------
    list
        ``[best_factor, test_accuracy_at_best, train_accuracy_at_best]``.

    Raises
    ------
    ValueError
        If ``search_type`` is unknown or ``factor_nb`` < 1.
    """
    if search_type not in ('grid_search', 'random_logarithmic_search'):
        raise ValueError(
            "search_type must be 'grid_search' or 'random_logarithmic_search', got %r" % (search_type,))
    if factor_nb < 1:
        raise ValueError("factor_nb must be at least 1, got %r" % (factor_nb,))

    if factor_min==0:
        #Avoid dividing by 0 (1/C in logistic function) and log10(0) in the random search
        factor_min += 10**-8

    #Creating Regularization factor space
    if search_type == 'random_logarithmic_search' :
        # np.random is not affected by random.seed(); use a dedicated seeded
        # generator so the sampled factors are reproducible.
        factor_rng = np.random.RandomState(1)
        log_low, log_high = np.log10(factor_min), np.log10(factor_max)
        LAMBDA = [10**((log_high-log_low)*factor_rng.rand() + log_low) for _ in range(factor_nb)]
    else:
        LAMBDA = list(np.linspace(factor_min,factor_max,factor_nb))

    #Compute accuracy and f1 score for all space
    TEST_acc =[]
    TEST_F1=[]
    TRAIN_ACC=[]
    TRAIN_F1=[]
    for lam in LAMBDA:
        #K-Fold computation for regularization coefficient equal to lam
        results = kfold_logistic(X,y,nb_fold=nb_fold,regularization_type=regularization_type,regularization_coefficient=lam,display=False)
        TEST_acc.append(results[0])
        TEST_F1.append(results[2])
        TRAIN_ACC.append(results[1])
        TRAIN_F1.append(results[3])

    # First factor reaching the maximum mean test accuracy
    ind = TEST_acc.index(max(TEST_acc))

    if plot_display:
        #Plot Test and Train accuracy
        fig = plt.figure(figsize=(15,6))
        ax = fig.add_axes([0,0,1,1])
        ax.plot(LAMBDA,TEST_acc,'ro',label='Test accuracy')
        ax.plot(LAMBDA,TRAIN_ACC,'bo',label='Train accuracy')
        ax.plot(LAMBDA[ind],TEST_acc[ind],marker='*',ms=20,markerfacecolor='yellow',markeredgewidth=3, markeredgecolor='green')
        ax.plot(LAMBDA[ind],TRAIN_ACC[ind],marker='*',ms=20,markerfacecolor='yellow',markeredgewidth=3, markeredgecolor='green')
        ax.set_xlabel('Regularization factor (lambda)')
        ax.set_ylabel('Mean accuracy')
        ax.legend()

        print('Maximum test accuracy reached at lambda =',LAMBDA[ind])
        print('Maximum test accuracy value =',TEST_acc[ind])
        print('Training set accuracy value =',TRAIN_ACC[ind])
    return [LAMBDA[ind],TEST_acc[ind],TRAIN_ACC[ind]]


### L1 Regularization

In [None]:
regularization(X,y,factor_min=0.001,factor_max=2,factor_nb=100,nb_fold=100,search_type='grid_search',regularization_type='l1')

### L2 Regularization

In [None]:
regularization(X,y,factor_min=0.001,factor_max=2,factor_nb=100,nb_fold=100,search_type='grid_search',regularization_type='l2')

==> l1 & l2 regularization helped in improving test performance...

==> Still, the effect of regularization is not that satisfying compared to the initial model (Without feature engineering and regularization). This might be due to the huge feature/observations ratio that we have. In next section we try some forward model selection techniques

### Forward Selection 1

In [None]:
#Add feature by feature using Test accuracy metric
# Greedy forward selection: each round tentatively adds every remaining candidate
# to the current model, scores it with kfold_logistic (L2 penalty, 100 random
# splits) and keeps the best candidate only if it beats the best accuracy so far.
# NOTE(review): features are chosen on the same kind of test estimate that is
# later reported, so the reported accuracy is presumably optimistic.

X_candidates = X.copy()
X_model = pd.DataFrame()
score_table=[]
feature_table=[]
# Best mean test accuracy accepted so far; 0 means the first winner is always accepted
current_accuracy = 0



#Redo the process for 15 times

for n in range(15):

    print('round number :',n)
    # Per-round scores and the matching candidate names (same positions)
    score_table=[]
    feature_table=[]
    #For all features not in model
    for feat in X_candidates.columns :  
        #Add feature to the model
        X_model[feat] = X_candidates[feat]
        #Estimate test accuracy using K-Fold
        results = kfold_logistic(X_model,y,nb_fold=100,regularization_type='l2',display=False)

        #Append result
        # results[0] is the mean test accuracy returned by kfold_logistic
        mean_test_accuracy = results[0]
        score_table.append(mean_test_accuracy)
        feature_table.append(feat)
        #Drop feature to prepare the entry of next one
        X_model.drop(feat,axis=1,inplace=True)
    #Select index of feature that yields maximum accuracy
    index = score_table.index(max(score_table))
    print('Winner is : ',feature_table[index])
    print('Final test...')
    # If best feature improves current test accuracy add it definitely to the model if not discard it and retry, one might think of implementing backward here in order to look for other alternatives
    if score_table[index]>current_accuracy:
        # Accepted: move the feature from the candidate pool into the model
        X_model[feature_table[index]] = X_candidates[feature_table[index]]
        X_candidates.drop(feature_table[index],axis=1,inplace=True)
        current_accuracy = score_table[index]
        print(f'Fighter {feature_table[index]} is IN')
    else : 
        # Rejected: neither the model nor the candidate pool changes, so the next
        # round re-evaluates exactly the same candidates.
        print(f'Fighter {feature_table[index]} disqualified')

In [None]:
#Our model 
X_model.columns

In [None]:
#Let's apply regularization to the model we obtained
regularization(X_model,y,factor_min=0.001,factor_max=5,factor_nb=100,nb_fold=100,search_type='grid_search',regularization_type='l2')

### Forward Selection 2

In this part we build a model by greedily selecting the features that maximize training accuracy. The final model has 15 features. Then, for every p in [1,15], we create a sub-model containing the first p selected features and compare these sub-models, after regularization, using test accuracy.


In [None]:


# Greedy forward selection driven by mean *training* accuracy: every round adds
# the candidate that maximises it, unconditionally, for 15 rounds.
X_candidates = X.copy()
X_model = pd.DataFrame()
# scikit-learn draws from NumPy's generator, which random.seed() does not touch;
# seed NumPy so that repeated runs of this cell select the same features.
np.random.seed(1)
# Selected feature names in order of entry, and the train accuracy reached at each step
models = []
models_train_accuracy=[]

#Redo the process 15 times 

for n in range(15):
    print('round number :',n)
    score_table=[]
    feature_table=[]
    #For all features not in our model
    for feat in X_candidates.columns :
        # Tentatively add the candidate, score the model, then remove it again
        X_model[feat] = X_candidates[feat]
        #Get estimate of training accuracy using K-Fold (It's like we are getting the learning capability of our model)
        results = kfold_logistic(X_model,y,nb_fold=100,display=False)
        # results[1] is the mean training accuracy
        score_table.append(results[1])
        feature_table.append(feat)
        X_model.drop(feat,axis=1,inplace=True)
    #Select index of feature that yields maximum training accuracy
    index = score_table.index(max(score_table))
    #Add feature to our model
    X_model[feature_table[index]] = X_candidates[feature_table[index]]
    X_candidates.drop(feature_table[index],axis=1,inplace=True) 
    models.append(feature_table[index])
    models_train_accuracy.append(score_table[index])
    print('Winner is : ',feature_table[index])
    print('Current model is : ')
    print(models)
    print('Current train accuracy is : ',score_table[index])

    
    


In [None]:
# Best mean test accuracy (CV) and the regularization factor reaching it (LAMBDA)
# for each nested sub-model made of the first p selected features.
CV = []
LAMBDA = []
# Seed NumPy (used by scikit-learn's splitter); random.seed() has no effect on it.
np.random.seed(1)
#For all subsets compute test accuracy and lambda value for regularization using K-Folds.
for p in range(len(models)):
    X_model2 = X_model[models[0:p+1]]
    # Report the sub-model actually being processed (the message used to say "model1" every time)
    print(f'model {p+1} processing...')
    regu = regularization(X_model2,y,factor_min=0,factor_max=5,factor_nb=100,nb_fold=100,
                          search_type='grid_search',regularization_type='l2',plot_display=False)
    # regu = [best factor, test accuracy at best factor, train accuracy at best factor]
    LAMBDA.append(regu[0])
    CV.append(regu[1])
    print(f'for model {p+1} cross validation test is equal to {regu[1]} for a value of lambda equal to {regu[0]}')

# Final models : 

## Forward selection model 1

Forward selection 1 yields the model defined by : ['ssc_p*degree_p', 'Comm&Mgmt*Yes', 'Science*Mkt&HR', 'etest_p*M',
       'etest_p*mba_p', 'ssc_p*Yes', 'Commerce*Yes', 'Science*Comm&Mgmt', 'Comm&Mgmt*Sci&Tech']. It contains only interaction terms :
       
       1/ interaction between ssc_p and degree_p
       2/ interaction between Work experience (Dummy variable) and Comm&Mgmt(Field of degree education)
       3/ Interaction between Science(Specialization in Higher Secondary Education) & Mkt&HR(Post graduation MBA specialization)
       4/ interaction between etest_p and gender
       5/ interaction between etest_p and mba_p
       6/ interaction between ssc_p and work experience
       7/ interaction between Commerce (Specialization in Higher Secondary Education) and work experience
       8/ interaction between Science(Specialization in Higher Secondary Education) and Comm&Mgmt(Field of degree education)
       9/ interaction between Comm&Mgmt and Sci&Tech (Fields of degree education)
 
 L2 Regularization factor value = 1.566343434343434
 
 Let's apply that model for a random train/test split

In [None]:
#Features selected 
X_forward1 = X[['ssc_p*degree_p', 'Comm&Mgmt*Yes', 'Science*Mkt&HR', 'etest_p*M',
       'etest_p*mba_p', 'ssc_p*Yes', 'Commerce*Yes', 'Science*Comm&Mgmt', 'Comm&Mgmt*Sci&Tech']]
#Scaler
scaler = StandardScaler()


In [None]:
#Logistic regression
# C is the inverse of the L2 regularization factor selected above
lr1 = LogisticRegression(max_iter=500,C=1/1.566343434343434)
#Train Test split
# random.seed() does not influence scikit-learn's shuffling; an explicit
# random_state is what makes this split (and the report below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_forward1, y, test_size=0.3, random_state=1)

In [None]:
#Scale Data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Fit the model
lr1.fit(X_train,y_train)
#Predict y_test
predictions = lr1.predict(X_test)
#Print Results
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

 ==> Better accuracy and f1 score...

In [None]:
#Coefficients
# ravel() flattens coef_ whatever the number of selected features, so the table
# stays valid if the feature list above changes (a hard-coded reshape would not).
model1_coefficients = pd.DataFrame({'Feature name' : X_forward1.columns , 'Coefficient value': lr1.coef_.ravel()})
model1_coefficients

==> etest_p * gender has a positive coefficient, in contrast etest_p * mba_p coefficient is negative. This might be due to :
    
    1/ females have the best mba grades but their placement ratio is lower than the males' ratio
    2/ Males have better etest grades and better placement ratio (M refers to Male's dummy variable)
    
    See boxplots of grades per gender and placement status

Forward Selection 2 : 
    The best model according to test estimated accuracy is the one defined with : ['ssc_p*degree_p', 'Comm&Mgmt*Yes', 'mba_p', 'Science*Mkt&HR', 'degree_p*M', 'degree_p*etest_p', 'Science*Sci&Tech']
    
    1/ interaction between ssc_p and degree_p
    2/ interaction between Work experience (Dummy variable) and Comm&Mgmt(Field of degree education)
    3/ mba_p
    4/ interaction between Science(Specialization in Higher Secondary Education) & Mkt&HR(Post graduation MBA specialization)
    5/ interaction between degree_p and gender
    6/ interaction between degree_p and etest_p
    7/ interaction between Science(Specialization in Higher Secondary Education) and Sci&Tech(Field of degree education)
    
   L2 Regularization factor value =  1.364363636363636
   
    Let's apply that model for a random train/test split

In [None]:
#Features selected 
X_forward2 = X[['ssc_p*degree_p', 'Comm&Mgmt*Yes', 'mba_p', 'Science*Mkt&HR', 'degree_p*M', 'degree_p*etest_p', 'Science*Sci&Tech']]
#Scale
scaler = StandardScaler()

In [None]:
#Logistic regression
# C is the inverse of the L2 regularization factor selected above
lr2 = LogisticRegression(max_iter=500,C=1/1.364363636363636)
#Train Test split
# random.seed() does not influence scikit-learn's shuffling; an explicit
# random_state is what makes this split (and the report below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_forward2, y, test_size=0.3, random_state=1)

In [None]:
#Scaler Data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Fit the model
lr2.fit(X_train,y_train)
#Predict y_test
predictions = lr2.predict(X_test)
#Print Results
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

In [None]:
#Coefficients 

# ravel() flattens coef_ whatever the number of selected features, so the table
# stays valid if the feature list above changes (a hard-coded reshape would not).
model2_coefficients = pd.DataFrame({'Feature name' : X_forward2.columns , 'Coefficient value': lr2.coef_.ravel()})
model2_coefficients

In [None]:
#Results might differ due to randomness, I tried to make use of random.seed... Thanks!