In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

In [None]:
# Load the campus-placement dataset into df1, the frame the later cells work on.
df1 = pd.read_csv(
    '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
)

In [None]:
# Preview the first rows only; displaying the whole frame bloats the notebook output.
df1.head()

In [None]:
# Forward-fill missing values: each gap takes the value from the nearest row above it
# (a gap in the very first row has nothing above it and stays missing).
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated, and
# rebinding df1 avoids in-place mutation.
# NOTE(review): forward-filling copies values between neighbouring rows, which is only
# meaningful if row order carries information — confirm this is the intended imputation.
df1 = df1.ffill()

In [None]:
# Print each column's dtype and non-null count for the filled frame.
df1.info()

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

In [None]:
# Drop the serial number: every value is unique, so it cannot contribute to the model.
# errors='ignore' keeps this cell safe to re-run once the column is already gone, and
# rebinding df1 avoids in-place mutation.
df1 = df1.drop(columns=['sl_no'], errors='ignore')

In [None]:
# Plot the distribution of every non-object (continuous) column.
# Some of these features are not normally distributed, so their skew needs treating;
# scikit-learn's PowerTransformer exists for this purpose.

non_object_cols = [col for col in df1.columns if df1[col].dtype != 'object']
for col in non_object_cols:
    sns.histplot(df1[col], kde=True)
    plt.show()

In [None]:
from sklearn.preprocessing import PowerTransformer

# Power-transform every non-object column except salary, overwriting the column in df1,
# then re-plot it to inspect the reshaped distribution.
for col in df1.columns:
    if col == 'salary' or df1[col].dtype == 'object':
        continue
    pw = PowerTransformer()
    column_2d = df1[col].to_numpy().reshape(-1, 1)  # the transformer expects a 2-D input
    df1[col] = pw.fit_transform(column_2d)
    sns.histplot(df1[col], kde=True)
    plt.show()

In [None]:
# The salary column is highly skewed, so it is discretised into 10 quantile bins
# (ordinal codes) to contain outliers and remove noise.
from sklearn.preprocessing import KBinsDiscretizer

kb = KBinsDiscretizer(strategy="quantile", encode='ordinal', n_bins=10)
df1['salary'] = kb.fit_transform(
    df1['salary'].to_numpy().reshape(-1, 1)  # the discretizer expects a 2-D input
)

In [None]:
# Categorical value encoding.
# First check the frequency distribution of each object-dtype feature.

object_cols = [col for col in df1.columns if df1[col].dtype == 'object']
for col in object_cols:
    print(col)
    print(df1[col].value_counts())

Trying different categorical encoding techniques: frequency encoding, one-hot encoding and mean encoding.

In [None]:
####1)trying out frequency encoding for categorical feature transformation

#for i in df1.columns:
    #if df1[i].dtype =='object' and i !='status':
    #    vl_1=df1.groupby(i).size()/len(df1)
    #    df1[i]=df1[i].map(vl_1)

In [None]:
#####2)trying out one hot encoding for categorical feature transformation

#df1=pd.get_dummies(df1, columns=['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation'], drop_first=True, prefix=['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation'])

In [None]:
######3)Trying mean encoding for categorical feature transformation
#for i in df1.columns:
#    if df1[i].dtype =='object' and i !='status':
#        vl_1=df1.groupby(i).status.count()/len(df1)
#        df1[i]=df1[i].map(vl_1)

In [None]:
#df1

In [None]:
def model_fit(df1, str1, enc_type, random_state=42):
    """Encode categorical features, prune collinear columns, then fit and report four classifiers.

    The input frame is copied and never modified. For each of LogisticRegression,
    RandomForestClassifier, XGBClassifier and GaussianNB the function prints the
    fitted model, its classification report on a stratified 20% hold-out set, and
    a table of ROC points (false-positive rate, true-positive rate, threshold).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing the ``status`` target column plus the feature columns.
    str1 : str
        Banner printed before the per-model output.
    enc_type : str
        ``'frequency_encoding'`` maps each category to its relative frequency.
        ``'mean_encoding'`` maps each category to the mean of the label-encoded
        target within that category (for a binary target: the share of rows in
        the class that LabelEncoder codes as 1).
        Any other value falls back to one-hot encoding of the seven known
        categorical columns.
    random_state : int or None, default 42
        Seed for the train/test split and for the random-forest and XGBoost
        models, so repeated runs produce the same report.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the VIF filter removes every feature column.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    df2 = df1.copy(deep=True)

    # Encode the target first. LabelEncoder assigns codes in sorted class order,
    # so lb.classes_[k] is the label for code k. Taking the names from
    # status.unique() instead (first-appearance order) can mislabel the report rows.
    lb = LabelEncoder()
    target = lb.fit_transform(df2.status)
    target_names = lb.classes_.tolist()

    X = df2.drop('status', axis=1)
    categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

    # NOTE(review): as in the original, every encoding below is fitted on all
    # rows before the train/test split, so the hold-out rows influence the
    # encoded values (for mean encoding this includes their targets).
    if enc_type == 'frequency_encoding':
        for col in categorical_cols:
            X[col] = X[col].map(X.groupby(col).size() / len(X))
    elif enc_type == 'mean_encoding':
        encoded_target = pd.Series(target, index=X.index)
        for col in categorical_cols:
            # Mean of the encoded target per category. The previous count/len
            # formula was just frequency encoding under another name.
            X[col] = X[col].map(encoded_target.groupby(X[col]).mean())
    else:
        one_hot_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
        # dtype=float keeps the dummy columns numeric; boolean dummies would turn
        # the feature matrix into an object array and break the VIF regression.
        X = pd.get_dummies(X, columns=one_hot_cols, drop_first=True,
                           prefix=one_hot_cols, dtype=float)

    # Remove multicollinearity: drop, in one pass, every feature whose variance
    # inflation factor exceeds 5.
    X_values = X.to_numpy(dtype=float)
    high_vif_cols = [
        col for idx, col in enumerate(X.columns)
        if variance_inflation_factor(X_values, idx) > 5
    ]
    features = X.drop(columns=high_vif_cols)
    if features.shape[1] == 0:
        raise ValueError(
            'VIF filtering removed every feature column; nothing left to fit on.'
        )

    models = [
        LogisticRegression(),
        RandomForestClassifier(random_state=random_state),
        XGBClassifier(random_state=random_state),
        GaussianNB(),
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=random_state
    )

    print(str1)
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(model)
        print(classification_report(y_test, y_pred, target_names=target_names))
        print('=================================')
        # ROC points for the class encoded as 1, from its predicted probability.
        pred_prob = model.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, pred_prob[:, 1], pos_label=1)
        roc_table = pd.DataFrame({'False_Positive_Rate': fpr,
                                  'True_Positive_Rate': tpr,
                                  'Threshold': thresholds})
        print(roc_table)
        

In [None]:
# Run the model comparison with enc_type='frequency_encoding'.
str1 = 'Creating models with frequency encoding'
model_fit(df1=df1, str1=str1, enc_type='frequency_encoding')

In [None]:
# Run the model comparison with enc_type='mean_encoding'.
str1 = 'Creating models with mean encoding'
model_fit(df1=df1, str1=str1, enc_type='mean_encoding')

In [None]:
# Run the model comparison with enc_type='onehot_encoding'.
str1 = 'Creating models with onehot encoding'
model_fit(df1=df1, str1=str1, enc_type='onehot_encoding')