# **Importing the dependencies**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# Fix NumPy's global random seed so stochastic steps repeat from run to run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
import time
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

sns.set(style='whitegrid', palette='muted', font_scale=1.1)

import warnings
warnings.filterwarnings('ignore')
# Any results you write to the current directory are saved as output.

# **Loading the dataset**

In [None]:
# Read the placement records indexed by `sl_no`, discard the `salary` column,
# and encode `status` as 1 (Placed) / 0 (Not Placed) — the target used below.
data = (
    pd.read_csv(
        '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv',
        index_col='sl_no',
    )
    .drop(columns='salary')
    .assign(status=lambda frame: frame['status'].map({'Placed': 1, 'Not Placed': 0}).astype(int))
)
data.head()

In [None]:
data.info()

# **EDA : Heat Map**

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises when
# DataFrame.corr() meets non-numeric data, whereas older releases skipped it
# silently. Selecting numeric dtypes first gives the same matrix on both.
plt.figure(figsize=(14,7))
plt.title('Heatmap')
sns.heatmap(data=data.select_dtypes(include='number').corr(), annot=True);

* High correlation between status and (ssc_p, hsc_p, degree_p)
* Low correlation between status and (etest_p, mba_p)

# **EDA : Relation between different marks**

In [None]:
# Pairwise plots (kind='reg') of the five percentage columns, coloured by `status`.
mark_columns = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
sns.pairplot(data, vars=mark_columns, hue='status', kind='reg')

* For the pairs (ssc_p vs. hsc_p), (ssc_p vs. degree_p) and (hsc_p vs. degree_p), it can be established that good performance on all three fronts is associated with a higher chance of getting placed.
* The other two columns (etest_p and mba_p) do not show any such relationship. The marks obtained in them seem to have no influence on placement.

# **Splitting into training and test set**

In [None]:
# Features are every column except `status`; `status` itself is the target.
# drop() returns a new frame, so `data` is left untouched.
X = data.drop(columns='status')
y = data['status']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 16% of the rows as the test set; random_state pins the shuffle.
split = train_test_split(X, y, test_size=0.16, random_state=1)
X_train, X_test, y_train, y_test = split

# Show the shape of each of the four pieces.
tuple(part.shape for part in split)

* Final accuracy would be checked on Test Dataset(X_test, y_test)
* Cross-validation will be used because the dataset is not very big

# **Encoding Categorical Variables**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into one indicator column per category.
cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was later removed. Passing neither and
# densifying the default sparse result with .toarray() works on every version.
ohc = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only; the test split is transformed with the
# categories learned from training (unknown categories are ignored).
encoded_train = ohc.fit_transform(X_train[cols]).toarray()
encoded_test = ohc.transform(X_test[cols]).toarray()

# get_feature_names() was replaced by get_feature_names_out() (added in 1.0,
# old method removed in 1.2); use whichever this installation provides.
if hasattr(ohc, 'get_feature_names_out'):
    encoded_names = ohc.get_feature_names_out(cols)
else:
    encoded_names = ohc.get_feature_names(cols)

# Re-attach the original row index so the concat below aligns row by row.
n_cols_train = pd.DataFrame(encoded_train, index=X_train.index, columns=encoded_names)
n_cols_test = pd.DataFrame(encoded_test, index=X_test.index, columns=encoded_names)

# Replace the raw categorical columns with their indicator columns.
X_train = pd.concat([X_train.drop(columns=cols), n_cols_train], axis=1)
X_test = pd.concat([X_test.drop(columns=cols), n_cols_test], axis=1)

In [None]:
X_train.head()

# **Feature Scaling**

In [None]:
# Per-column mean and standard deviation, taken from the training split only.
mean, std = X_train.mean(), X_train.std()

In [None]:
# Standardise both splits with the training-set statistics (`mean`, `std`),
# so the test split is scaled exactly as the training data was.
X_train = X_train.sub(mean).div(std)
X_test = X_test.sub(mean).div(std)

In [None]:
X_train.head(3)

# **Model Selection**

In [None]:
# Candidate classifiers, all with default hyperparameters. `options` and
# `model_names` are paired by position.
options = [GaussianNB(),
           LogisticRegression(),
           SVC(),
           LinearSVC(),
           DecisionTreeClassifier(),
           RandomForestClassifier(),
           KNeighborsClassifier(),
           SGDClassifier(),
           XGBClassifier()]

model_names = ['Naive Bayes',
               'Logistic Regression',
               'Support Vector Machine',
               'Linear SVC',
               'Decison Tree',
               'Random Forest',
               'KNN',
               'SGD Classifier',
               'XGBoost']

result_columns = ['model', 'score', 'std', 'Time to Train']

# Collect one row per model and build the frame once afterwards. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and starting from an
# empty frame leaves the numeric columns with object dtype.
rows = []
for (opt, name) in zip(options, model_names):
    start = time.perf_counter()  # monotonic clock, suited to measuring intervals
    model = opt
    # cross_val_score fits its own clones of the estimator, so this full-data
    # fit does not feed into the scores; it is kept so that "Time to Train"
    # still covers one full fit plus the 5-fold cross-validation.
    model.fit(X_train, y_train)

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    elapsed = time.perf_counter() - start
    rows.append([name, scores.mean(), scores.std(), elapsed])

# DataFrame storing the scores of all models, best mean accuracy first.
models = pd.DataFrame(rows, columns=result_columns)
models.sort_values(by='score', ascending=False)

1. Random Forest is the best model on the basis of accuracy
2. The standard deviation of the Random Forest classifier's cross-validation scores is also the best (lowest) among all models.

# **Hyperparameter tuning**

**Different hyperparameters used are:**
* n_estimators = number of trees in the forest
* max_features = max number of features considered for splitting a node
* max_depth = max number of levels in each decision tree
* min_samples_split = min number of data points placed in a node before the node is split
* min_samples_leaf = min number of data points allowed in a leaf node
* bootstrap = method for sampling data points (with or without replacement)

In [None]:
# Base estimator for the search; random_state is fixed so each candidate fit is repeatable.
rfc = RandomForestClassifier(random_state=42)

# Search space for the grid search.
# - max_depth uses the Python object None (unlimited depth). The string 'none'
#   is not a valid value, so every fit using it would fail.
# - 'auto' is omitted from max_features: for classifiers it was an alias of
#   'sqrt', and it is no longer accepted by scikit-learn >= 1.3.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 7, None],
}
print(param_grid)

In [None]:
# Exhaustive grid search over `param_grid`, scored with 5-fold cross-validation.
CV_rfc = GridSearchCV(rfc, param_grid, cv=5)
# Fit the grid search on the training split.
CV_rfc.fit(X_train, y_train)

In [None]:
CV_rfc.best_params_

In [None]:
# Final random forest, trained on the full training split.
# max_features='sqrt' replaces 'auto': for classifiers the two are equivalent,
# but 'auto' is rejected by scikit-learn >= 1.3.
# NOTE(review): these hyperparameters are hard-coded rather than read from
# CV_rfc.best_params_, and criterion='entropy' was not part of the searched
# grid — confirm they still match the search output.
rfc1 = RandomForestClassifier(
    random_state=42,
    max_features='sqrt',
    n_estimators=200,
    max_depth=7,
    criterion='entropy',
)
rfc1.fit(X_train, y_train)

# **Predicting result**

In [None]:
# Predict on the held-out test split and report plain accuracy.
pred = rfc1.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print("Accuracy for Random Forest on test data: ", test_accuracy)