In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from datetime import date

from warnings import filterwarnings
filterwarnings('ignore')

import os
# Print the full path of every file found under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## 1. Reading Dataset

In [None]:
# Load the placement dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/engineering-placements-prediction/collegePlace.csv')
print(data.shape)  # (rows, columns)
data.head()  # preview the first five rows

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Columns stored with the generic object dtype ('O') are treated as categorical.
categorical = [name for name, kind in data.dtypes.items() if kind == 'O']
print(f'There are {len(categorical)} categorical variables')
print('The categorical variables are :', categorical)

In [None]:
# For each categorical column, print the absolute label counts followed by
# each label's share of all rows.
for var in categorical:
    counts = data[var].value_counts()
    print(counts)
    # `np.float` was removed in NumPy 1.24 (AttributeError). Plain `/` is
    # already true division in Python 3, so no float cast is needed.
    print(counts / len(data))
    print()

## 2. Exploratory Data Analysis

**Pie chart for target**

In [None]:
# Class balance of the target column.
data['PlacedOrNot'].value_counts()

In [None]:
# Pie chart of the target's class shares; the second slice is pulled out slightly.
colors = ('#c2c2f0', '#ffb3e6')
explode = [0, 0.1]
placed_counts = data['PlacedOrNot'].value_counts()
placed_counts.plot.pie(
    explode=explode,
    colors=colors,
    autopct='%.2f',
    shadow=True,
    figsize=(8, 6),
)
plt.title('Ratio of Placed')
plt.show()

**Correlation between numerical features**

In [None]:
plt.figure(figsize=(12,6))
plt.title('Correlation between variables')
# Restrict to numeric/boolean columns before correlating: from pandas 2.0,
# `DataFrame.corr()` no longer drops string columns silently and raises
# instead. Earlier versions dropped them, so the result is unchanged there.
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(),annot=True,square=True,cmap='Reds')

In [None]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns are excluded explicitly because pandas >= 2.0 raises
# on them instead of silently skipping them.
numcorr = data.select_dtypes(include=['number', 'bool']).corr()
Num = numcorr['PlacedOrNot'].sort_values(ascending=False).to_frame()
s = Num.style.background_gradient(cmap='Reds')
s

**Univariate Distribution and Bivariate Distribution**

In [None]:
# Univariate distributions: Age (with a rug of individual observations), then CGPA.
for column, extra in (('Age', {'rug': True}), ('CGPA', {})):
    sns.displot(data[column], **extra)
    plt.show()

In [None]:
# Joint plot of Age against CGPA, colored by placement outcome.
sns.jointplot(x=data['Age'],y=data['CGPA'],hue=data['PlacedOrNot'])

In [None]:
# Summary statistics for the numeric columns (same call as earlier in the notebook).
data.describe()

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns.
data.describe(include='object')

**Visualization for Categorical Variables**

* Hostel contains 2 labels
* HistoryOfBacklogs contains 2 labels
* Gender contains 2 labels
* Stream contains 6 labels

In [None]:
# Hostel counts, split by placement outcome.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Hostel colored by placed')
sns.countplot(x='Hostel', hue='PlacedOrNot', data=data, ax=ax)

In [None]:
# HistoryOfBacklogs counts, split by placement outcome.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of History Of Backlogs colored by placed')
sns.countplot(x='HistoryOfBacklogs', hue='PlacedOrNot', data=data, ax=ax)

In [None]:
# Gender counts, split by placement outcome.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Gender colored by placed')
sns.countplot(x='Gender', hue='PlacedOrNot', data=data, ax=ax)

In [None]:
# Stream counts, split by placement outcome; tick labels are rotated 90 degrees.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Stream colored by placed')
sns.countplot(x='Stream', hue='PlacedOrNot', data=data, ax=ax)
plt.xticks(rotation=90)

## 3. Data Preprocessing

#### Label Encoding

In [None]:
from sklearn import preprocessing 
# Encode the Gender labels as integers, overwriting the original column.
label_encoder = preprocessing.LabelEncoder() 
data['Gender'] = label_encoder.fit_transform(data['Gender'])

#### Dummy Variable Encoding

In [None]:
# List the distinct Stream labels before one-hot encoding them.
print(data['Stream'].unique())

In [None]:
# One-hot encode Stream into one indicator column per label.
# Note: this reassigns `data`, so re-running the cell fails because the
# 'Stream' column no longer exists.
data = pd.get_dummies(data=data,columns=['Stream'])
data.head()

## 4. Building Model

#### Split train and test data

In [None]:
# Separate the label column from the predictors.
target = data['PlacedOrNot']
features = data.drop(columns='PlacedOrNot')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column with StandardScaler.
# A fresh float DataFrame is built instead of assigning into
# `features.loc[:, :]`: writing floats into the existing integer/boolean
# columns relies on implicit upcasting, which recent pandas versions deprecate.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — consider fitting it on the
# training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)
features = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)
features.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

#### Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``.
    X_train, y_train
        Data passed to each estimator's ``fit``.
    X_test, y_test
        Data passed to each estimator's ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model name with a single ``'Score'`` column, sorted by
        score in ascending order.
    """
    # Seed NumPy's global RNG once, before any model is fitted.
    np.random.seed(0)

    results = {}
    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)

    return pd.DataFrame(results, index=['Score']).T.sort_values('Score')

In [None]:
# Baseline candidates, each built with default arguments and keyed by its class name.
baseline_classes = (
    LogisticRegression,
    KNeighborsClassifier,
    SVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    XGBClassifier,
)
models = {cls.__name__: cls() for cls in baseline_classes}

In [None]:
# Fit every baseline model and collect its score on the test split.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

In [None]:
# Display the scores in ascending order (fit_and_score already returns them sorted).
model_scores.sort_values('Score')

From the baseline modeling, I will choose **SVC** to have a closer look

#### Hyperparameter Tuning with Randomized Search CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def randomsearch_cv_scores(models, params, X_train, X_test, y_train, y_test):
    """Run a randomized hyperparameter search for each model.

    Parameters
    ----------
    models : dict
        Maps a name to the estimator to tune.
    params : dict
        Maps the same names to the ``param_distributions`` passed to
        ``RandomizedSearchCV``.
    X_train, y_train
        Data the search is fitted on (5-fold CV, 20 sampled candidates).
    X_test, y_test
        Data passed to the fitted search object's ``score``.

    Returns
    -------
    tuple of (dict, dict)
        Test scores keyed by model name, and the ``best_params_`` found for
        each model keyed by the same names.
    """
    # Seed NumPy's global RNG before any search runs.
    np.random.seed(42)

    test_scores, best_params = {}, {}
    for label, estimator in models.items():
        search = RandomizedSearchCV(
            estimator,
            param_distributions=params[label],
            n_iter=20,
            cv=5,
            n_jobs=-1,
            verbose=2,
        )
        search.fit(X_train, y_train)
        test_scores[label] = search.score(X_test, y_test)
        best_params[label] = search.best_params_

    return test_scores, best_params

In [None]:
# Restrict tuning to SVC only (this rebinds the `models` name used for the baselines).
models = {'SVC': SVC()}

# Candidate hyperparameter values, keyed by model name.
params = {'SVC':{'C': [0.1,0.5,1,10,100,500], 
              'kernel':['linear', 'poly', 'rbf','sigmoid'],
              'gamma':['scale','auto'],
              'degree':[2,3,4]}}  # degree only affects the 'poly' kernel

In [None]:
# Run the randomized search; returns test scores and best parameters per model.
model_rs_scores,model_rs_best_param = randomsearch_cv_scores(models,params,X_train,X_test,y_train,y_test)

In [None]:
# Test-set score of the tuned model(s).
model_rs_scores

In [None]:
# Best hyperparameter combination found by the search.
model_rs_best_param

## 5. Model Evaluation

In [None]:
from sklearn.metrics import classification_report,plot_confusion_matrix,plot_roc_curve
from sklearn.model_selection import cross_val_score

In [None]:
model = SVC(kernel='rbf',gamma='scale',degree=3,C=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
plot_confusion_matrix(model,X_train,y_train,cmap='Purples')

In [None]:
cv_accuracy = cross_val_score(model,X_train,y_train,cv=5,scoring='accuracy')

print(f'Cross Validaion accuracy Scores: {cv_accuracy}')
print(f'Cross Validation accuracy Mean Score: {cv_accuracy.mean()}')

In [None]:
plot_roc_curve(model,X_test,y_test)