In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line, in os.walk's traversal order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Getting the data ready

### Loading DataSet

In [None]:
# Read the campus-placement CSV from the Kaggle input mount into a DataFrame.
placement_csv = "/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
data = pd.read_csv(placement_csv)

In [None]:
data.head()

In [None]:
# Encode the target as integers: 'Placed' -> 1, 'Not Placed' -> 0.
status_codes = {'Placed': 1, 'Not Placed': 0}
# Guard makes the cell idempotent: Series.map returns NaN for values that are
# not keys of the mapping, so re-running an unguarded map on the already
# encoded column would wipe the target.
if not pd.api.types.is_numeric_dtype(data['status']):
    data['status'] = data['status'].map(status_codes)
# Non-null counts of every other column, per target class.
data.groupby("status").count()

### Splitting Between X and y

In [None]:
# Features: every column except `sl_no`, `status` and `salary`.
# NOTE(review): `salary` is presumably excluded because it is only known once a
# student is placed (target leakage) — confirm against the dataset description.
non_feature_cols = ["sl_no", "status", "salary"]
X = data.drop(columns=non_feature_cols)
# Target: the `status` column.
y = data["status"]

In [None]:
X.head(3),y.head(3)

#### Converting Categorical Data to Numerical

In [None]:
# One-hot encode the feature frame with pandas' default get_dummies settings
# (one indicator column per category level; no level is dropped).
X_dummy = pd.get_dummies(X)
# Preview the encoded features.
X_dummy.head()

### Train and Test Data Split

In [None]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG so the shuffle performed by the split is repeatable.
np.random.seed(42)
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummy,
    y,
    test_size=0.2,
)

# Choosing the right machine learning estimator/algorithm/model
and fitting the chosen machine learning model to the data and using it to make a prediction

### Model Fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the global RNG so the forest's randomness is repeatable.
np.random.seed(42)
# Baseline: a random forest with default hyperparameters, fitted on the
# training split (fit() returns the estimator itself).
clf = RandomForestClassifier().fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)

### Using Model to produce predictions

In [None]:
# Predicted labels for every row of the held-out split.
y_pred = clf.predict(X_test)
# Show the first five predictions.
y_pred[:5]

# Evaluation of Model

### Scoring Metrics

#### Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

#### Prediction Probability

In [None]:
clf.predict_proba(X_test[:5])

#### Cross Validation Score

In [None]:
from sklearn.model_selection import cross_val_score
# Re-seed the global RNG so the forests fitted during cross-validation are repeatable.
np.random.seed(42)
# 6-fold cross-validation over the full encoded dataset; yields one score per fold.
cross_val_score(clf,X_dummy,y,cv=6)

#### Single Score vs Cross Validation Score

In [None]:
np.random.seed(42)
# Score from the single train/test split.
clf_single_score = clf.score(X_test, y_test)
# Mean of the six per-fold scores.
fold_scores = cross_val_score(clf, X_dummy, y, cv=6)
clf_crossval_score = fold_scores.mean()
# One-row table putting the two numbers side by side.
pd.DataFrame({"Classification Single Score": [clf_single_score],
              "Cross Validation Score": [clf_crossval_score]})

### Area under Receiver Operating Characteristic Curve (ROC)

In [None]:
from sklearn.metrics import roc_curve
# Per-class probability estimates for each test row; keep column 1
# (the second class, i.e. status == 1 after the encoding above).
y_prob = clf.predict_proba(X_test)
y_positive = y_prob[:, 1]

# roc_curve returns the false positive rates, true positive rates and the
# decision thresholds they were evaluated at, in that order.
roc_points = roc_curve(y_test, y_positive)
fpr, tpr, thresholds = roc_points
fpr

In [None]:
import matplotlib.pyplot as plt
def plot_roc(fpr,tpr):
    """
    Draw a ROC curve together with the diagonal "guessing" reference line.

    fpr: false positive rates (x values).
    tpr: true positive rates (y values), paired element-wise with `fpr`.
    """
    ax = plt.gca()
    diagonal = [0, 1]
    ax.plot(fpr, tpr, color='orange', label='ROC')
    ax.plot(diagonal, diagonal, color='darkblue', linestyle='--', label="Guessing")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristics Curve (ROC)")
    ax.legend()
    plt.show()

plot_roc(fpr, tpr)

#### ROC AUC Score

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,y_positive)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Recompute the test-set predictions so this cell does not depend on an earlier one.
y_pred = clf.predict(X_test)
# Confusion matrix: true labels first, predictions second.
confusion_matrix(y_test,y_pred)

#### Visualizing Confusion Matrix

In [None]:
pd.crosstab(y_test,y_pred,
           rownames=["Actual Label"],
           colnames=["Predicted Label"])

In [None]:
import seaborn as sns
# Enlarge seaborn's font scale for the plots that follow.
sns.set(font_scale=1.5)
# Confusion matrix of the test labels against the predictions.
conf_mat = confusion_matrix(y_test,y_pred)
# Plain heatmap of the matrix; the trailing semicolon hides the Axes repr.
sns.heatmap(conf_mat);

In [None]:
def plot_conf_mat(conf_mat):
    """
    Render a confusion matrix as a small annotated Seaborn heatmap.

    conf_mat: 2-D array-like of values to draw; each cell is labelled
    with its value and no colour bar is shown.
    """
    _, axes = plt.subplots(figsize=(3, 3))
    sns.heatmap(conf_mat, annot=True, cbar=False, ax=axes)
    axes.set_xlabel('Predicted label')
    axes.set_ylabel('True label')

plot_conf_mat(conf_mat)

In [None]:
# `sklearn.metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Plot on the held-out split only. The model was fitted on rows that are part
# of X_dummy, so scoring the full dataset would mix training rows into the
# matrix and overstate performance.
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)

### Classification Report
* Precision - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0.
* Recall - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0.
* F1 score - A combination of precision and recall. A perfect model achieves an F1 score of 1.0.
* Support - The number of samples each metric was calculated on.
* Accuracy - The accuracy of the model in decimal form. Perfect accuracy is equal to 1.0, in other words, getting the prediction right 100% of the time.
* Macro avg - Short for macro average, the average precision, recall and F1 score between classes. Macro avg doesn't take class imbalance into account, so if you do have class imbalances (more examples of one class than another), you should pay attention to this metric.
* Weighted avg - Short for weighted average, the weighted average precision, recall and F1 score between classes. Weighted means each metric is calculated with respect to how many samples there are in each class. This metric will favour the majority class (e.g. it will give a high value when one class outperforms another due to having more samples).

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

### Using Scoring Parameter

In [None]:
np.random.seed(42)
# No `scoring` argument: cross_val_score falls back to the estimator's own
# score method, i.e. mean accuracy for this classifier.
cv_acc = cross_val_score(clf, X_dummy, y, cv=5)
# Average the five per-fold scores and report as a percentage.
mean_cv_acc = cv_acc.mean()
print(f"The Cross Validated Accuracy : {mean_cv_acc*100:.2f}%")

In [None]:
np.random.seed(42)
# Same 5-fold run, with the accuracy metric requested explicitly.
cv_acc = cross_val_score(clf, X_dummy, y, cv=5, scoring="accuracy")
# Average the five per-fold scores and report as a percentage.
mean_cv_acc = cv_acc.mean()
print(f"The Cross Validated Accuracy : {mean_cv_acc*100:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated precision. Stored under its own name: the array
# holds precision scores, not accuracy.
cv_precision = cross_val_score(clf, X_dummy, y, cv=5, scoring="precision")
print(f"The Cross Validated Precision : {np.mean(cv_precision)*100:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated recall. Stored under its own name: the array
# holds recall scores, not accuracy.
cv_recall = cross_val_score(clf, X_dummy, y, cv=5, scoring="recall")
print(f"The Cross Validated Recall : {np.mean(cv_recall)*100:.2f}%")

In [None]:
np.random.seed(42)
# 5-fold cross-validated F1. Stored under its own name: the array
# holds F1 scores, not accuracy.
cv_f1 = cross_val_score(clf, X_dummy, y, cv=5, scoring="f1")
print(f"The Cross Validated F1 score : {np.mean(cv_f1)*100:.2f}%")

### Classification Functions

In [None]:
def classification_metrics(y_test,y_pred):
    """
    Print and return accuracy, precision, recall and F1 for a set of predictions.

    y_test: true labels.
    y_pred: predicted labels, aligned with `y_test`.

    Each metric is printed as a percentage with two decimals. The returned
    dict has the keys "accuracy", "precision", "recall" and "f1", each
    rounded to two decimal places.
    """
    from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
    # (label used when printing, key used in the returned dict, scoring function)
    scorers = (
        ("Accuracy", "accuracy", accuracy_score),
        ("Precision", "precision", precision_score),
        ("Recall", "recall", recall_score),
        ("F1", "f1", f1_score),
    )
    print("Classification Metrics: ")
    metric_dict = {}
    for label, key, scorer in scorers:
        value = scorer(y_test, y_pred)
        print(f"{label}: {value*100 :.2f}%")
        metric_dict[key] = round(value, 2)
    return metric_dict
base_metrics = classification_metrics(y_test,y_pred)

# Improving model predictions through Experimentation (Hyperparameter Tuning)

In [None]:
clf.get_params()

### Hyperparameter tuning using `RandomizedSearchCV`

In [None]:
# Search space for RandomizedSearchCV over the random forest's hyperparameters.
# "auto" was dropped from max_features: for forest classifiers it was only an
# alias of "sqrt" (a duplicate candidate) and scikit-learn 1.3 rejects it
# outright. "log2" keeps max_features a two-option dimension.
grid = {"n_estimators":[10,100,500,1000,1500,2000],
       "max_depth":[None,5,10,20,30],
       "max_features":["sqrt","log2"],
       "min_samples_split":[2,4,6],
       "min_samples_leaf":[1,2,4]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(42)
# Use a dedicated estimator for the search instead of rebinding `clf`:
# overwriting the fitted baseline with an unfitted forest would break any
# earlier cell that is re-run after this one.
rs_base_clf = RandomForestClassifier(n_jobs=1)
rs_clf = RandomizedSearchCV(estimator=rs_base_clf,
                   param_distributions=grid,
                   n_iter=100, # increased to 100 from 20
                   cv=5,
                   verbose=2)
# Samples 100 parameter combinations from `grid`, each scored with 5-fold CV
# on the training split.
rs_clf.fit(X_train,y_train)

In [None]:
rs_clf.best_params_

In [None]:
# Predict the held-out split with the randomized-search result.
rs_y_preds = rs_clf.predict(X_test)
# Print the metrics and keep them for the comparison chart.
rs_metrics = classification_metrics(y_test,rs_y_preds)

### Hyperparameter tuning using `GridSearchCV`

In [None]:
# Narrower search space for the exhaustive GridSearchCV run.
# "auto" was dropped from max_features: for forest classifiers it was only an
# alias of "sqrt" (so the grid fitted every combination twice) and
# scikit-learn 1.3 rejects it outright. "log2" is a genuinely different option.
grid_2 = {'n_estimators':[2000,2500,3000],
         'max_depth':[10],
         'max_features':['sqrt','log2'],
         'min_samples_split':[2,4],
         'min_samples_leaf':[2]}

In [None]:
pd.DataFrame([grid,grid_2],index=['Grid1','Grid2'])

In [None]:
from sklearn.model_selection import GridSearchCV
# Seed the global RNG so the forests fitted during the search are repeatable.
np.random.seed(42)

# Exhaustive search: every combination in grid_2 is scored with 5-fold CV.
# verbose=2 logs each individual fit.
gs_clf = GridSearchCV(estimator=clf,
                     param_grid=grid_2,
                     cv=5,
                     verbose=2)

# Run the search on the training split; the trailing semicolon hides the estimator repr.
gs_clf.fit(X_train,y_train);

In [None]:
gs_clf.best_params_

In [None]:
# Predict the held-out split with the grid-search result.
gs_y_preds = gs_clf.predict(X_test)
# Print the metrics and keep them for the comparison chart.
gs_metrics= classification_metrics(y_test,gs_y_preds)

In [None]:
# One column per model, one row per metric (the keys of each metrics dict).
compare_metrics = pd.DataFrame({"baseline": base_metrics,
                                "random search": rs_metrics,
                                "grid search": gs_metrics})
# Grouped bar chart: one group per metric, one bar per model.
compare_metrics.plot.bar(figsize=(10, 8))

# Saving and Loading Model for later use

In [None]:
import pickle

# Save the fitted grid-search object to disk.
# The context manager closes the file handle (flushing the pickle to disk)
# even if dump() raises; a bare open() would leave it open.
with open("gs_random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)

In [None]:
# Load the saved model back from disk.
# NOTE: unpickling can execute arbitrary code — only load pickle files you
# created yourself or otherwise trust.
# The context manager closes the file handle once the model is loaded.
with open("gs_random_forest_model_1.pkl", "rb") as model_file:
    loaded_pickle_model = pickle.load(model_file)

In [None]:
# Make predictions and evaluate the loaded model
pickle_y_preds = loaded_pickle_model.predict(X_test)
classification_metrics(y_test, pickle_y_preds)