In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Location of the campus-placement CSV, relative to the notebook.
placement_csv = "../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(placement_csv)
df.head()

In [None]:
# Show column dtypes and non-null counts to spot columns with missing values.
df.info()

In [None]:
# Summary statistics of the salary column (pandas skips missing values by default).
salary_col = df["salary"]
print("Salary Mean: ", salary_col.mean())
print("Salary Median: ", salary_col.median())
print("Salary Standard deviation: ", salary_col.std())

In [None]:
def pipeline(df):
    """Turn the raw placement frame into an all-numeric modelling frame.

    Steps, in order:
      1. One-hot encode ``hsc_s`` and ``degree_t`` (dummy columns are named
         after the category values) and drop the two source columns together
         with the ``sl_no`` row identifier.
      2. Ordinal-encode ``gender``, ``ssc_b``, ``hsc_b``, ``specialisation``,
         ``workex`` and ``status`` as integers.
      3. Replace missing ``salary`` values with the column median.
      4. Standardise the percentage columns ``ssc_p``, ``hsc_p``,
         ``degree_p``, ``etest_p`` and ``mba_p``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing all of the columns named above. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded, imputed and scaled columns.

    Notes
    -----
    NOTE(review): the encoder and scaler are fitted on every row passed in.
    Calling this before the train/test split lets held-out rows influence
    the scaling statistics.
    """
    cat_features = ["gender", "ssc_b", "hsc_b", "specialisation", "workex", "status"]
    continuous_features = ["ssc_p", "hsc_p", "degree_p", "etest_p", "mba_p"]

    # pd.concat builds a new frame, so the caller's frame stays untouched.
    prepared = pd.concat(
        [df, pd.get_dummies(df["hsc_s"]), pd.get_dummies(df["degree_t"])],
        axis=1,
    ).drop(columns=["hsc_s", "degree_t", "sl_no"])

    # Ordinal encoding: OrdinalEncoder learns categories per column, so one
    # fit over all columns matches fitting each column on its own.
    ordinal_encoder = OrdinalEncoder()
    prepared[cat_features] = ordinal_encoder.fit_transform(prepared[cat_features]).astype(int)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection: with pandas Copy-on-Write the in-place call only changes
    # a temporary and the NaNs would remain in the frame.
    prepared["salary"] = prepared["salary"].fillna(prepared["salary"].median())

    # Standard scaling: each column is centred and scaled independently.
    standard_scaler = StandardScaler()
    prepared[continuous_features] = standard_scaler.fit_transform(prepared[continuous_features])

    return prepared

In [None]:
# Build the encoded and scaled modelling frame from the raw data.
df_prepared = pipeline(df)

In [None]:
# Separate the target ("status") from the features before splitting.
# "salary" is excluded from the features as well: its missing values are
# median-imputed, and salary is presumably only recorded for students who were
# placed (TODO confirm against the raw data), so keeping it would leak the
# label into the model.
features = df_prepared.drop(columns=["status", "salary"])
target = df_prepared["status"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

In [None]:
import xgboost

# Candidate hyper-parameter values explored by the search below.
xgb_params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# Base binary classifier; the search overrides the parameters listed above.
xgb = xgboost.XGBClassifier(
    objective='binary:logistic',
    eval_metric="error",
    n_estimators=600,
    learning_rate=0.02,
    nthread=1,
    use_label_encoder=False,
)

In [None]:
# Try 5 randomly sampled parameter combinations, each scored by ROC AUC
# with 5-fold cross-validation on the training split.
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_params,
    n_iter=5,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    verbose=3,
    random_state=0,
)
random_search_xgb.fit(x_train, y_train)

In [None]:
# Evaluate the best XGBoost model from the search on the held-out split.
xgb_best = random_search_xgb.best_estimator_
xgb_proba = xgb_best.predict_proba(x_test)
xgb_pred = xgb_best.predict(x_test)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score), ("F1-score: ", f1_score)):
    print(metric_label, metric_fn(y_test, xgb_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid searched exhaustively (3 * 3 * 3 * 3 = 81 combinations).
rf_params = {
            "n_estimators" : [300, 800, 1200],
            "max_depth" : [5, 8, 15],
            "min_samples_split" : [10, 20, 30],
            "min_samples_leaf" : [1, 2, 5],
            }

# Seed the forest so the grid search picks the same model on every run; the
# split and the randomized search in this notebook already use seed 0.
rf = RandomForestClassifier(random_state=0)

# Score every combination by ROC AUC with 5-fold cross-validation on all cores.
grid_search_rf = GridSearchCV(rf, rf_params, scoring="roc_auc", cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(x_train,y_train)

In [None]:
# Evaluate the best random forest from the grid search on the held-out split.
rf_best = grid_search_rf.best_estimator_
rf_pred = rf_best.predict(x_test)

for metric_label, metric_fn in (("Accuracy: ", accuracy_score), ("F1-score: ", f1_score)):
    print(metric_label, metric_fn(y_test, rf_pred))