# Engineering Placements Prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
df=pd.read_csv(r"../input/engineering-placements-prediction/collegePlace.csv")

## Data Clean

In [None]:
df.head()

In [None]:
df.isna().sum()

* No missing values. Appreciate that!

In [None]:
df.duplicated().sum()

* Duplicated rows are expected here: the dataset has few columns and each column takes only a few distinct values, so the probability of two students sharing exactly the same row is very high

In [None]:
df[df.duplicated()]

## EDA

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
plt.figure(figsize=(12,5))
sns.countplot(data=df,x="PlacedOrNot")
plt.title("Placed or Not (0:No,1:yes)");

* We clearly see that there are more placed students than non-placed ones

In [None]:
# Pie chart of the target distribution: one slice per class of "PlacedOrNot".
# The class counts are looked up once and reused for both slices.
placement_counts = df["PlacedOrNot"].value_counts()
labels = ("0", "1")
sizes = [placement_counts[0], placement_counts[1]]

fig1, ax1 = plt.subplots(figsize=(15,7))
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.title("Placed or Not Pie Chart")
plt.show()

In [None]:
sns.pairplot(df,hue="PlacedOrNot");

#### Important! We can see that one of the key factors for placement is CGPA. Internships matter a little bit too

In [None]:
df = df.astype({"Hostel": object, "PlacedOrNot": object,"HistoryOfBacklogs":object})

* We change the dtype of these columns for the EDA: although they are stored as numbers, they represent categorical values

In [None]:
def Count_Values(dataframe):
    """Print the value counts of every object-dtype column of ``dataframe``.

    Each report is followed by a blank-line separator. Nothing is returned.
    """
    object_columns = list(dataframe.select_dtypes(include="object").columns)
    for name in object_columns:
        counts = dataframe[name].value_counts()
        print(f"Value Counts in {name} is: \n {counts}")
        print("\n")
Count_Values(df)

In [None]:
plt.figure(figsize=(12,5))
sns.countplot(data=df,x="Gender",hue="PlacedOrNot");

* In the dataset there are more males than females. But as we previously saw, both genders were more placed than not

In [None]:
plt.figure(figsize=(12,5))
sns.countplot(data=df,x="Stream",hue="PlacedOrNot")
plt.xticks(rotation=90);

* In the dataset we can see (as in the value counts we printed previously) that the two top streams are CS and IT, while in Mechanical and Civil Engineering the majority of students were not placed

In [None]:
plt.figure(figsize=(12,5))
sns.countplot(data=df,x="Hostel",hue="PlacedOrNot");

In [None]:
plt.figure(figsize=(12,5))
sns.countplot(data=df,x="HistoryOfBacklogs",hue="PlacedOrNot");

In [None]:
plt.figure(figsize=(14,6))
sns.histplot(data=df,x="Age",bins=10,kde=True,hue="PlacedOrNot",multiple="stack");

* Seems that age has no relation with being placed or not

In [None]:
plt.figure(figsize=(14,6))
sns.histplot(data=df,x="Internships",bins=4,kde=True,hue="PlacedOrNot",multiple="stack");

* As we said, for internship counts above 2 the placed curve tends to pass the "Not" curve.

In [None]:
plt.figure(figsize=(14,6))
sns.histplot(data=df,x="CGPA",bins=5,kde=True,hue="PlacedOrNot",multiple="stack");

* CGPA shows a really clear tendency: the higher your CGPA, the bigger the chances of getting placed

In [None]:
sns.displot(df,x="PlacedOrNot",col="Gender",color="red");

In [None]:
sns.displot(df,x="PlacedOrNot",col="Stream",color="Red");

In [None]:

sns.catplot(x="Age", y="CGPA", kind="violin",data=df);

* Here we can see also that no matter the age, it has no correlation with the CGPA

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.countplot(x="Stream",data=df,hue="Gender");

* As we said, the dataset has more males than females

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.countplot(x="Internships",data=df,hue="Gender");

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.countplot(x="Hostel",data=df,hue="Gender");

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.countplot(x="CGPA",data=df,hue="Gender");

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.countplot(x="HistoryOfBacklogs",data=df,hue="Gender");

In [None]:
sns.lineplot(data=df,x="Age",y="CGPA",hue="PlacedOrNot");

* A nice lineplot showing how the CGPA impacts the placement, and also that age makes little difference to the CGPA values. We can see that the lower CGPA line tends to go up again at around 23 years, while the top CGPA line keeps going down at 23

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.boxplot(data=df,x="Stream",y="CGPA");

* I personally thought that the CGPA was going to change depending on the career (stream). I was wrong

In [None]:
plt.figure(figsize=(16,8),dpi=200)
sns.boxplot(data=df,x="Age",y="CGPA");

* We see again, age has very little variance.

In [None]:
sns.boxplot(data=df,x="PlacedOrNot",y="CGPA");

In [None]:
sns.boxplot(data=df,x="PlacedOrNot",y="Age");

In [None]:
sns.boxplot(data=df,x="PlacedOrNot",y="Internships");

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig=plt.figure(figsize=(16,8))
ax=fig.add_subplot(111,projection="3d")

ax.scatter(df["Age"],df["CGPA"],df["PlacedOrNot"],c=df["PlacedOrNot"])


ax.set_xlabel('Age')
ax.set_ylabel('CGPA')
ax.set_zlabel('Placed or Not')

plt.show();

* 3D plot between Age, CGPA and the target variable. Showing the CGPA impact

### Now we are going to encode the labels to build the correlation matrix

In [None]:
def Label_Encoder(dataframe):
    """Return a copy of ``dataframe`` with its object-dtype columns label-encoded.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is left untouched; earlier versions dropped columns
        from it in place.

    Returns
    -------
    pandas.DataFrame
        New frame holding every non-object column unchanged (in original
        order), followed by the object columns replaced by the integer codes
        produced by ``LabelEncoder.fit_transform``.
    """
    from sklearn.preprocessing import LabelEncoder

    cat_cols = [col for col in dataframe.columns if dataframe[col].dtype == 'O']
    # Everything that is not object dtype passes through as-is. Previously only
    # int64/float64 columns were treated this way, so other dtypes (bool, int32,
    # ...) were both encoded and kept raw, ending up duplicated in the output.
    passthrough_cols = [col for col in dataframe.columns if col not in cat_cols]

    # Column selection returns new frames, so the caller's frame is never mutated.
    encoded = dataframe[cat_cols].apply(LabelEncoder().fit_transform)
    final_dataframe = pd.concat([dataframe[passthrough_cols], encoded], axis=1)

    return final_dataframe
df=Label_Encoder(df)
df.head()

In [None]:
def Corr(dataframe,target_variable):
    """Draw an annotated correlation heatmap and rank columns against a target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose pairwise column correlations are computed with
        ``DataFrame.corr()``.
    target_variable : str
        Column whose correlations with all columns are returned.

    Returns
    -------
    pandas.Series
        Absolute correlation of every column with ``target_variable``,
        sorted in descending order (unrounded values).

    Raises
    ------
    KeyError
        If ``target_variable`` is not a column of the correlation matrix.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Compute the matrix once and reuse it for both the plot and the ranking.
    correlation_matrix = dataframe.corr()
    if target_variable not in correlation_matrix.columns:
        # Fail before any figure is created instead of after drawing the heatmap.
        raise KeyError(target_variable)

    _, ax = plt.subplots(figsize=(15,15))
    # Only the plotted annotations are rounded; the returned values keep full precision.
    sns.heatmap(data=correlation_matrix.round(2), annot=True, cmap="viridis", ax=ax)

    correlation = correlation_matrix[target_variable].abs().sort_values(ascending = False)

    return correlation

Corr(df,"PlacedOrNot")

* Now we confirm our suspicions. CGPA has the highest absolute Pearson correlation with the target feature, followed by Internships

# Data Preprocessing

In [None]:
X=df.drop("PlacedOrNot",axis=1)
y=df["PlacedOrNot"]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing (fixed seed), then standardise the
# features using statistics fitted on the training rows only.
scaler=StandardScaler()
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.25,
)

X_train=scaler.fit(X_train).transform(X_train)
X_test=scaler.transform(X_test)

# Machine Learning Algorithms

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the liblinear logistic regression.
# liblinear only supports the "l1" and "l2" penalties, so "elasticnet" is not
# searched (those candidates fail to fit with this solver), and l1_ratio is
# left out because it is only used by the elasticnet penalty: keeping it just
# repeated every remaining candidate 20 times.
lr=LogisticRegression(solver="liblinear",multi_class="ovr")
penalty=["l1","l2"]
# NOTE(review): C spans 1 .. 1e10, i.e. only weaker-than-default regularisation
# is explored — confirm this range is intended.
C= np.logspace(0,10,20)
paramgrid={"penalty":penalty,
           "C":C}

In [None]:
grid_model=GridSearchCV(lr,paramgrid,verbose=1)
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_params_

In [None]:
lr=LogisticRegression(solver="liblinear",multi_class="ovr",C=1.0,penalty="l1")
lr.fit(X_train,y_train)

In [None]:
preds=lr.predict(X_train)
preds2=lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# plot_confusion_matrix / plot_roc_curve were removed from scikit-learn in 1.2.
# Fall back to thin wrappers around the Display classes so every later cell
# that calls these helpers keeps working on both old and new releases.
try:
    from sklearn.metrics import plot_confusion_matrix,plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay,RocCurveDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted ``estimator`` on ``X`` / ``y_true``.

        Extra keyword arguments (e.g. ``ax``, ``cmap``) are forwarded to
        ``ConfusionMatrixDisplay.from_estimator``, whose display is returned.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Plot the ROC curve of a fitted ``estimator`` on ``X`` / ``y``.

        Extra keyword arguments (e.g. ``ax``, ``name``) are forwarded to
        ``RocCurveDisplay.from_estimator``, whose display is returned.
        """
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

In [None]:
lr_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(lr,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=100)
plot_roc_curve(lr,X_test,y_test,ax=ax);

# K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Sweep n_neighbors over 1..35, refitting KNN each time and recording the
# accuracy obtained on the test split.
# NOTE(review): the scores come from X_test, so choosing n_neighbors from this
# curve tunes on the test data — consider cross-validating on the training
# split instead.
neighbor_counts = range(1,36)
acc = []
for n in neighbor_counts:
    KNC = KNeighborsClassifier(n_neighbors=n).fit(X_train,y_train)
    y_pred = KNC.predict(X_test)
    acc.append(accuracy_score(y_test,y_pred))

plt.figure(figsize=(14,7))
sns.lineplot(x=np.arange(1,36),y=acc,color="green")
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy_Score (test)");

* We see that at 2-3 neighbors the algorithm gets the best acc

In [None]:
KNC=KNeighborsClassifier(n_neighbors=2)
KNC.fit(X_train,y_train)
preds=KNC.predict(X_train)
preds2=KNC.predict(X_test)


In [None]:
knc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(KNC,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(KNC,X_test,y_test,ax=ax);

In [None]:
import plotly.express as px

# Predicted probability of the second column of predict_proba for each test row.
y_score = KNC.predict_proba(X_test)[:, 1]

# Scatter of the first two (scaled) feature columns, coloured by the predicted
# score and with the marker symbol chosen by the true label.
# symbol_map is keyed by the strings '0' / '1', so the labels are cast to str;
# passing the integer labels directly never matched the map.
fig = px.scatter(
    X_test, x=0, y=1,
    color=y_score, color_continuous_scale='RdBu',
    symbol=y_test.astype(str), symbol_map={'0': 'square-dot', '1': 'circle-dot'},
    labels={'symbol': 'label', 'color': 'score of <br>first class'}
)
fig.update_traces(marker_size=12, marker_line_width=1.5)
fig.update_layout(legend_orientation='h')
fig.show()

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dt=DecisionTreeClassifier()
paramgrid={
    "criterion":["gini","entropy"],
    "max_depth":[2,4,6,8,10],
    "max_leaf_nodes":[3,6,9,12]
}

In [None]:
grid_model=GridSearchCV(dt,paramgrid,verbose=1)
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_params_

In [None]:
# Final decision tree with the hyper-parameters hard-coded in this notebook.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# its scores and its feature importances are reproducible across runs.
dt=DecisionTreeClassifier(criterion="entropy",max_depth=4,max_leaf_nodes=9,random_state=101)
dt.fit(X_train,y_train)

In [None]:
preds=dt.predict(X_train)
preds2=dt.predict(X_test)

In [None]:
dt_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(dt,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(dt,X_test,y_test,ax=ax);

In [None]:
plt.figure(figsize=(14,6))
sns.barplot(x=X.columns,y=dt.feature_importances_)
plt.title("Feature Importance")
plt.axhline(y=0.825,ls="--",color="red")
plt.ylabel("Feature Importance Level");

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc=RandomForestClassifier()
param_grid={
    "n_estimators":[64,100,128,200],
    "max_features":[2,3,4],
    "bootstrap":[True,False],
}

In [None]:
grid=GridSearchCV(rfc,param_grid,verbose=1)
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Final random forest with the hyper-parameters hard-coded in this notebook.
# random_state is fixed (same seed as the train/test split) so the reported
# train/test scores are reproducible across runs.
rfc=RandomForestClassifier(max_features=4,n_estimators=128,bootstrap=False,random_state=101)
rfc.fit(X_train,y_train)
preds=rfc.predict(X_train)
preds2=rfc.predict(X_test)

In [None]:
rfc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(rfc,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(rfc,X_test,y_test,ax=ax);

# Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
abc=AdaBoostClassifier()
param_grid={
    "n_estimators":[64,100,128,200],
    "learning_rate":[0.1,0.05,0.02],
}

In [None]:
grid=GridSearchCV(abc,param_grid,verbose=1)
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Final AdaBoost model with the hyper-parameters hard-coded in this notebook.
# random_state is fixed (same seed as the train/test split) for reproducible scores.
abc=AdaBoostClassifier(learning_rate=0.1,n_estimators=200,random_state=101)

In [None]:
abc.fit(X_train,y_train)
preds=abc.predict(X_train)
preds2=abc.predict(X_test)

In [None]:
abc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(abc,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(abc,X_test,y_test,ax=ax);

# Gradient Boost Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
grc=GradientBoostingClassifier()
param_grid={
    "n_estimators":[64,100,128,200],
    "learning_rate":[0.1,0.05,0.02],
    "max_depth":[2,4,6,8,10]
}

In [None]:
grid=GridSearchCV(grc,param_grid,verbose=1)
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Final gradient boosting model with the hyper-parameters hard-coded in this
# notebook. random_state is fixed (same seed as the train/test split) so the
# score used in the model comparison is reproducible across runs.
grc=GradientBoostingClassifier(learning_rate=0.1,max_depth=6,n_estimators=64,random_state=101)
grc.fit(X_train,y_train)

In [None]:
preds=grc.predict(X_train)
preds2=grc.predict(X_test)

In [None]:
grc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(grc,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(grc,X_test,y_test,ax=ax);

# Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
etc=ExtraTreesClassifier()

param_grid={
    "n_estimators":[64,100,128,200],
    "max_depth":[2,4,6,8,10],
    "criterion":["gini","entropy"],
    
    
}

In [None]:
grid=GridSearchCV(etc,param_grid,verbose=1)
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
# Final extra-trees model with the hyper-parameters hard-coded in this notebook.
# random_state is fixed (same seed as the train/test split) so the reported
# train/test scores are reproducible across runs.
etc=ExtraTreesClassifier(criterion="entropy",max_depth=10,n_estimators=100,random_state=101)
etc.fit(X_train,y_train)
preds=etc.predict(X_train)
preds2=etc.predict(X_test)

In [None]:
etc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(etc,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(etc,X_test,y_test,ax=ax);

# Bernoulli NB

In [None]:
from sklearn.naive_bayes import BernoulliNB
bnb=BernoulliNB()
bnb.fit(X_train,y_train)

In [None]:
preds=bnb.predict(X_train)
preds2=bnb.predict(X_test)

In [None]:
bnb_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(bnb,X_test,y_test);

#### Roc Curve

In [None]:
fig,ax=plt.subplots(figsize=(12,6),dpi=200)
plot_roc_curve(bnb,X_test,y_test,ax=ax);

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs for every model built above; the voting ensemble is
# created from this list and fitted on the training split.
classifiers = [
    ('Gradient Boosting Classifier', grc),
    ('Decision Tree', dt),
    ('Extra Tree', etc),
    ('Random Forest', rfc),
    ('Ada Boost', abc),
    ('Logistic', lr),
    ('Knn', KNC),
    ("BernoulliNB", bnb),
]
vc = VotingClassifier(estimators=classifiers)
vc.fit(X_train, y_train)

In [None]:
preds=vc.predict(X_train)
preds2=vc.predict(X_test)

In [None]:
vc_score=accuracy_score(y_test,preds2)
print(f"Train Accuracy: {accuracy_score(y_train,preds)}")
print(f"Test Accuracy: {accuracy_score(y_test,preds2)}")

In [None]:
print(classification_report(y_test,preds2))

In [None]:
plot_confusion_matrix(vc,X_test,y_test);

# Model Summary

In [None]:
# One (model name, test accuracy) row per classifier trained above.
model_scores = [
    ('Logistic Regression', lr_score),
    ('KNN', knc_score),
    ('Decision Tree Classifier', dt_score),
    ('Random Forest Classifier', rfc_score),
    ('Ada Boost Classifier', abc_score),
    ('Gradient Boosting Classifier', grc_score),
    ("Extra Trees Classifier", etc_score),
    ('Voting Classifier', vc_score),
    ('BernoulliNB', bnb_score),
]
models = pd.DataFrame(model_scores, columns=['Model', 'Score'])

# Display the comparison table, best score first.
models.sort_values(by='Score', ascending=False)

In [None]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = "plotly_dark", title = 'Models Comparison')

## We see that Gradient Boost Classifier is the best model with 88.2% accuracy.
## Thx, and if you liked it upvote!