# Beginner approach to campus placement prediction

In [None]:
#Import all relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the placement dataset, using the serial-number column as the index
url = '../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv'
df = pd.read_csv(url, index_col = 'sl_no')
# Map the status feature to 0s and 1s (1 = placed, 0 = not placed).
# The mapping is called status_map rather than `dict` so the builtin dict
# type is not shadowed for the rest of the notebook.
status_map = {'Placed': 1, 'Not Placed': 0}
df['status'] = df['status'].map(status_map)
# Show a sample of the data
df.head(10)

In [None]:
# Flag, per column, whether df contains at least one missing value.
# Only salary has missing values, and that column is dropped later,
# so there is no need to replace NaNs.
df.isna().any()

# Basic EDA
The plots below suggest that gender, ssc_p, hsc_p, degree_p, workex, specialisation and mba_p are the most relevant features.

In [None]:
# Number of students per placement status, split by gender
sns.countplot(data=df, x='status', hue='gender')

In [None]:
# Number of students per placement status, split by degree_t
sns.countplot(data=df, x='status', hue='degree_t')

In [None]:
# Number of students per placement status, split by specialisation
sns.countplot(data=df, x='status', hue='specialisation')

In [None]:
# Number of students per placement status, split by workex
sns.countplot(data=df, x='status', hue='workex')

In [None]:
# Number of students per placement status, split by hsc_b
sns.countplot(data=df, x='status', hue='hsc_b')

In [None]:
# Number of students per placement status, split by ssc_b
sns.countplot(data=df, x='status', hue='ssc_b')

In [None]:
# Number of students per placement status, split by hsc_s
sns.countplot(data=df, x='status', hue='hsc_s')

In [None]:
# Placement status (0/1) plotted against mba_p
sns.scatterplot(x=df['mba_p'], y=df['status'])

In [None]:
# Placement status (0/1) plotted against etest_p
sns.scatterplot(x=df['etest_p'], y=df['status'])

In [None]:
# Placement status (0/1) plotted against degree_p
sns.scatterplot(x=df['degree_p'], y=df['status'])

In [None]:
# Placement status (0/1) plotted against hsc_p
sns.scatterplot(x=df['hsc_p'], y=df['status'])

In [None]:
# Placement status (0/1) plotted against ssc_p
sns.scatterplot(x=df['ssc_p'], y=df['status'])

In [None]:
# Count how many rows fall into each status class.
# The classes are not balanced. Balancing the data is an option, but it did
# not bring any significant improvement in the predictions when I tried it.
df['status'].value_counts()

# Preprocessing

In [None]:
# Separate the features (X) from the target (y).
# salary is dropped along with status: we are predicting the status, so it
# doesn't make sense to consider the salary as a predictor.
X = df.drop(columns=['status', 'salary'])
y = df['status']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Categorical columns: those stored with the object dtype
cat_cols = [col for col, col_dtype in X_train.dtypes.items() if col_dtype == "object"]

# Numerical columns: those stored as int64 or float64
num_cols = [col for col, col_dtype in X_train.dtypes.items() if col_dtype in ['int64', 'float64']]

In [None]:
# Apply one-hot encoder to each column with categorical data.
# The encoder is fitted on the training split only; handle_unknown='ignore'
# lets transform() accept categories that were not present during fitting.
try:
    # Newer scikit-learn releases name the dense-output switch `sparse_output`
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[cat_cols]), index = X_train.index)
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[cat_cols]), index = X_test.index)

# Label the encoded columns "<source column>_<category>". Without this the
# encoded columns get integer labels, and a frame mixing integer and string
# column names is rejected by recent scikit-learn estimators.
OH_names = [f"{col}_{cat}" for col, cats in zip(cat_cols, OH_encoder.categories_) for cat in cats]
OH_cols_train.columns = OH_names
OH_cols_test.columns = OH_names

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([X_train[num_cols], OH_cols_train], axis=1)
OH_X_test = pd.concat([X_test[num_cols], OH_cols_test], axis=1)

In [None]:
#scaling
# Standardise the numerical columns. The scaler learns the mean and standard
# deviation from the training split only and applies that same transformation
# to the test split, so both splits share one scale and no test-set
# statistics leak into preprocessing.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X_train = pd.DataFrame(scaler.fit_transform(OH_X_train[num_cols]),
                              index=OH_X_train.index, columns=num_cols)
scaled_X_test = pd.DataFrame(scaler.transform(OH_X_test[num_cols]),
                             index=OH_X_test.index, columns=num_cols)

# Overwrite the raw numerical columns with their standardised versions
OH_X_train[num_cols] = scaled_X_train
OH_X_test[num_cols] = scaled_X_test

In [None]:
# Preview the first 10 rows of the preprocessed training features
OH_X_train.iloc[:10]

# 3 different models for predicting placement status

In [None]:
# K-nearest-neighbours classifier using 10 neighbours:
# fit on the training split, then report accuracy on the held-out test split.
knn_model = KNeighborsClassifier(n_neighbors=10).fit(OH_X_train, y_train)
preds = knn_model.predict(OH_X_test)
print(accuracy_score(y_true=y_test, y_pred=preds))

In [None]:
# Random forest classifier with 100 trees and a fixed seed:
# fit on the training split, then report accuracy on the held-out test split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(OH_X_train, y_train)
preds = rf_model.predict(OH_X_test)
print(accuracy_score(y_true=y_test, y_pred=preds))

In [None]:
# Logistic regression capped at 100 solver iterations:
# fit on the training split, then report accuracy on the held-out test split.
log_model = LogisticRegression(max_iter=100).fit(OH_X_train, y_train)
preds = log_model.predict(OH_X_test)
print(accuracy_score(y_true=y_test, y_pred=preds))

The model that showed the best performance was logistic regression.

# Thanks for checking out my notebook! I'm still a beginner, so any comment or suggestion is welcome.