# Identifying Patterns And Trends In Campus Placement Data Using Machine Learning

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import pickle
import joblib

## Creating Dataset of Campus Placement

In [None]:
# Fix the global NumPy seed so the synthetic dataset is reproducible.
np.random.seed(42)

num_samples = 1650


def _choice(options, p=None):
    """Draw one option per student from the global NumPy random state."""
    return np.random.choice(options, size=num_samples, p=p)


def _randint(low, high):
    """Draw one integer in [low, high) per student from the global random state."""
    return np.random.randint(low, high, size=num_samples)


# NOTE: columns are drawn in dict order; reordering the entries would consume
# the random stream differently and change the generated data.
data = {
    'Student_ID': np.arange(20001, 20001 + num_samples),
    'Gender': _choice(['Male', 'Female']),
    'Age': _randint(21, 31),
    'Education': _choice(['Bachelor', 'Master']),
    'CGPA': np.round(np.random.uniform(5, 10, size=num_samples), 2),
    'Internships': _randint(0, 5),
    'Year': _randint(2018, 2024),
    'Hostel': _choice(['Opted', 'Not Opted']),
    'Projects': _randint(0, 6),
    'Letter_of_Recommendation': _choice(['Yes', 'No']),
    'Specialization': _choice(
        ['Computer Science', 'Information Technology', 'Electrical', 'Electronics', 'Mechanical', 'Civil'],
        p=[0.30, 0.20, 0.18, 0.17, 0.10, 0.05],
    ),
    'On/Off Campus': _choice(['On Campus', 'Off Campus']),
    'Package(LPA)': _randint(200000, 1000000),
}

# Blank out CGPA for a random 5% of students to simulate missing values
missing_indices = np.random.choice(np.arange(num_samples), size=int(num_samples * 0.05), replace=False)
data['CGPA'][missing_indices] = np.nan

# Assemble the DataFrame
df = pd.DataFrame(data)

# Keep ages plausible: bachelor's students are capped at 27,
# master's students are at least 24
is_bachelor = df['Education'] == "Bachelor"
is_master = df['Education'] == "Master"
df.loc[is_bachelor & (df['Age'] > 27), 'Age'] = 27
df.loc[is_master & (df['Age'] < 24), 'Age'] = 24

# Random backlog history (0-4) per student
num_rows = len(df)
df['HistoryOfBacklogs'] = np.random.randint(0, 5, size=num_rows)

# Students are 'Placed' unless they have more than 2 backlogs
has_many_backlogs = df['HistoryOfBacklogs'] > 2
df['Placement_Status'] = np.where(has_many_backlogs, 'Not Placed', 'Placed')

# Students who were not placed have no package
df.loc[has_many_backlogs, 'Package(LPA)'] = 0

# Persist the dataset
df.to_csv('campus_placement_dataset.csv', index=False)

print("Dataset created and saved as 'campus_placement_dataset.csv'")

## Data Collection

In [None]:
# Load the dataset from the CSV file written by the generation step,
# then display it as the cell output
df = pd.read_csv('campus_placement_dataset.csv')
df

In [None]:
# Shape of the data as (rows, columns)
df.shape

In [None]:
# Show the first 5 rows of the dataset (head() defaults to 5)
df.head()

In [None]:
# Show the last 10 rows of the dataset
df.tail(10)

In [None]:
# Show 4 randomly sampled rows (no random_state is set, so the rows differ per run)
df.sample(4)

In [None]:
# Data type of each column in the dataset
df.dtypes

In [None]:
# Per-column summary: non-null counts, dtypes and memory usage
df.info()

In [None]:
# Descriptive statistics for the numeric columns
df.describe()

## Data Preprocessing

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Impute missing CGPA with the median CGPA of students who share the same
# education level and specialization
group_median_cgpa = df.groupby(['Education', 'Specialization'])['CGPA'].transform('median')
df['CGPA'] = df['CGPA'].fillna(group_median_cgpa)

In [None]:
# Re-count missing values per column after the CGPA imputation
df.isna().sum()

In [None]:
# Number of fully duplicated rows
df.duplicated().sum()

In [None]:
# Drop duplicated rows in place (the first occurrence of each row is kept by default)
df.drop_duplicates(inplace=True)

In [None]:
# Confirm that no duplicated rows remain
df.duplicated().sum()

## EDA

In [None]:
# Pairwise correlation between the numeric columns.
# Placement_Status is still a string column at this point, so it is not included.
df.corr(numeric_only=True)

In [None]:
# Heatmap of the correlations between the numeric columns
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Scatter plot of CGPA against internships, coloured by placement outcome
status_colors = {"Placed": "green", "Not Placed": "red"}
figure = px.scatter(
    df,
    x="CGPA",
    y="Internships",
    color="Placement_Status",
    color_discrete_map=status_colors,
    hover_data=['CGPA'],
)
figure.show()

In [None]:
# Box plot of backlog history for each placement outcome
status_colors = {"Placed": "green", "Not Placed": "red"}
fig = px.box(
    df,
    x="Placement_Status",
    y="HistoryOfBacklogs",
    color="Placement_Status",
    color_discrete_map=status_colors,
    title="Box Plot of Placement Status by History of Backlogs",
)
fig.show()

In [None]:
# Box plot of CGPA per education level. Only a box trace is drawn (no violin),
# so the title names just the box plot.
fig = px.box(df, x="Education", y="CGPA", title="Box Plot of CGPA by Education")
# Show every individual point, jittered, to the left of each box
fig.update_traces(boxpoints="all", jitter=0.3, pointpos=-1.8)
fig.show()

In [None]:
# Histogram of the number of placed and not-placed students
status_colors = {"Placed": "green", "Not Placed": "red"}
px.histogram(
    df,
    x='Placement_Status',
    color='Placement_Status',
    color_discrete_map=status_colors,
    barmode='group',
)

In [None]:
# Pie chart: share of placed vs. not-placed students
status_counts = df['Placement_Status'].value_counts()
figure = px.pie(
    df,
    values=status_counts.values,
    names=status_counts.index,
    title='Placed Vs Not Placed',
)
figure.show()

In [None]:
# Pie chart of the gender split, labelled with percentages
gender_counts = df['Gender'].value_counts()
pie_options = dict(labels=gender_counts.index, autopct='%1.1f%%', colors=['skyblue', 'pink'])

plt.figure(figsize=(8, 6))
plt.pie(gender_counts, **pie_options)
plt.title("Gender Distribution")
plt.show()

In [None]:
# Histogram with a KDE overlay for each numeric variable, one figure per column
numeric_columns = [
    'Age',
    'CGPA',
    'Internships',
    'Year',
    'Package(LPA)',
    'Projects',
    'HistoryOfBacklogs',
]

for column in numeric_columns:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x=column, kde=True)
    plt.title(f"{column} Distribution")
    plt.show()

In [None]:
# Box plot of each numeric variable split by Placement_Status.
# Each entry pairs a column with the seaborn palette used for its figure.
boxplot_specs = [
    ("Age", 'Set1'),
    ("CGPA", 'Set2'),
    ("Internships", 'Set3'),
    ("Year", 'Set1'),
    ("Projects", 'Set2'),
]

for column, palette_name in boxplot_specs:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x="Placement_Status", y=column, data=df, palette=palette_name)
    plt.title(f"{column} Distribution by Placement Status")
    plt.show()

In [None]:
# Count plots of each feature, split by placement outcome
colors={"Placed": "blue", "Not Placed": "red"}

# (column, figure title) pairs, in plotting order
countplot_specs = [
    ("Age", "Age Distribution by Placement Status"),
    ("Gender", "Gender Distribution by Placement Status"),
    ("Education", "Education Distribution by Placement Status"),
    ("Internships", "Internships Distribution by Placement Status"),
    ("Hostel", "Hostel distribution by Placement Status"),
    ("Letter_of_Recommendation", "Letter_of_Recommendation distribution by Placement Status"),
    ("Specialization", "Specialization distribution by Placement Status"),
    ("HistoryOfBacklogs", "HistoryOfBacklogs distribution by Placement Status"),
    ("On/Off Campus", "On/Off Campus distribution by Placement Status"),
    ("Projects", "Projects Distribution by Placement Status"),
]

for column, title in countplot_specs:
    if column == "Specialization":
        # Specialization has long category labels, so it gets a wider figure
        plt.figure(figsize=(12, 8))
    sns.countplot(x=column, hue="Placement_Status", data=df, palette=colors)
    plt.title(title)
    plt.show()

In [None]:
# Stacked bar chart for Education vs. Placement_Status
education_placement = df.groupby(['Education', 'Placement_Status']).size().unstack()

# DataFrame.plot() opens its own figure unless it is given an Axes, so create
# the figure/axes explicitly and pass `ax=`. Otherwise an empty 8x6 figure is
# left behind and the requested size is not applied to the chart.
education_fig, education_ax = plt.subplots(figsize=(8, 6))
education_placement.plot(
    kind='bar',
    stacked=True,
    color={"Placed": 'skyblue', "Not Placed": 'lightcoral'},
    ax=education_ax,
)
education_ax.set_title("Education vs. Placement Status")
plt.show()

In [None]:
# Placement counts per year, one column per status. fill_value=0 records a
# year with no students in a given status as 0 instead of NaN, so idxmin()
# cannot silently skip a year that had zero placements.
placement_counts = df.groupby('Year')['Placement_Status'].value_counts().unstack(fill_value=0)
placed_per_year = placement_counts['Placed']
max_placement_year = placed_per_year.idxmax()
min_placement_year = placed_per_year.idxmin()

# Line of placements per year with the best and worst years highlighted
plt.figure(figsize=(10, 6))
placed_per_year.plot(marker='o', label='Placed')
plt.scatter(max_placement_year, placed_per_year.max(), color='green', label='Max Placement')
plt.scatter(min_placement_year, placed_per_year.min(), color='red', label='Min Placement')
plt.title("Placement Counts Over the Years")
plt.xlabel("Year")
plt.ylabel("Number of Placements")
plt.legend()
plt.show()

print(f"Year with the maximum placements: {max_placement_year}")
print(f"Year with the minimum placements: {min_placement_year}")

In [None]:
# Number of placed students in each year
placed_mask = df['Placement_Status'].eq('Placed')
total_placements_per_year = df.loc[placed_mask].groupby('Year').size()

print("\nTotal placements per yearwise:", total_placements_per_year, sep="\n")

In [None]:
# Line chart for placement trends over years (one line per placement status)
placement_trends = df.groupby('Year')['Placement_Status'].value_counts().unstack().fillna(0)

# DataFrame.plot() opens its own figure unless it is given an Axes, so create
# the figure/axes explicitly and pass `ax=`. Otherwise an empty 10x6 figure is
# left behind and the requested size is not applied to the chart.
trend_fig, trend_ax = plt.subplots(figsize=(10, 6))
placement_trends.plot(marker='o', ax=trend_ax)
trend_ax.set_title("Placement Trends Over Years")
trend_ax.set_xlabel("Year")
trend_ax.set_ylabel("Number of Students")
trend_ax.legend(title="Placement Status")
plt.show()

In [None]:
# Keep only the students who were placed
placed_students = df[df['Placement_Status'].eq('Placed')]

# Rows of the youngest and eldest placed students (first match on ties)
placed_ages = placed_students['Age']
youngest_student = placed_students.loc[placed_ages.idxmin()]
eldest_student = placed_students.loc[placed_ages.idxmax()]

# Report both rows
print("Youngest Student:", youngest_student, sep="\n")
print("\nEldest Student:", eldest_student, sep="\n")

In [None]:
# Largest and smallest internship counts among placed students
placed_internships = placed_students['Internships']
max_internships = placed_internships.max()
min_internships = placed_internships.min()

# Box plot of internship counts for placed students
plt.figure(figsize=(8, 6))
plt.title('Box Plot of Maximum and Minimum Internships for Placed Students')
plt.boxplot([placed_internships], labels=['Placed Students'])
plt.ylabel('Number of Internships')
plt.ylim(0, 5)  # Set y-axis limit to better visualize the data

# Point out the extreme values next to the box
plt.annotate(f"Max: {max_internships}", xy=(1.1, max_internships), xytext=(1.3, max_internships + 0.2),
             arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.annotate(f"Min: {min_internships}", xy=(1.1, min_internships), xytext=(1.3, min_internships - 0.2),
             arrowprops=dict(facecolor='black', arrowstyle='->'))
plt.show()

# Count the placed students at each extreme. Placement_Status holds
# 'Placed' / 'Not Placed', so the previous comparison against 'Yes' matched
# no rows and always reported 0; count within the placed subset instead.
count_max_internships = int((placed_internships == max_internships).sum())
count_min_internships = int((placed_internships == min_internships).sum())

# Print the results
print("Maximum number of internships done by placed students: ", max_internships)
print("Number of students who did the maximum internships: ", count_max_internships)
print("Minimum number of internships done by placed students: ", min_internships)
print("Number of students who did the minimum internships: ", count_min_internships)

In [None]:
# Number of students at each distinct CGPA value, ordered by CGPA
cgpa_counts = df['CGPA'].value_counts().sort_index()

# Extreme CGPA values in the dataset
max_cgpa = df['CGPA'].max()
min_cgpa = df['CGPA'].min()

plt.figure(figsize=(10, 6))

# All CGPA counts in translucent blue
plt.bar(cgpa_counts.index, cgpa_counts.values, color='blue', alpha=0.5, label='CGPA Counts')

# Overlay the maximum CGPA in red and the minimum CGPA in green
plt.bar(max_cgpa, cgpa_counts.loc[max_cgpa], color='red', label='Max CGPA')
plt.bar(min_cgpa, cgpa_counts.loc[min_cgpa], color='green', label='Min CGPA')
plt.title('Bar Plot: Max and Min CGPA Counts')
plt.xlabel('CGPA')
plt.ylabel('Counts')
plt.legend()
plt.grid(True)
plt.show()

# How many students sit at each extreme
count_max_cgpa = len(df[df['CGPA'] == max_cgpa])
count_min_cgpa = len(df[df['CGPA'] == min_cgpa])

# Report the results
for label, value in (
    ("Maximum CGPA: ", max_cgpa),
    ("Number of students with maximum CGPA: ", count_max_cgpa),
    ("Minimum CGPA: ", min_cgpa),
    ("Number of students with minimum CGPA: ", count_min_cgpa),
):
    print(label, value)

In [None]:
# Order in which the specializations appear on the x-axis
specialization_order = ['Computer Science', 'Information Technology', 'Electrical', 'Electronics', 'Mechanical', 'Civil']

# Map each placement status to a fixed colour. A plain list of colours is
# assigned by the order the hue levels are encountered in the data, which
# could paint 'Placed' red; a dict (as used by the other plots) is explicit.
status_palette = {'Placed': 'skyblue', 'Not Placed': 'red'}

# Count plot of placement outcome per specialization
plt.figure(figsize=(10, 6))
sns.countplot(x='Specialization', hue='Placement_Status', data=df, order=specialization_order, palette=status_palette)
plt.title('Placement Trends by Specialization')
plt.xlabel('Specialization')
plt.ylabel('Number of Students')
plt.legend(title='Placement Status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Restrict the analysis to placed students
placed_df = df[df['Placement_Status'] == 'Placed']
placed_packages = placed_df['Package(LPA)']

# Mean package per year
average_placement_by_year = placed_df.groupby('Year')['Package(LPA)'].mean()

# Rows of the students with the highest and lowest package
highest_placement = placed_df.loc[placed_packages.idxmax()]
lowest_placement = placed_df.loc[placed_packages.idxmin()]

# Bar plot of the average package per year
plt.figure(figsize=(10, 6))
sns.barplot(x=average_placement_by_year.index, y=average_placement_by_year.values, palette='viridis')
plt.xlabel('Year')
plt.ylabel('Average Placement Package (in LPA)')
plt.title('Average Placement Package by Year')
plt.tight_layout()
plt.show()

# Bar plot comparing the highest and lowest package
extreme_packages = [highest_placement['Package(LPA)'], lowest_placement['Package(LPA)']]
plt.figure(figsize=(10, 6))
sns.barplot(x=['Highest', 'Lowest'], y=extreme_packages, palette='magma')
plt.ylabel('Placement Package (in LPA)')
plt.title('Highest and Lowest Placement Packages')
plt.tight_layout()
plt.show()

# Text summary of the same figures
for heading, result in (
    ("Average placement by year:", average_placement_by_year),
    ("\nHighest placement:", highest_placement),
    ("\nLowest placement:", lowest_placement),
):
    print(heading)
    print(result)

In [None]:
# Switch seaborn to the white-grid theme (this changes global plot styling)
sns.set(style="whitegrid")

# Swarm plot of CGPA for each placement outcome
status_palette = {'Placed': 'green', 'Not Placed': 'red'}
plt.figure(figsize=(10, 6))
swarm_ax = sns.swarmplot(
    x='Placement_Status',
    y='CGPA',
    data=df,
    hue='Placement_Status',
    palette=status_palette,
)
swarm_ax.set_title("CGPA Distribution by Placement Status")
swarm_ax.set_xlabel("Placement Status")
swarm_ax.set_ylabel("CGPA")
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Select relevant attributes for clustering
attributes_for_clustering = ['Age', 'CGPA', 'Internships', 'Package(LPA)']

# Remove rows with missing values. Take an explicit copy: a 'Cluster' column
# is added below, and assigning to the result of dropna() directly triggers
# pandas' SettingWithCopyWarning (it is ambiguous whether df is written too).
df_cleaned = df.dropna(subset=attributes_for_clustering).copy()

# Standardize each attribute (zero mean, unit sample standard deviation) so
# that Package(LPA), which has by far the largest scale, does not dominate
# the distance computation
cluster_features = df_cleaned[attributes_for_clustering]
normalized_data = (cluster_features - cluster_features.mean()) / cluster_features.std()

# Perform K-Means clustering
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df_cleaned['Cluster'] = kmeans.fit_predict(normalized_data)

# Visualize the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_cleaned, x='CGPA', y='Package(LPA)', hue='Cluster', palette='tab10')
plt.title('K-Means Clustering of Campus Placement Data')
plt.xlabel('CGPA')
plt.ylabel('Package(LPA)')
plt.show()

In [None]:
# Pairwise relationship plots of the dataset's variables, coloured by Placement_Status
sns.pairplot(data=df, hue="Placement_Status")

## Feature Engineering:

In [None]:
# Display the DataFrame as it stands before the categorical columns are encoded
df

In [None]:
# Convert categorical variables to integer codes using LabelEncoder.
# One encoder is fitted per column and kept in `label_encoders`, so the
# category -> code mapping of every column remains available later (for
# encoding new inputs or decoding predictions). Refitting a single shared
# encoder would discard every mapping except the last one.
categorical_columns = [
    'Gender',
    'Education',
    'Hostel',
    'Letter_of_Recommendation',
    'Specialization',
    'On/Off Campus',
    'Placement_Status',
]

label_encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    label_encoders[column] = column_encoder

# `le` keeps its previous meaning: the encoder fitted on Placement_Status
le = label_encoders['Placement_Status']

In [None]:
# Inspect the first 20 rows after label encoding
df.head(20)

In [None]:
# Correlation heatmap over all columns, now that the categorical columns
# have been label-encoded to integers
encoded_corr = df.corr()
plt.figure(figsize=(15, 8))
encoded_heatmap_ax = sns.heatmap(encoded_corr, annot=True, cmap='coolwarm')
encoded_heatmap_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Split the frame into model inputs (X) and the target (y).
# The columns listed here are left out of the inputs; the last one is the target.
excluded_columns = ['Year', 'Hostel', 'On/Off Campus', 'Package(LPA)', 'Placement_Status']
X = df.drop(columns=excluded_columns)
y = df['Placement_Status']
# NOTE(review): Student_ID is not excluded, so the row identifier is passed to
# the model as a feature - confirm whether that is intended.
X

In [None]:
# Display the target column (label-encoded Placement_Status)
y

In [None]:
# Print the shape of the feature matrix, then of the target vector
for part in (X, y):
    print(part.shape)

## Data Splitting

In [None]:
# Split into training and testing sets: 33% of the rows are held out for
# testing, and random_state makes the split reproducible
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Print the shape of each split: train/test features, then train/test targets
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

## Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression, fitted on the unscaled training features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
logreg= LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Predict on the held-out split with the baseline model
y_pred = logreg.predict(X_test)

# Scalar metrics, computed in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report every metric, one per line
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(label, value)

## Prediction

In [None]:
# Accuracy of the baseline model on both splits, as percentages
baseline_test_pct = logreg.score(X_test, y_test)*100
baseline_train_pct = logreg.score(X_train,y_train)*100
print("Accuracy Score for Test Dataset is ",baseline_test_pct,"%")
print("Accuracy Score for Train Dataset is",baseline_train_pct,"%")

## Scaling

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

## Hyperparameter Tuning

In [None]:
# Search space: L2 regularization with several values of C
# (C is the inverse of the regularization strength)
param_grid = dict(penalty=['l2'], C=[0.001, 0.01, 0.1, 1, 10])

# 5-fold cross-validated grid search on the standardized training features,
# selecting by accuracy
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_logreg = grid_search.best_estimator_

# Score the selected model on the standardized test set
y_pred = best_logreg.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Best hyperparameters: {best_params}")
print(f"Accuracy on test set: {accuracy:.2f}")

In [None]:
from sklearn.pipeline import make_pipeline

# Final model. The grid search tuned C on *standardized* features, so the
# final estimator has to see standardized features as well; fitting on the raw
# columns with a hard-coded C applies a regularization strength that was never
# validated at that feature scale. Bundling the scaler and the classifier in
# one pipeline keeps the rest of the notebook unchanged: `model` still accepts
# the raw feature columns in fit/predict/score, and the pickled artefact
# carries its own preprocessing.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        **grid_search.best_params_,  # tuned 'C' and 'penalty'
        solver='lbfgs',
        max_iter=1000,
        random_state=42,
    ),
)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the final model
y_pred = model.predict(X_test)

# Confusion matrix and text report are reused by later cells / printed below
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Scalar metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report every metric, one per line
metric_report = [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
]
for label, value in metric_report:
    print(label, value)

In [None]:
# Annotated heatmap of the confusion matrix (rows: true labels, columns: predictions)
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)
heatmap_options = dict(
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 16},
    linewidths=0.5,
    square=True,
)
confusion_ax = sns.heatmap(conf_matrix, **heatmap_options)
confusion_ax.set_xlabel('Predicted Labels')
confusion_ax.set_ylabel('True Labels')
confusion_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Side-by-side table of the true test labels and the model's predictions
df1=pd.DataFrame({'Actual': y_test, 'Predict': y_pred})
df1

In [None]:
# Accuracy of the final model on both splits, as percentages
final_test_pct = model.score(X_test, y_test)*100
final_train_pct = model.score(X_train,y_train)*100
print("Accuracy Score for Test Dataset is ",final_test_pct,"%")
print("Accuracy Score for Train Dataset is",final_train_pct,"%")

## Save the model

In [None]:
# Serialize the trained model to disk with pickle (binary write mode)
with open("mytrainedplacement_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

## Load the Model

In [None]:
# Load the pickled model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open("mytrainedplacement_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)