In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Step 1: Load dataset
# The path is relative, so it is resolved against the current working
# directory of the running interpreter/kernel rather than this file's location.
file_path = '100 ML/placed.csv'
try:
    data = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Re-raise the same exception type with a hint on how to fix it; the
    # original error stays attached as the cause.
    raise FileNotFoundError(
        f"Dataset not found at {file_path!r}. The path is relative to the "
        "current working directory; update `file_path` or run from the "
        "directory that contains it."
    ) from err

# Step 2: Data Exploration and Preprocessing
# Report how many values are missing in each column
missing_per_column = data.isnull().sum()
print("Missing Values:\n", missing_per_column)

# Integer-encode the two-valued columns with one shared encoder, refit per column.
# LabelEncoder numbers classes in sorted order, so `status` presumably ends up
# as 1 for placed and 0 for not placed - confirm against the labels in the CSV.
label_encoder = LabelEncoder()
for binary_column in ('gender', 'workex', 'status'):
    data[binary_column] = label_encoder.fit_transform(data[binary_column])

# One-hot encode the remaining categorical columns, then discard the
# 'sl_no' column, which is treated as irrelevant to the prediction.
multi_category_columns = ['degree_t', 'specialisation', 'hsc_s', 'ssc_b', 'hsc_b']
data = pd.get_dummies(data, columns=multi_category_columns).drop(columns=['sl_no'])

# Step 3: Exploratory Data Analysis (EDA)
# Bar chart of the number of rows in each encoded placement class
status_axes = sns.countplot(data=data, x='status')
status_axes.set_title('Placement Status Distribution')
plt.show()

# Pairwise correlations between all columns, annotated with their values
plt.figure(figsize=(12, 8))
correlations = data.corr()
corr_axes = sns.heatmap(correlations, annot=True, cmap='coolwarm')
corr_axes.set_title('Correlation Heatmap')
plt.show()

# Step 4: Feature Selection
# Use every column except the target, and also exclude columns that would
# leak the outcome into the features.
# NOTE(review): assumes the CSV follows the public campus-placement schema, in
# which `salary` is filled in only for placed students. Such a column reveals
# the target and leaves NaN in every unplaced row, which LogisticRegression.fit
# rejects. It is dropped only when present - confirm against the actual file.
leaky_columns = [col for col in ['salary'] if col in data.columns]
X = data.drop(columns=['status'] + leaky_columns)  # Independent variables (features)
y = data['status']  # Dependent variable (target)

# Step 5: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Model Building
# Logistic Regression Model
# The features are not standardized anywhere in this script, so the solver can
# need more than the default 100 iterations; a higher cap avoids stopping
# early with a ConvergenceWarning and an under-fitted model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the Logistic Regression model on the held-out rows
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred))

# Random Forest Classifier Model
# Seed the forest (same seed as the train/test split) so the reported metrics
# and the feature importances plotted later are reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test data using Random Forest
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest model on the held-out rows
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

# Step 7: Confusion Matrix for Random Forest
# confusion_matrix puts the true classes on rows and the predicted classes on
# columns; label the axes accordingly so the heatmap can be read unambiguously.
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Step 8: Feature Importance for Random Forest
importances = rf_model.feature_importances_
features = X.columns

# Rank the features from most to least important and draw them as bars,
# labelling each bar with its column name.
indices = np.argsort(importances)[::-1]
bar_positions = range(len(features))
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance")
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(features[indices], rotation=90)
plt.show()

# Conclusion: print the test-set accuracy of both models one after the other
for headline, predictions in (
    ("Logistic Regression Accuracy: ", y_pred),
    ("Random Forest Accuracy: ", y_pred_rf),
):
    print(headline, accuracy_score(y_test, predictions))


FileNotFoundError: [Errno 2] No such file or directory: '100 ML/placed.csv'