In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
#1.Load Data#

train_features_df = pd.read_csv('Train_Features.csv')
train_target_df = pd.read_csv('Train_Target.csv')
test_features_df = pd.read_csv('Test_Features.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

In [3]:
# Display basic information about the datasets
print("Training features shape:", train_features_df.shape)
print("Training target shape:", train_target_df.shape)
print("Test features shape:", test_features_df.shape)
print("Sample submission shape:", sample_submission_df.shape)

Training features shape: (300, 19)
Training target shape: (300, 2)
Test features shape: (91, 19)
Sample submission shape: (91, 2)


In [4]:
# Merge training features with target
train_data = pd.merge(train_features_df, train_target_df, on='ID')

In [5]:
# Check first few rows
print("\nFirst 5 rows of training data:")
print(train_data.head())


First 5 rows of training data:
   ID Gender  Percent_SSC Board_SSC  Percent_HSC Board_HSC Stream_HSC  \
0   1      M         56.0      ICSE         58.0       ISC   Commerce   
1   2      M         41.0    Others         51.0    Others    Science   
2   3      F         53.0    Others         40.0    Others       Arts   
3   4      M         59.0    Others         58.0    Others   Commerce   
4   5      F         61.5    Others         65.4      CBSE       Arts   

   Percent_Degree          Course_Degree  Experience_Yrs Entrance_Test  \
0           67.00             Management               0           NaN   
1           61.00  Computer Applications               1           MAT   
2           54.00                   Arts               1           MAT   
3           59.00             Management               0         G-MAT   
4           67.93             Management               0           MAT   

   S-TEST  Percentile_ET  S-TEST*SCORE  Percent_MBA   Specialization_MBA  \
0       

In [6]:
# Check data types and missing values
print("\nData types:")
print(train_data.dtypes)

print("\nMissing values in training data:")
print(train_data.isnull().sum())

print("\nMissing values in test data:")
print(test_features_df.isnull().sum())


Data types:
ID                       int64
Gender                  object
Percent_SSC            float64
Board_SSC               object
Percent_HSC            float64
Board_HSC               object
Stream_HSC              object
Percent_Degree         float64
Course_Degree           object
Experience_Yrs           int64
Entrance_Test           object
S-TEST                   int64
Percentile_ET          float64
S-TEST*SCORE           float64
Percent_MBA            float64
Specialization_MBA      object
Marks_Communication      int64
Marks_Projectwork        int64
Marks_BOCA               int64
Placement                int64
dtype: object

Missing values in training data:
ID                      0
Gender                  0
Percent_SSC             0
Board_SSC               0
Percent_HSC             0
Board_HSC               0
Stream_HSC              0
Percent_Degree          0
Course_Degree           0
Experience_Yrs          0
Entrance_Test          53
S-TEST                  0
Percent

In [7]:
# Basic statistics
print("\nBasic statistics of numerical columns:")
print(train_data.describe())


Basic statistics of numerical columns:
               ID  Percent_SSC  Percent_HSC  Percent_Degree  Experience_Yrs  \
count  300.000000   300.000000   300.000000      300.000000      300.000000   
mean   150.500000    64.472933    63.631667       63.228433        0.506667   
std     86.746758    11.183569    11.526602        9.172405        0.686678   
min      1.000000    37.000000    40.000000       35.000000        0.000000   
25%     75.750000    56.000000    54.000000       58.000000        0.000000   
50%    150.500000    64.250000    63.000000       63.845000        0.000000   
75%    225.250000    73.820000    72.250000       69.000000        1.000000   
max    300.000000    87.200000    94.700000       88.800000        3.000000   

           S-TEST  Percentile_ET  S-TEST*SCORE  Percent_MBA  \
count  300.000000     300.000000    300.000000   300.000000   
mean     0.823333      54.598300     54.598300    61.655733   
std      0.382024      31.120892     31.120892     5.840537

In [8]:
# Analyzing the target variable distribution
print("\nTarget variable distribution:")
print(train_data['Placement'].value_counts())
print(train_data['Placement'].value_counts(normalize=True) * 100)


Target variable distribution:
Placement
0    239
1     61
Name: count, dtype: int64
Placement
0    79.666667
1    20.333333
Name: proportion, dtype: float64


In [9]:
# Visualize the target distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='Placement', data=train_data)
plt.title('Placement Distribution')
plt.xlabel('Placement (0: Placed, 1: Not Placed)')
plt.ylabel('Count')
plt.savefig('placement_distribution.png')
plt.close()

In [10]:
# Correlation analysis for numerical features
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
if 'ID' in numerical_cols:
    numerical_cols.remove('ID')
if 'Placement' in numerical_cols:
    numerical_cols.remove('Placement')

plt.figure(figsize=(12, 10))
correlation_matrix = train_data[numerical_cols + ['Placement']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

In [11]:
# Handling missing values in Entrance_Test column
# Check unique values in the column
print("\nUnique values in Entrance_Test column:")
print(train_data['Entrance_Test'].value_counts())


Unique values in Entrance_Test column:
Entrance_Test
MAT      199
K-MAT     20
CAT       18
PGCET      6
GCET       2
G-MAT      1
G-SAT      1
Name: count, dtype: int64


In [12]:
# Let's look at Percentile_ET distribution based on Entrance_Test
print("\nPercentile_ET statistics by Entrance_Test:")
print(train_data.groupby('Entrance_Test')['Percentile_ET'].describe())


Percentile_ET statistics by Entrance_Test:
               count       mean        std    min      25%   50%     75%  \
Entrance_Test                                                              
CAT             18.0  59.299444  20.110281  13.00  53.8475  64.0  72.250   
G-MAT            1.0   0.000000        NaN   0.00   0.0000   0.0   0.000   
G-SAT            1.0   0.000000        NaN   0.00   0.0000   0.0   0.000   
GCET             2.0  45.000000   7.071068  40.00  42.5000  45.0  47.500   
K-MAT           20.0  74.459000  20.605304  14.99  64.7500  77.6  89.575   
MAT            199.0  68.336281  16.163695  22.95  57.1000  67.0  80.200   
PGCET            6.0  22.333333  36.423436   0.00   0.0000   0.0  36.750   

                 max  
Entrance_Test         
CAT            80.00  
G-MAT           0.00  
G-SAT           0.00  
GCET           50.00  
K-MAT          98.69  
MAT            98.00  
PGCET          85.00  


In [13]:
# Handling missing values in Entrance_Test column
# Check unique values in the column
print("\nUnique values in Entrance_Test column:")
print(train_data['Entrance_Test'].value_counts())

# Let's look at Percentile_ET distribution based on Entrance_Test
print("\nPercentile_ET statistics by Entrance_Test:")
print(train_data.groupby('Entrance_Test')['Percentile_ET'].describe())

# Strategy for handling missing values:
# 1. For numerical columns: impute with median
# 2. For categorical columns: impute with most frequent value

# Identify categorical and numerical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()
if 'ID' in categorical_cols:
    categorical_cols.remove('ID')

print("\nNumerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

# Prepare preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Prepare preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


Unique values in Entrance_Test column:
Entrance_Test
MAT      199
K-MAT     20
CAT       18
PGCET      6
GCET       2
G-MAT      1
G-SAT      1
Name: count, dtype: int64

Percentile_ET statistics by Entrance_Test:
               count       mean        std    min      25%   50%     75%  \
Entrance_Test                                                              
CAT             18.0  59.299444  20.110281  13.00  53.8475  64.0  72.250   
G-MAT            1.0   0.000000        NaN   0.00   0.0000   0.0   0.000   
G-SAT            1.0   0.000000        NaN   0.00   0.0000   0.0   0.000   
GCET             2.0  45.000000   7.071068  40.00  42.5000  45.0  47.500   
K-MAT           20.0  74.459000  20.605304  14.99  64.7500  77.6  89.575   
MAT            199.0  68.336281  16.163695  22.95  57.1000  67.0  80.200   
PGCET            6.0  22.333333  36.423436   0.00   0.0000   0.0  36.750   

                 max  
Entrance_Test         
CAT            80.00  
G-MAT           0.00  
G-SAT   

In [14]:
# Split training data into train and validation sets
X = train_data.drop(['ID', 'Placement'], axis=1)
y = train_data['Placement']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTraining set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)



Training set shape: (240, 18)
Validation set shape: (60, 18)


In [16]:
print("\nMissing values in processed training data:")
print(X_train.isnull().sum().sum())
print("\nMissing values in processed validation data:")
print(X_val.isnull().sum().sum())


Missing values in processed training data:
43

Missing values in processed validation data:
10


In [17]:
# Apply preprocessor to check imputation
X_train_processed = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    # Note: Column names will be lost after transformation
)
X_val_processed = pd.DataFrame(
    preprocessor.transform(X_val),
    # Note: Column names will be lost after transformation
)

print("\nMissing values in processed training data:")
print(X_train_processed.isnull().sum().sum())
print("\nMissing values in processed validation data:")
print(X_val_processed.isnull().sum().sum())


Missing values in processed training data:
0

Missing values in processed validation data:
0


In [None]:
#Check for class imbalance
print("\nClass distribution in training set:")
print(y_train.value_counts())

In [19]:
# Define models to try
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Dictionary to store model performance
model_scores = {}

In [20]:
# Train and evaluate each model
for name, model in models.items():
    print(f"\n{'-'*50}\nTraining {name}...")
    
    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions on validation set
    y_pred = pipeline.predict(X_val)
    
    # Evaluate the model
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    model_scores[name] = (accuracy, f1)
    
    print(f"{name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_val, y_pred))
    
    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f'confusion_matrix_{name}.png')
    plt.close()



--------------------------------------------------
Training RandomForest...
RandomForest - Accuracy: 0.8000, F1 Score: 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        48
           1       0.00      0.00      0.00        12

    accuracy                           0.80        60
   macro avg       0.40      0.50      0.44        60
weighted avg       0.64      0.80      0.71        60


--------------------------------------------------
Training GradientBoosting...
GradientBoosting - Accuracy: 0.7333, F1 Score: 0.1111
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.84        48
           1       0.17      0.08      0.11        12

    accuracy                           0.73        60
   macro avg       0.48      0.49      0.48        60
weighted avg       0.67      0.73      0.70        60


-------------------------------------

In [21]:
# Select the best model based on F1 score
best_model_name = max(model_scores, key=lambda x: model_scores[x][1])
best_f1 = model_scores[best_model_name][1]
print(f"\nBest model: {best_model_name} with F1 Score of {best_f1:.4f}")

# Fine-tune the best model
print(f"\n{'-'*50}\nFine-tuning {best_model_name}...")

if best_model_name == 'RandomForest':
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10]
    }
elif best_model_name == 'GradientBoosting':
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }
elif best_model_name == 'LogisticRegression':
    param_grid = {
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
        'classifier__solver': ['liblinear', 'saga']
    }
else:  # XGBoost
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.01, 0.1, 0.2],
        'classifier__max_depth': [3, 5, 7]
    }

# Create the base pipeline with the best model
best_model = models[best_model_name]
best_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])

# Perform grid search
grid_search = GridSearchCV(
    best_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")



Best model: LogisticRegression with F1 Score of 0.2105

--------------------------------------------------
Fine-tuning LogisticRegression...
Best parameters: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best cross-validation F1 score: 0.2096


In [22]:
# Evaluate the tuned model on validation set
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_val)
tuned_f1 = f1_score(y_val, y_pred_tuned)
tuned_accuracy = accuracy_score(y_val, y_pred_tuned)

print(f"\nTuned {best_model_name} - Accuracy: {tuned_accuracy:.4f}, F1 Score: {tuned_f1:.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred_tuned))



Tuned LogisticRegression - Accuracy: 0.7333, F1 Score: 0.2000
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84        48
           1       0.25      0.17      0.20        12

    accuracy                           0.73        60
   macro avg       0.53      0.52      0.52        60
weighted avg       0.70      0.73      0.71        60



In [23]:
# Feature importance analysis (if the model supports it)
if best_model_name in ['RandomForest', 'GradientBoosting', 'XGBoost']:
    # Get feature names after preprocessing
    preprocessor.fit(X_train)
    cat_features = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols)
    feature_names = numerical_cols + list(cat_features)
    
    # Get feature importances
    if best_model_name == 'XGBoost':
        importances = tuned_model.named_steps['classifier'].feature_importances_
    else:
        importances = tuned_model.named_steps['classifier'].feature_importances_
    
    # Create a DataFrame for better visualization
    importance_df = pd.DataFrame({
        'Feature': feature_names[:len(importances)],  # Ensure matching length
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    
    # Plot top 20 features
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    
    print("\nTop 10 important features:")
    print(importance_df.head(10))

# Final model training on complete training data
print("\nTraining final model on complete training dataset...")
final_model = grid_search.best_estimator_
final_model.fit(X, y)

# Make predictions on test data
print("\nMaking predictions on test data...")
test_predictions = final_model.predict(test_features_df.drop('ID', axis=1))

# Create submission file
submission = pd.DataFrame({
    'ID': test_features_df['ID'],
    'Placement': test_predictions
})

submission.to_csv('final_submission.csv', index=False)
print("Submission file created: final_submission.csv")

# Summary of model performance
print("\nSummary of model performance:")
for name, (acc, f1) in model_scores.items():
    print(f"{name:20} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
print(f"\nTuned {best_model_name:20} - Accuracy: {tuned_accuracy:.4f}, F1 Score: {tuned_f1:.4f}")



Training final model on complete training dataset...

Making predictions on test data...
Submission file created: final_submission.csv

Summary of model performance:
RandomForest         - Accuracy: 0.8000, F1 Score: 0.0000
GradientBoosting     - Accuracy: 0.7333, F1 Score: 0.1111
LogisticRegression   - Accuracy: 0.7500, F1 Score: 0.2105
XGBoost              - Accuracy: 0.7667, F1 Score: 0.1250

Tuned LogisticRegression   - Accuracy: 0.7333, F1 Score: 0.2000


In [24]:
# View the final submission file
print("\nFinal submission file:")
print(submission.head())


Final submission file:
   ID  Placement
0   1          0
1   2          0
2   3          0
3   4          0
4   5          1
