In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the data
train_features = pd.read_csv('Train_Features.csv')
train_target = pd.read_csv('Train_Target.csv')

In [9]:
# Merge features and target
train_data = pd.merge(train_features, train_target, on='ID')

In [10]:
train_data.head()

Unnamed: 0,ID,Gender,Percent_SSC,Board_SSC,Percent_HSC,Board_HSC,Stream_HSC,Percent_Degree,Course_Degree,Experience_Yrs,Entrance_Test,S-TEST,Percentile_ET,S-TEST*SCORE,Percent_MBA,Specialization_MBA,Marks_Communication,Marks_Projectwork,Marks_BOCA,Placement
0,1,M,56.0,ICSE,58.0,ISC,Commerce,67.0,Management,0,,0,0.0,0.0,65.28,Marketing & Finance,62,77,77,1
1,2,M,41.0,Others,51.0,Others,Science,61.0,Computer Applications,1,MAT,1,86.0,86.0,62.48,Marketing & Finance,59,72,75,0
2,3,F,53.0,Others,40.0,Others,Arts,54.0,Arts,1,MAT,1,65.0,65.0,56.11,Marketing & HR,54,66,75,1
3,4,M,59.0,Others,58.0,Others,Commerce,59.0,Management,0,G-MAT,1,0.0,0.0,59.81,Marketing & HR,53,66,78,0
4,5,F,61.5,Others,65.4,CBSE,Arts,67.93,Management,0,MAT,1,61.0,61.0,64.27,Marketing & Finance,69,69,61,0


In [13]:
# Preprocessing for correlation and feature importance analysis
def preprocess_data(df):
    # Create a copy of the dataframe
    processed_df = df.copy()
    
    # Handle categorical variables
    categorical_cols = processed_df.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    
    for col in categorical_cols:
        # Fill missing values with most frequent value
        processed_df[col] = processed_df[col].fillna(processed_df[col].mode()[0])
        # Encode categorical variables
        processed_df[col] = le.fit_transform(processed_df[col].astype(str))
    
    # Fill missing numerical values with median
    numerical_cols = processed_df.select_dtypes(include=['int64', 'float64']).columns
    for col in numerical_cols:
        processed_df[col] = processed_df[col].fillna(processed_df[col].median())
    
    return processed_df

# Preprocess the data
processed_data = preprocess_data(train_data)

# Separate features and target
X = processed_data.drop(['ID', 'Placement'], axis=1)
y = processed_data['Placement']

# 1. Correlation Analysis
plt.figure(figsize=(20, 16))
correlation_matrix = X.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('feature_correlation_heatmap.png')
plt.close()

# Print top correlations with Placement
placement_correlations = X.apply(lambda col: col.corr(y))
print("Top Correlations with Placement:")
print(placement_correlations.sort_values(ascending=False).head(10))
print("\nBottom Correlations with Placement:")
print(placement_correlations.sort_values(ascending=True).head(10))


Top Correlations with Placement:
Board_HSC              0.118256
Board_SSC              0.108610
Entrance_Test          0.055773
Stream_HSC            -0.016793
Marks_Communication   -0.019092
Experience_Yrs        -0.035116
Course_Degree         -0.036039
Specialization_MBA    -0.039476
Gender                -0.044031
S-TEST                -0.048281
dtype: float64

Bottom Correlations with Placement:
Percent_SSC         -0.183258
Marks_Projectwork   -0.157975
S-TEST*SCORE        -0.131428
Percentile_ET       -0.131428
Percent_MBA         -0.094157
Percent_HSC         -0.082300
Marks_BOCA          -0.067801
Percent_Degree      -0.057246
S-TEST              -0.048281
Gender              -0.044031
dtype: float64


In [15]:
# 2. Mutual Information Analysis
def calculate_mutual_information(X, y):
    # Calculate mutual information scores
    mi_scores = mutual_info_classif(X, y)
    
    # Create a dataframe of features and their mutual information scores
    mi_df = pd.DataFrame({
        'Feature': X.columns,
        'Mutual Information Score': mi_scores
    })
    
    # Sort by mutual information score in descending order
    return mi_df.sort_values('Mutual Information Score', ascending=False)
# Calculate mutual information
mi_scores = calculate_mutual_information(X, y)
print("\nMutual Information Scores:")
print(mi_scores)

# Visualize Mutual Information Scores
plt.figure(figsize=(12, 8))
sns.barplot(x='Mutual Information Score', y='Feature', data=mi_scores.head(15))
plt.title('Top 15 Features by Mutual Information Score')
plt.tight_layout()
plt.savefig('mutual_information_scores.png')
plt.close()



Mutual Information Scores:
                Feature  Mutual Information Score
1           Percent_SSC                  0.038017
8        Experience_Yrs                  0.035490
3           Percent_HSC                  0.027001
2             Board_SSC                  0.021885
5            Stream_HSC                  0.020145
15  Marks_Communication                  0.020072
14   Specialization_MBA                  0.015842
4             Board_HSC                  0.008828
13          Percent_MBA                  0.008533
17           Marks_BOCA                  0.006271
16    Marks_Projectwork                  0.005800
6        Percent_Degree                  0.004849
0                Gender                  0.000000
12         S-TEST*SCORE                  0.000000
11        Percentile_ET                  0.000000
10               S-TEST                  0.000000
7         Course_Degree                  0.000000
9         Entrance_Test                  0.000000


In [16]:
# 3. Chi-Square Feature Selection (for categorical variables)
from sklearn.feature_selection import chi2, SelectKBest

# Prepare data for chi-square test (non-negative values)
X_chi2 = X.copy()
X_chi2 = X_chi2 - X_chi2.min()

# Perform chi-square test
chi2_selector = SelectKBest(chi2, k=10)
chi2_selector.fit(X_chi2, y)

# Get feature scores
chi2_scores = pd.DataFrame({
    'Feature': X.columns,
    'Chi-Square Score': chi2_selector.scores_
})
chi2_scores = chi2_scores.sort_values('Chi-Square Score', ascending=False)

print("\nChi-Square Feature Importance:")
print(chi2_scores)

# Visualize Chi-Square Scores
plt.figure(figsize=(12, 8))
sns.barplot(x='Chi-Square Score', y='Feature', data=chi2_scores.head(15))
plt.title('Top 15 Features by Chi-Square Score')
plt.tight_layout()
plt.savefig('chi_square_scores.png')
plt.close()



Chi-Square Feature Importance:
                Feature  Chi-Square Score
11        Percentile_ET         91.616182
12         S-TEST*SCORE         91.616182
1           Percent_SSC         45.714550
16    Marks_Projectwork         19.888529
3           Percent_HSC         11.386340
17           Marks_BOCA          8.457160
13          Percent_MBA          8.352736
6        Percent_Degree          2.920369
2             Board_SSC          2.299714
4             Board_HSC          2.207039
15  Marks_Communication          0.807423
7         Course_Degree          0.358386
8        Experience_Yrs          0.343132
14   Specialization_MBA          0.327608
9         Entrance_Test          0.313255
0                Gender          0.186118
10               S-TEST          0.123546
5            Stream_HSC          0.020091


In [17]:
# 4. Logistic Regression Coefficients
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_scaled, y)

# Get feature coefficients
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': np.abs(lr.coef_[0])
})
coef_df = coef_df.sort_values('Coefficient', ascending=False)

print("\nLogistic Regression Feature Coefficients:")
print(coef_df)

# Visualize Logistic Regression Coefficients
plt.figure(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=coef_df.head(15))
plt.title('Top 15 Features by Logistic Regression Coefficient Magnitude')
plt.tight_layout()
plt.savefig('logistic_regression_coefficients.png')
plt.close()



Logistic Regression Feature Coefficients:
                Feature  Coefficient
1           Percent_SSC     0.657072
16    Marks_Projectwork     0.455159
10               S-TEST     0.434558
15  Marks_Communication     0.378881
14   Specialization_MBA     0.302402
12         S-TEST*SCORE     0.286837
11        Percentile_ET     0.286837
0                Gender     0.269917
4             Board_HSC     0.204752
9         Entrance_Test     0.198178
5            Stream_HSC     0.190910
2             Board_SSC     0.185638
13          Percent_MBA     0.127119
7         Course_Degree     0.116158
3           Percent_HSC     0.105067
8        Experience_Yrs     0.056984
6        Percent_Degree     0.050692
17           Marks_BOCA     0.044879


In [18]:
# Comprehensive Summary
def summarize_feature_importance():
    print("\n--- Comprehensive Feature Importance Analysis ---")
    
    # Combine different feature importance methods
    methods = {
        'Correlation with Placement': placement_correlations,
        'Mutual Information Score': mi_scores.set_index('Feature')['Mutual Information Score'],
        'Chi-Square Score': chi2_scores.set_index('Feature')['Chi-Square Score'],
        'Logistic Regression Coefficient': coef_df.set_index('Feature')['Coefficient']
    }
    
    # Find common top features across methods
    top_features = {}
    for method_name, scores in methods.items():
        top_features[method_name] = list(scores.sort_values(ascending=False).head(10).index)
    
    print("\nTop 10 Features by Different Methods:")
    for method, features in top_features.items():
        print(f"\n{method}:")
        for i, feature in enumerate(features, 1):
            print(f"{i}. {feature}")
    
    # Find features that appear consistently across methods
    from collections import Counter
    all_top_features = [feature for features in top_features.values() for feature in features]
    consistent_features = [feat for feat, count in Counter(all_top_features).items() if count > 1]
    
    print("\nConsistently Important Features:")
    for feature in consistent_features:
        print(feature)

# Run summary
summarize_feature_importance()


--- Comprehensive Feature Importance Analysis ---

Top 10 Features by Different Methods:

Correlation with Placement:
1. Board_HSC
2. Board_SSC
3. Entrance_Test
4. Stream_HSC
5. Marks_Communication
6. Experience_Yrs
7. Course_Degree
8. Specialization_MBA
9. Gender
10. S-TEST

Mutual Information Score:
1. Percent_SSC
2. Experience_Yrs
3. Percent_HSC
4. Board_SSC
5. Stream_HSC
6. Marks_Communication
7. Specialization_MBA
8. Board_HSC
9. Percent_MBA
10. Marks_BOCA

Chi-Square Score:
1. Percentile_ET
2. S-TEST*SCORE
3. Percent_SSC
4. Marks_Projectwork
5. Percent_HSC
6. Marks_BOCA
7. Percent_MBA
8. Percent_Degree
9. Board_SSC
10. Board_HSC

Logistic Regression Coefficient:
1. Percent_SSC
2. Marks_Projectwork
3. S-TEST
4. Marks_Communication
5. Specialization_MBA
6. S-TEST*SCORE
7. Percentile_ET
8. Gender
9. Board_HSC
10. Entrance_Test

Consistently Important Features:
Board_HSC
Board_SSC
Entrance_Test
Stream_HSC
Marks_Communication
Experience_Yrs
Specialization_MBA
Gender
S-TEST
Percent_SS