In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

In [4]:
# Location of the cleaned placement dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of Cleaned_data.csv.
file_path = "C:/Users/rashi/OneDrive/Desktop/BE SEM1/ML/ML FINAL ASSIGNMENTS/Assignment-1/ML/Cleaned_data.csv"

# Read the CSV into the working DataFrame used by every later cell.
data = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Show the first five rows of the frame exactly as it was loaded.
print(data.head(n=5))

   SR.NO      Status  10TH MARKS Branch 12TH MARKS DIPLOMA  ENGG.AVG%
0      1  Not Placed       86.80     IT        NaN   93.44      73.39
1      2      Placed       84.40     IT         85     NaN      74.53
2      3  Not Placed       87.08     IT       80.4     NaN      71.64
3      4      Placed       92.60     IT      80.15     NaN      80.74
4      5  Not Placed       83.40     IT         58     NaN      80.74


In [6]:
 # Remove the 'SR.NO' column. Reassigning (instead of inplace=True) together
 # with errors='ignore' keeps this cell safe to re-run: a second execution no
 # longer raises KeyError because the column is already gone.
 data = data.drop(columns=['SR.NO'], errors='ignore')

In [7]:
# Force every marks column to a numeric dtype. errors='coerce' turns any value
# that cannot be parsed as a number into NaN instead of raising.
for mark_column in ('10TH MARKS', '12TH MARKS', 'DIPLOMA', 'ENGG.AVG%'):
    data[mark_column] = pd.to_numeric(data[mark_column], errors='coerce')

In [8]:
# Show the first five rows after the numeric coercion step.
print(data.head(n=5))

       Status  10TH MARKS Branch  12TH MARKS  DIPLOMA  ENGG.AVG%
0  Not Placed       86.80     IT         NaN    93.44      73.39
1      Placed       84.40     IT       85.00      NaN      74.53
2  Not Placed       87.08     IT       80.40      NaN      71.64
3      Placed       92.60     IT       80.15      NaN      80.74
4  Not Placed       83.40     IT       58.00      NaN      80.74


In [9]:
# Build one combined score column: take the 12th marks where present and
# fall back to the diploma marks on rows where the 12th marks are missing.
twelfth_marks = data['12TH MARKS']
diploma_marks = data['DIPLOMA']
data['12TH MARKS OR DIPLOMA'] = twelfth_marks.fillna(diploma_marks)

In [10]:
# Show the first five rows, now including the combined '12TH MARKS OR DIPLOMA' column.
print(data.head(n=5))

       Status  10TH MARKS Branch  12TH MARKS  DIPLOMA  ENGG.AVG%  \
0  Not Placed       86.80     IT         NaN    93.44      73.39   
1      Placed       84.40     IT       85.00      NaN      74.53   
2  Not Placed       87.08     IT       80.40      NaN      71.64   
3      Placed       92.60     IT       80.15      NaN      80.74   
4  Not Placed       83.40     IT       58.00      NaN      80.74   

   12TH MARKS OR DIPLOMA  
0                  93.44  
1                  85.00  
2                  80.40  
3                  80.15  
4                  58.00  


In [11]:
# The two source columns are superseded by '12TH MARKS OR DIPLOMA'; remove them.
superseded_columns = ['12TH MARKS', 'DIPLOMA']
data = data.drop(superseded_columns, axis='columns')

In [12]:
# Show the first five rows after dropping the separate 12th / diploma columns.
print(data.head(n=5))

       Status  10TH MARKS Branch  ENGG.AVG%  12TH MARKS OR DIPLOMA
0  Not Placed       86.80     IT      73.39                  93.44
1      Placed       84.40     IT      74.53                  85.00
2  Not Placed       87.08     IT      71.64                  80.40
3      Placed       92.60     IT      80.74                  80.15
4  Not Placed       83.40     IT      80.74                  58.00


In [13]:
# Median imputation: any remaining NaN in a marks column is replaced by that
# column's own median (each column is handled independently).
for mark_column in ('10TH MARKS', '12TH MARKS OR DIPLOMA', 'ENGG.AVG%'):
    column_median = data[mark_column].median()
    data[mark_column] = data[mark_column].fillna(column_median)

In [14]:
# Show the first five rows of the imputed frame ...
print(data.head(n=5))

# ... and persist it (without the index column) to the working directory.
data.to_csv(path_or_buf='Preprocessed_data.csv', index=False)

       Status  10TH MARKS Branch  ENGG.AVG%  12TH MARKS OR DIPLOMA
0  Not Placed       86.80     IT      73.39                  93.44
1      Placed       84.40     IT      74.53                  85.00
2  Not Placed       87.08     IT      71.64                  80.40
3      Placed       92.60     IT      80.74                  80.15
4  Not Placed       83.40     IT      80.74                  58.00


In [15]:
# Numeric marks columns that are rescaled here and later used as model inputs.
numerical_cols = ['10TH MARKS', '12TH MARKS OR DIPLOMA', 'ENGG.AVG%']

# Min-max normalisation, done as an explicit fit followed by a transform.
# The fitted `scaler` is kept because predict_placement() reuses it.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/test split performed later in the notebook.
scaler = MinMaxScaler()
scaler.fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])

In [16]:
# Standardise the (already min-max scaled) numeric columns.
# The fitted `standardizer` is kept because predict_placement() reuses it.
# NOTE(review): this is applied on top of the min-max step above, so the
# columns are rescaled twice; confirm both steps are intended.
standardizer = StandardScaler()
standardizer.fit(data[numerical_cols])
data[numerical_cols] = standardizer.transform(data[numerical_cols])

In [17]:
# One-hot encode the categorical 'Branch' column into dense indicator columns.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the current name first and fall back to the
# legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

branch_encoded = encoder.fit_transform(data[['Branch']])

# Reuse data's index so the column-wise concat below lines rows up by label
# rather than relying on both frames happening to share a default RangeIndex.
branch_encoded_df = pd.DataFrame(
    branch_encoded,
    columns=encoder.get_feature_names_out(['Branch']),
    index=data.index,
)

# Drop any indicator columns left over from a previous run of this cell so
# re-executing it does not append duplicate 'Branch_*' columns.
data = pd.concat(
    [data.drop(columns=list(branch_encoded_df.columns), errors='ignore'), branch_encoded_df],
    axis=1,
)



In [18]:
# Discard exact duplicate rows.
data = data.drop_duplicates()

# 3-sigma outlier filter, applied one column at a time. Each column's mean and
# standard deviation are computed on the rows that survived the previous
# columns' filters, so the order of `numerical_cols` matters.
for feature in numerical_cols:
    centre = data[feature].mean()
    spread = data[feature].std()
    within_three_sigma = np.abs(data[feature] - centre) <= (3 * spread)
    data = data[within_three_sigma]

In [19]:
# Encode the target column as integers: 'Placed' -> 1, 'Not Placed' -> 0.
status_codes = {'Placed': 1, 'Not Placed': 0}

# Values that are not keys of the mapping (notably 0/1 from an earlier run of
# this cell) pass through unchanged, so re-running does not turn every label
# into NaN the way a plain dict-based .map() would.
encoded_status = data['Status'].map(lambda label: status_codes.get(label, label))

# Fail loudly on anything that is neither a known label nor already 0/1
# (including NaN) instead of silently handing NaN targets to the model.
unrecognized = encoded_status[~encoded_status.isin([0, 1])]
if not unrecognized.empty:
    raise ValueError(
        f"Unexpected values in 'Status': {unrecognized.unique().tolist()}"
    )

# .assign() builds a new frame, which avoids pandas' SettingWithCopyWarning
# when `data` is the result of earlier row filtering.
data = data.assign(Status=encoded_status.astype('int64'))

In [20]:
# Feature matrix: the three scaled marks columns followed by the one-hot
# 'Branch' indicator columns, in that order.
feature_columns = ['10TH MARKS', '12TH MARKS OR DIPLOMA', 'ENGG.AVG%']
feature_columns = feature_columns + list(branch_encoded_df.columns)
X = data[feature_columns]

# Target vector: the encoded placement status.
y = data['Status']

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Logistic-regression classifier with scikit-learn's default settings,
# trained on the training portion of the split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [23]:
# Predict on the held-out rows and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.55      0.67      0.60        27
           1       0.64      0.52      0.57        31

    accuracy                           0.59        58
   macro avg       0.59      0.59      0.59        58
weighted avg       0.60      0.59      0.58        58



In [24]:
def predict_placement(model, input_data, scaler, standardizer, encoder):
    """Predict the placement status for a single candidate.

    The sample is pushed through the same steps, in the same order, that the
    notebook applies to the training frame: min-max scaling, standardisation,
    one-hot encoding of the branch, then classification.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``.
    input_data
        One sample. Either a dict with the keys ``'10TH MARKS'``,
        ``'12TH MARKS OR DIPLOMA'``, ``'ENGG.AVG%'`` and ``'Branch'``, or a
        sequence holding those four values in that order.
    scaler
        Fitted transformer applied first to the three numeric features.
    standardizer
        Fitted transformer applied second to the three numeric features.
    encoder
        Fitted categorical encoder exposing ``transform`` and
        ``get_feature_names_out``.

    Returns
    -------
    str
        ``'Placed'`` when the model predicts ``1``, otherwise ``'Not Placed'``.

    Raises
    ------
    ValueError
        If ``input_data`` is a dict that lacks one of the required keys, or a
        numeric feature cannot be converted to ``float``.
    """
    numerical_features = ['10TH MARKS', '12TH MARKS OR DIPLOMA', 'ENGG.AVG%']
    branch_feature = ['Branch']
    expected_fields = numerical_features + branch_feature

    # A missing dict key would otherwise become a silent NaN column and only
    # surface later as an unrelated-looking error from a transformer or model.
    if isinstance(input_data, dict):
        missing_fields = [field for field in expected_fields if field not in input_data]
        if missing_fields:
            raise ValueError(
                f"input_data is missing required field(s): {missing_fields}"
            )

    input_df = pd.DataFrame([input_data], columns=expected_fields)

    # Make the numeric columns float up front so the scaled values written
    # back below never have to be stored into integer/object columns.
    input_df[numerical_features] = input_df[numerical_features].astype(float)
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])  # Normalize
    input_df[numerical_features] = standardizer.transform(input_df[numerical_features])  # Standardize

    branch_encoded = encoder.transform(input_df[branch_feature])
    # Encoders configured for sparse output return a matrix that
    # pd.DataFrame cannot consume directly; densify it when necessary.
    if hasattr(branch_encoded, 'toarray'):
        branch_encoded = branch_encoded.toarray()
    branch_encoded_df = pd.DataFrame(
        branch_encoded,
        columns=encoder.get_feature_names_out(branch_feature),
        index=input_df.index,
    )

    # Column order (numeric features, then branch indicators) mirrors the
    # order used to build the training matrix.
    model_input = pd.concat([input_df[numerical_features], branch_encoded_df], axis=1)

    prediction = model.predict(model_input)
    return 'Placed' if prediction[0] == 1 else 'Not Placed'

In [25]:
# A single hypothetical candidate to score with predict_placement().
# Marks are given on the raw (unscaled) scale; scaling happens inside the
# prediction helper.
new_sample = dict([
    ('10TH MARKS', 75),
    ('12TH MARKS OR DIPLOMA', 70),
    ('ENGG.AVG%', 80),
    ('Branch', 'COMP'),
])

In [26]:
# Score the sample with the trained model and the fitted preprocessing
# objects, then report the outcome.
result = predict_placement(
    model=model,
    input_data=new_sample,
    scaler=scaler,
    standardizer=standardizer,
    encoder=encoder,
)
print(f"The predicted placement status is: {result}")

The predicted placement status is: Placed
