In [25]:
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv("my_data.csv")

In [27]:
df

Unnamed: 0,Roll No,Name,Age,DOB,CGPA,Courses,Graduation Year,Placements,M.Tech/MS,Startup
0,19BCI0876,Akhil,19,23-12-2003,8.45,8.0,2023,Yes,Yes,No
1,20BCE0076,Ram,20,3-10-2002,6.75,7.0,2024,Yes,No,Yes
2,20BDS0957,Rishab,21,2-12-2001,7.16,6.0,2024,Yes,No,No
3,20BDS0294,Sujay,20,02-12-2002,8.02,9.0,2024,No,No,Yes
4,20BCI0805,Atul,19,12-07-2003,9.14,12.0,2024,Yes,Yes,No
5,20BKT0012,Nivas,20,3-1-2002,9.54,6.0,2024,No,No,Yes
6,20BCT0121,Harshil,19,2-06-2003,8.9,5.0,2024,Yes,No,No
7,20BCI0234,Robert,20,23-1-2002,5.56,9.0,2024,No,Yes,Yes
8,20BCE0294,Richard,21,13-09-2001,6.98,8.0,2024,Yes,Yes,Yes
9,20BCE2265,Nicolas,21,17-08-2001,7.23,13.0,2024,No,No,No


In [28]:
df = df.dropna(axis=0)

In [29]:
data = df.drop(["Roll No","Name","DOB","M.Tech/MS","Startup"],axis=1)

In [30]:
data["Placements"] = data["Placements"].apply(lambda row: 1 if row=="Yes" else 0)

In [31]:
data

Unnamed: 0,Age,CGPA,Courses,Graduation Year,Placements
0,19,8.45,8.0,2023,1
1,20,6.75,7.0,2024,1
2,21,7.16,6.0,2024,1
3,20,8.02,9.0,2024,0
4,19,9.14,12.0,2024,1
5,20,9.54,6.0,2024,0
6,19,8.9,5.0,2024,1
7,20,5.56,9.0,2024,0
8,21,6.98,8.0,2024,1
9,21,7.23,13.0,2024,0


In [32]:
from sklearn.preprocessing import StandardScaler
features = ['Age', 'CGPA', 'Courses']
x = data.loc[:, features].values
y = data.loc[:,['Placements']].values
x = StandardScaler().fit_transform(x)

In [33]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

In [34]:
finalDf = pd.concat([principalDf, data[['Placements']]], axis = 1)

In [35]:
finalDf

Unnamed: 0,principal component 1,principal component 2,Placements
0,-1.135094,-0.347803,1.0
1,0.75792,0.346056,1.0
2,1.686181,-0.257391,1.0
3,-0.20947,0.217022,0.0
4,-2.224,0.531577,1.0
5,-0.324395,-1.695244,0.0
6,-0.760783,-1.602193,1.0
7,0.915314,1.729521,0.0
8,1.381773,0.505089,1.0
9,0.300693,1.980902,0.0


In [36]:
pca.explained_variance_ratio_

array([0.45007149, 0.36334954])

In [38]:
finalDf = finalDf.dropna(axis=0)

## Applying Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score

## APPLYING WITHOUT PCA

In [53]:
X = data[["Age","CGPA","Courses","Graduation Year"]]
y = data[["Placements"]]
X_train, X_test,y_train,y_test = train_test_split(X,y,random_state=1)

In [60]:
model1 = LogisticRegression()
model1.fit(X_train,y_train)
y_pred = model1.predict(X_test)
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.6
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.50      0.67         4

    accuracy                           0.60         5
   macro avg       0.67      0.75      0.58         5
weighted avg       0.87      0.60      0.63         5



  y = column_or_1d(y, warn=True)


## APPLYING WITH PCA

In [101]:
X1 = finalDf[["principal component 1","principal component 2"]]
y1 = finalDf[["Placements"]]
X_train, X_test,y_train,y_test = train_test_split(X1,y1,random_state=23)

In [102]:
model1 = LogisticRegression()
model1.fit(X_train,y_train)
y_pred = model1.predict(X_test)
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.25
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         3
         1.0       0.25      1.00      0.40         1

    accuracy                           0.25         4
   macro avg       0.12      0.50      0.20         4
weighted avg       0.06      0.25      0.10         4



  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## IMBALANCED DATA

In [86]:
df["Placements"].value_counts()

Yes    10
No      8
Name: Placements, dtype: int64

In [87]:
df.corr()

  df.corr()


Unnamed: 0,Age,CGPA,Courses,Graduation Year
Age,1.0,-0.288503,-0.266259,0.048564
CGPA,-0.288503,1.0,-0.090373,-0.344974
Courses,-0.266259,-0.090373,1.0,0.216875
Graduation Year,0.048564,-0.344974,0.216875,1.0


## UNDERSAMPLING WITHOUT PCA

In [112]:
from imblearn.under_sampling import RandomUnderSampler

In [113]:
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res,y_res,random_state=1)

In [114]:
y_res.value_counts()

Placements
0             8
1             8
dtype: int64

In [115]:
model1.fit(X_res_train,y_res_train)
y_pred = model1.predict(X_res_test)
print(accuracy_score(y_pred,y_res_test))
print(classification_report(y_pred,y_res_test))

0.75
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



  y = column_or_1d(y, warn=True)


## UNDERSAMPLING WITH PCA

In [116]:
rus1 = RandomUnderSampler(random_state=42)
X_res, y_res = rus1.fit_resample(X1, y1)
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res,y_res,random_state=23)

In [117]:
model1.fit(X_res_train,y_res_train)
y_pred = model1.predict(X_res_test)
print(accuracy_score(y_pred,y_res_test))
print(classification_report(y_pred,y_res_test))

0.5
              precision    recall  f1-score   support

         0.0       0.33      1.00      0.50         1
         1.0       1.00      0.33      0.50         3

    accuracy                           0.50         4
   macro avg       0.67      0.67      0.50         4
weighted avg       0.83      0.50      0.50         4



  y = column_or_1d(y, warn=True)


## OVERSAMPLING WITHOUT PCA

In [118]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res,y_res,random_state=1)

In [119]:
y_res.value_counts()

Placements
0             10
1             10
dtype: int64

In [120]:
model1.fit(X_res_train,y_res_train)
y_pred = model1.predict(X_res_test)
print(accuracy_score(y_pred,y_res_test))
print(classification_report(y_pred,y_res_test))

0.4
              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.25      1.00      0.40         1

    accuracy                           0.40         5
   macro avg       0.62      0.62      0.40         5
weighted avg       0.85      0.40      0.40         5



  y = column_or_1d(y, warn=True)


## OVERSAMPLING WITH PCA

In [121]:
X_res, y_res = ros.fit_resample(X1, y1)
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(X_res,y_res,random_state=1)

In [122]:
model1.fit(X_res_train,y_res_train)
y_pred = model1.predict(X_res_test)
print(accuracy_score(y_pred,y_res_test))
print(classification_report(y_pred,y_res_test))

0.4
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.40      0.57         5

    accuracy                           0.40         5
   macro avg       0.50      0.20      0.29         5
weighted avg       1.00      0.40      0.57         5



  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
