In [1]:
import pandas as pd
# Split the data into training and testing sets (70% training, 30% testing)
from sklearn.model_selection import train_test_split
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [59]:
# Reload the dataset
file_path_new = r"C:\Pratham\programmingProjects\pythonProgrammes\Python-code-practice\GitHub_SRM\ClusteringProject\StudentsPerformance.csv"
students_data_new = pd.read_csv(file_path_new)

# Display the first few rows of the new dataset
students_data_new.head()

Unnamed: 0,gender,race/ethnicity,math score,reading score,writing score
0,female,group B,72,72,74
1,female,group C,69,90,88
2,female,group B,90,95,93
3,male,group A,47,57,44
4,male,group C,76,78,75


In [60]:
# Step 1: Create a pass/fail column based on the average score threshold (e.g., 40)
students_data_new['average_score'] = students_data_new[['math score', 'reading score', 'writing score']].mean(axis=1)
students_data_new['pass/fail'] = students_data_new['average_score'].apply(lambda x: 1 if x >= 40 else 0)
display(students_data_new.head())

Unnamed: 0,gender,race/ethnicity,math score,reading score,writing score,average_score,pass/fail
0,female,group B,72,72,74,72.666667,1
1,female,group C,69,90,88,82.333333,1
2,female,group B,90,95,93,92.666667,1
3,male,group A,47,57,44,49.333333,1
4,male,group C,76,78,75,76.333333,1


In [61]:
# Drop the 'average_score' column as it is not needed for the prediction
students_data_new = students_data_new.drop('average_score', axis=1)
display(students_data_new.head())

Unnamed: 0,gender,race/ethnicity,math score,reading score,writing score,pass/fail
0,female,group B,72,72,74,1
1,female,group C,69,90,88,1
2,female,group B,90,95,93,1
3,male,group A,47,57,44,1
4,male,group C,76,78,75,1


In [63]:
# Step 2: Preprocess the data
# Convert categorical data (gender, race/ethnicity) using one-hot encoding
students_data_new = pd.get_dummies(students_data_new, drop_first=True)
display(students_data_new.head())

Unnamed: 0,math score,reading score,writing score,pass/fail,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E
0,72,72,74,1,False,True,False,False,False
1,69,90,88,1,False,False,True,False,False
2,90,95,93,1,False,True,False,False,False
3,47,57,44,1,True,False,False,False,False
4,76,78,75,1,True,False,True,False,False


In [64]:
# Step 3: Split the data into features (X) and labels (y)
X = students_data_new.drop('pass/fail', axis=1)  # Features (gender, race/ethnicity, scores)
y = students_data_new['pass/fail']               # Labels (1 for pass, 0 for fail)

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [65]:
# Step 4: Train a Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

# Predict on the test set
logreg_pred = logreg_model.predict(X_test)

In [68]:
# Step 5: Evaluate the model accuracy
logreg_accuracy = accuracy_score(y_test, logreg_pred)
logreg_accuracy  # Return the accuracy of the Logistic Regression model

0.9966666666666667

In [67]:

# Step 6: Predict probabilities for each student (probability of passing or failing)
probabilities = logreg_model.predict_proba(X_test)  # This returns probabilities for both classes (0 and 1)

# Create a DataFrame to display the student IDs, actual outcomes, and predicted probabilities
probability_df = pd.DataFrame({
    'Student_ID': X_test.index,                  # Unique identifier for each student in the test set
    'Pass_Probability': probabilities[:, 1],     # Probability of passing (class 1)
    'Fail_Probability': probabilities[:, 0]      # Probability of failing (class 0)
})

# Display the top rows of the DataFrame
print(probability_df)

     Student_ID  Pass_Probability  Fail_Probability
0           521          1.000000          0.000000
1           737          1.000000          0.000000
2           740          1.000000          0.000000
3           660          1.000000          0.000000
4           411          1.000000          0.000000
..          ...               ...               ...
295         468          1.000000          0.000000
296         935          1.000000          0.000000
297         428          1.000000          0.000000
298           7          0.863681          0.136319
299         155          1.000000          0.000000

[300 rows x 3 columns]
