In [6]:
import pandas as pd

# Read the three dataset splits (train, test, validation -- in that order)
# from their CSV files in the working directory.
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in ('CQ4_train.csv', 'CQ4_test.csv', 'CQ4_val.csv')
)

# Preview: first rows of the training split.
print(train_data.head())

# Per-column count of missing values in the training split.
print(train_data.isna().sum())

# Summary statistics (count, mean, std, quartiles, ...) of the training split.
print(train_data.describe())


   id_student  test1  test2  test3  test4  final  pass
0      650565    5.1    9.6  16.10  12.95  43.75     0
1      679716    6.0    9.6  21.00  16.80  53.40     1
2      126769    4.0    8.0  21.00  21.00  54.00     1
3      653762    5.5   12.4  20.30  22.75  60.95     0
4      687037    7.6   13.4  24.85  25.20  71.05     1
id_student    0
test1         0
test2         0
test3         0
test4         0
final         0
pass          0
dtype: int64
         id_student       test1       test2       test3       test4  \
count  6.000000e+02  600.000000  600.000000  600.000000  600.000000   
mean   6.848843e+05    6.406833   12.484667   21.551833   21.786333   
std    4.441751e+05    1.354238    2.867077    4.984342    5.293458   
min    3.293000e+04    3.800000    0.000000    2.450000    5.250000   
25%    5.738300e+05    5.500000   10.950000   18.550000   18.200000   
50%    6.467785e+05    6.500000   12.800000   21.700000   22.400000   
75%    6.837522e+05    7.400000   14.600000   24

In [7]:
# Column names shared by every split. Defining them once guarantees that the
# train / validation / test matrices are always built from the same columns in
# the same order (previously the list literal was repeated for each split).
# The remaining columns of the frames (id_student, final) are not used as features.
FEATURE_COLUMNS = ['test1', 'test2', 'test3', 'test4']
TARGET_COLUMN = 'pass'

# Feature matrix (X_*) and target vector (y_*) for each split.
X_train, y_train = train_data[FEATURE_COLUMNS], train_data[TARGET_COLUMN]
X_val, y_val = val_data[FEATURE_COLUMNS], val_data[TARGET_COLUMN]
X_test, y_test = test_data[FEATURE_COLUMNS], test_data[TARGET_COLUMN]


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Baseline classifier: logistic regression trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict labels for the validation split.
y_val_pred = model.predict(X_val)

# Validation metrics: overall accuracy, confusion matrix, per-class report.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

# Each heading is followed by its payload on the next line.
print(f'Validation Accuracy: {val_accuracy}')
print('Confusion Matrix:', val_conf_matrix, sep='\n')
print('Classification Report:', val_class_report, sep='\n')


Validation Accuracy: 0.655
Confusion Matrix:
[[68 34]
 [35 63]]
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.67      0.66       102
           1       0.65      0.64      0.65        98

    accuracy                           0.66       200
   macro avg       0.65      0.65      0.65       200
weighted avg       0.65      0.66      0.65       200



In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Second candidate: a random forest with default hyper-parameters and a fixed
# seed. Note that this rebinds `model`, replacing the previously fitted estimator.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the validation split.
y_val_pred = model.predict(X_val)

# Validation metrics: overall accuracy, confusion matrix, per-class report.
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

# Each heading is followed by its payload on the next line.
print(f'Validation Accuracy: {val_accuracy}')
print('Confusion Matrix:', val_conf_matrix, sep='\n')
print('Classification Report:', val_class_report, sep='\n')


Validation Accuracy: 0.89
Confusion Matrix:
[[101   1]
 [ 21  77]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.99      0.90       102
           1       0.99      0.79      0.88        98

    accuracy                           0.89       200
   macro avg       0.91      0.89      0.89       200
weighted avg       0.91      0.89      0.89       200



In [10]:
# Hyper-parameter variation: a forest of 150 trees with depth capped at 20,
# trained on the same training split with the same seed.
model = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    random_state=42,
).fit(X_train, y_train)

# Score the re-trained forest on the validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')


Validation Accuracy: 0.89


In [11]:
# Evaluate the currently bound `model` on the test split.
y_test_pred = model.predict(X_test)

# Test metrics: overall accuracy, confusion matrix, per-class report.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_class_report = classification_report(y_test, y_test_pred)

# Each heading is followed by its payload on the next line.
print(f'Test Accuracy: {test_accuracy}')
print('Confusion Matrix:', test_conf_matrix, sep='\n')
print('Classification Report:', test_class_report, sep='\n')


Test Accuracy: 0.9
Confusion Matrix:
[[102   2]
 [ 18  78]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       104
           1       0.97      0.81      0.89        96

    accuracy                           0.90       200
   macro avg       0.91      0.90      0.90       200
weighted avg       0.91      0.90      0.90       200



In [12]:
import joblib

# Persist the fitted model to 'final_model.pkl' in the working directory.
# The call is the cell's last expression, so its return value (the list of
# written file names) is shown as the cell output.
joblib.dump(value=model, filename='final_model.pkl')

# The notebook itself is saved manually from the Jupyter UI:
# File -> Download as -> Notebook (.ipynb)


['final_model.pkl']

In [15]:
# Inspect the test-split feature frame (columns test1..test4). As the cell's
# only expression, it is rendered as the cell output.
X_test



Unnamed: 0,test1,test2,test3,test4
0,7.0,14.0,17.50,26.25
1,6.0,13.2,25.20,24.15
2,8.1,14.4,30.10,18.20
3,5.0,10.0,11.55,14.00
4,7.0,13.8,22.75,17.50
...,...,...,...,...
195,8.1,14.4,25.55,26.25
196,6.5,12.6,26.25,28.00
197,4.5,15.4,15.75,25.90
198,7.9,18.2,30.10,25.55
