# Building a Student Intervention System

## 1. Data Exploration

In [1]:
# Import libraries

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

In [2]:
# Load the raw student records from the CSV file into a DataFrame
DATA_FILE = 'student-data.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# Preview the first five records to get a feel for the columns
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [4]:
# Dataset size: one row per student; every column except the 'passed' label is a feature
n_students, n_columns = df.shape
n_features = n_columns - 1

# Look the outcome counts up by label. An integer key on the string-indexed
# value_counts() result is a (deprecated) positional lookup, so [0]/[1] really
# meant "most/least frequent class" and matched passed/failed only because
# passes happen to outnumber failures in this dataset.
passed_counts = df['passed'].value_counts()
n_passed = int(passed_counts.get('yes', 0))
n_failed = int(passed_counts.get('no', 0))

print("Total number of students: {}".format(n_students))
print("Total number of features: {}".format(n_features))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))

Total number of students: 395
Total number of features: 30
Number of students who passed: 265
Number of students who failed: 130


In [5]:
# How often each category of the 'Mjob' column occurs
mjob_counts = df['Mjob'].value_counts()
print(mjob_counts)

other       141
services    103
at_home      59
teacher      58
health       34
Name: Mjob, dtype: int64


## 2. Preparing the data

In this section, data is preprocessed for training the model.

In [6]:
# The last column is the target label; every column before it is a feature
*feature_col, target_col = df.columns
print("Feature Columns: \n{}\n".format(feature_col))
print("Target Column: \n{}\n".format(target_col))

Feature Columns: 
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target Column: 
passed



In [7]:
# Separate the feature matrix (X) from the label vector (y)
X = df.drop(columns=['passed'])
y = df['passed']

In [8]:
# First five students, feature columns only
X.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,4
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,2
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,4


## Preprocess feature columns

Several features have non-numeric values that need to be converted. Some of them are simply yes/no, e.g. `internet` and `romantic`; these can be converted into binary values (1/0).

Columns like `Mjob` and `Fjob` are categorical, i.e. they take more than two different values. The common way to handle such a column is to create as many columns as there are possible values (e.g. `Mjob_other`, `Mjob_at_home`, `Mjob_services`, etc.), and assign a 1 to one of them and 0 to all the others.

These generated columns are called dummy variables. To perform this transformation, we use `pandas.get_dummies()`.

### Preprocesses the student data and converts non-numeric binary variables into binary (0/1) variables. Converts categorical variables into dummy variables.

In [9]:
def preprocess(X):
    """Convert every text column of ``X`` into numeric columns.

    Each column is handled in two steps:

    1. In text columns, the strings 'yes' and 'no' are mapped to 1 and 0.
       A column holding only those two values therefore becomes an integer
       column.
    2. Any column that is still text after step 1 is expanded into 0/1 dummy
       columns named ``<column>_<value>``,
       e.g. 'school' => 'school_GP' and 'school_MS'.

    Columns that are not text are passed through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``X`` and the converted columns in
        the original column order. ``X`` itself is not modified.
    """
    from pandas.api.types import is_object_dtype, is_string_dtype

    yes_no = {'yes': 1, 'no': 0}

    def _is_text(series):
        # Covers the classic object dtype as well as dedicated string dtypes
        return is_object_dtype(series) or is_string_dtype(series)

    def _binarize(value):
        # Only 'yes'/'no' strings are translated; anything else is kept as is
        return yes_no.get(value, value) if isinstance(value, str) else value

    pieces = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
    for col, col_data in X.items():

        # Map yes/no to 1/0. map() re-infers the dtype of the result, so a pure
        # yes/no column comes out as integers without relying on the deprecated
        # silent downcasting of replace().
        if _is_text(col_data):
            col_data = col_data.map(_binarize)

        # Whatever is still text is categorical: one 0/1 column per value.
        # dtype=int keeps 0/1 output (recent pandas defaults to booleans).
        if _is_text(col_data):
            col_data = pd.get_dummies(col_data, prefix = col, dtype = int)

        pieces.append(col_data)

    # No columns at all: return an empty frame that still carries X's index
    if not pieces:
        return pd.DataFrame(index = X.index)

    # Assemble once instead of joining column by column inside the loop
    return pd.concat(pieces, axis = 1)

# Replace the raw feature frame with its numeric version and list the columns
X = preprocess(X)
processed_columns = list(X.columns)
print ("Processed feature columns ({} total features):\n{}".format(len(processed_columns), processed_columns))
X.head()

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


Unnamed: 0,school_GP,school_MS,sex_F,sex_M,age,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,1,0,1,0,18,0,1,1,0,1,...,1,0,0,4,3,4,1,1,3,6
1,1,0,1,0,17,0,1,1,0,0,...,1,1,0,5,3,3,1,1,3,4
2,1,0,1,0,15,0,1,0,1,0,...,1,1,0,4,3,2,2,3,3,10
3,1,0,1,0,15,0,1,1,0,0,...,1,1,1,3,2,2,1,1,5,2
4,1,0,1,0,16,0,1,1,0,0,...,1,0,0,4,3,2,1,2,5,4


### Split the data into training and testing set

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 95 students for testing; stratify on y so both sets keep the same
# pass/fail proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=95,
    stratify=y,
    random_state=42,
)

# Report how many samples ended up on each side of the split
n_train_samples, n_test_samples = len(X_train), len(X_test)
print ("Training set has {} samples.".format(n_train_samples))
print ("Testing set has {} samples.".format(n_test_samples))

Training set has 300 samples.
Testing set has 95 samples.


# 3. Training and Evaluating Model

We use scikit-learn's Logistic Regression as the classification model: fit the model to the training data and predict the labels. Different training set sizes (100, 200, 300) are used with a constant test set size.

In [11]:
from time import time

def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the given training data and print how long the fit took."""

    # Time nothing but the call to fit()
    t0 = time()
    clf.fit(X_train, y_train)
    elapsed = time() - t0

    # Report the wall-clock training time
    print ("Trained model in {:.4f} seconds".format(elapsed))

def predict_labels(clf, features, target):
    """Predict labels for ``features`` with a fitted classifier and score them.

    Prints the time spent in ``clf.predict`` and returns the F1 score of the
    predictions against ``target``, with 'yes' as the positive label.
    """

    # Time nothing but the call to predict()
    t0 = time()
    y_pred = clf.predict(features)
    elapsed = time() - t0

    print ("Made predictions in {:.4f} seconds.".format(elapsed))

    score = f1_score(target.values, y_pred, pos_label='yes')
    return score

def train_predict(clf, X_train, y_train, X_test, y_test):
    """Train ``clf``, then print its F1 score on the training and test sets."""

    # Announce which classifier is trained and on how many samples
    clf_name = clf.__class__.__name__
    print ("")
    print ("Training a {} using a training set size of {}. . .".format(clf_name, len(X_train)))

    # Fit the model
    train_classifier(clf, X_train, y_train)

    # Score the training set first, then the test set
    train_f1 = predict_labels(clf, X_train, y_train)
    print ("F1 score for training set: {:.4f}.".format(train_f1))

    test_f1 = predict_labels(clf, X_test, y_test)
    print ("F1 score for test set: {:.4f}.".format(test_f1))

In [12]:
# Baseline model: logistic regression trained on growing slices of the training set

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1, random_state=101)

print ("\n{}: \n".format(clf.__class__.__name__))
for train_size in (100, 200, 300):
    # Take the first `train_size` rows by position; the test set stays the same
    train_predict(clf, X_train.iloc[:train_size], y_train.iloc[:train_size], X_test, y_test)


LogisticRegression: 


Training a LogisticRegression using a training set size of 100. . .
Trained model in 0.0023 seconds
Made predictions in 0.0004 seconds.
F1 score for training set: 0.8671.
Made predictions in 0.0003 seconds.
F1 score for test set: 0.7068.

Training a LogisticRegression using a training set size of 200. . .
Trained model in 0.0023 seconds
Made predictions in 0.0004 seconds.
F1 score for training set: 0.8211.
Made predictions in 0.0002 seconds.
F1 score for test set: 0.7391.

Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0037 seconds
Made predictions in 0.0005 seconds.
F1 score for training set: 0.8512.
Made predictions in 0.0004 seconds.
F1 score for test set: 0.7500.


## Tuning the model 

In [13]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.cross_validation import StratifiedShuffleSplit



In [14]:
# Create the parameters list you wish to tune
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['sag']
max_iter = [1000]
param_grid = dict(C=C, solver=solver, max_iter=max_iter)

# Initialize the classifier
clf = LogisticRegression(random_state=42)

# Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score, pos_label='yes')

# Stratified Shuffle Split
ssscv = StratifiedShuffleSplit(y_train, n_iter=10, test_size=0.20)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf, param_grid, cv=ssscv, scoring=f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator and best parameters
clf = grid_obj.best_estimator_
print(grid_obj.best_params_)
# Final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))





{'solver': 'sag', 'max_iter': 1000, 'C': 0.01}
Made predictions in 0.0016 seconds.
Tuned model has a training F1 score of 0.8182.
Made predictions in 0.0003 seconds.
Tuned model has a testing F1 score of 0.8000.


# 4. Results

## Best F1 scores before tuning the model (training size = 300)

### Training F1 score - 0.8512
### Testing F1 score - 0.7500

## After tuning the model best F1 scores
### Training F1 score  - 0.8182
### Testing F1 score - 0.8000