# Machine Learning Engineer Nanodegree
## Supervised Learning
## Project 2: Building a Student Intervention System

In [1]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score


# Read student data
student_data = pd.read_csv("student-data.csv")
print "Student data read successfully!"

Student data read successfully!


In [2]:
# TODO: Calculate number of students
n_students = student_data.shape[0]

# TODO: Calculate number of features
n_features = student_data.shape[1]

# TODO: Calculate passing students
n_passed = student_data[student_data["passed"]=='yes'].shape[0]

# TODO: Calculate failing students
n_failed = student_data[student_data["passed"]=='no'].shape[0]

# TODO: Calculate graduation rate
grad_rate = float(n_passed)/float(n_students)

# Print the results
print "Total number of students: {}".format(n_students)
print "Number of features: {}".format(n_features)
print "Number of students who passed: {}".format(n_passed)
print "Number of students who failed: {}".format(n_failed)
print "Graduation rate of the class: {:.2f}%".format(grad_rate)

Total number of students: 395
Number of features: 31
Number of students who passed: 265
Number of students who failed: 130
Graduation rate of the class: 0.67%


In [3]:
print student_data.head()

print student_data[student_data["passed"]=='yes'].shape[0]

  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

   ...   internet romantic  famrel  freetime  goout Dalc Walc health absences  \
0  ...         no       no       4         3      4    1    1      3        6   
1  ...        yes       no       5         3      3    1    1      3        4   
2  ...        yes       no       4         3      2    2    3      3       10   
3  ...        yes      yes       3         2      2    1    1      5        2   
4  ...         no       no       4         3      2    1    2      5        4   

  passed  
0     no  
1     no  
2    yes  
3    yes  
4

## Preparing the Data
In this section, we will prepare the data for modeling, training and testing.

### Identify feature and target columns


In [4]:
# Extract feature columns
feature_cols = list(student_data.columns[:-1])

# Extract target column 'passed'
target_col = student_data.columns[-1] 

# Show the list of columns
print "Feature columns:\n{}".format(feature_cols)
print "\nTarget column: {}".format(target_col)

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = student_data[feature_cols]
y_all = student_data[target_col]

# Show the feature information by printing the first five rows
print "\nFeature values:"
print X_all.head()

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...       

### Preprocess Feature Columns

In [5]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():
        
        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print "Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns))

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


### Implementation: Training and Testing Data Split

In [6]:
# TODO: Import any additional functionality you may need here
from sklearn import cross_validation

# TODO: Set the number of training points
num_train = 300

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train = None
X_test = None
y_train = None
y_test = None
X_train, X_test, y_train, y_test = cross_validation.train_test_split( X_all, y_all, test_size=float(num_test)/float(X_all.shape[0]), random_state=42)


# Show the results of the split
print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

Training set has 300 samples.
Testing set has 95 samples.




## Training and Evaluating Models


In [18]:
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
import scipy



In [7]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print "Trained model in {:.4f} seconds".format(end - start)

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print "Made predictions in {:.4f} seconds.".format(end - start)
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print "Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print "F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train))
    print "F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test))

In [9]:
### Logistic regression exploration

In [10]:
#y_train = y_train.apply(lambda x: 1 if x=='yes' else 0)

In [15]:
from sklearn import linear_model
from sklearn import metrics

lr = linear_model.LogisticRegression()
lr.fit(X_train, y_train)
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

print metrics.classification_report(y_test, y_test_predictions)
print "Overall Accuracy:", round(metrics.accuracy_score(y_test, y_test_predictions)
                                 ,3)

             precision    recall  f1-score   support

         no       0.71      0.43      0.54        35
        yes       0.73      0.90      0.81        60

avg / total       0.72      0.73      0.71        95

Overall Accuracy: 0.726


In [35]:
#https://blog.cambridgecoding.com/2016/05/16/expanding-your-machine-learning-toolkit-randomized-search-computational-budgets-and-new-algorithms-2/

# Specify HP distributions
penalty = ["l1", "l2"]
np.random.seed(123)
C_range = np.random.normal(5, 0.2, 25).astype(float)
 
# Check that C>0 
C_range[C_range < 0] = 0.0001
 
hyperparameters = {'penalty': penalty, 
                    'C': C_range}
 
print (hyperparameters)


{'penalty': ['l1', 'l2'], 'C': array([ 4.78287388,  5.19946909,  5.0565957 ,  4.69874106,  4.88427995,
        5.33028731,  4.51466415,  4.91421747,  5.25318725,  4.82665192,
        4.86422277,  4.98105821,  5.29827793,  4.8722196 ,  4.91120361,
        4.91312974,  5.44118602,  5.43735722,  5.20081078,  5.07723728,
        5.14747372,  5.29814641,  4.81283323,  5.23516581,  4.74922387])}


In [36]:
# Randomized search using cross-validation
randomCV = RandomizedSearchCV(linear_model.LogisticRegression(), 
                              param_distributions=hyperparameters,
                              cv=10)  
randomCV.fit(X_train, y_train)
 
best_penalty = randomCV.best_params_['penalty']
best_C       = randomCV.best_params_['C']
 
print ("The best performing penalty is: {}".format(best_penalty))
print ("The best performing C value is: {:5.2f}".format(best_C))

The best performing penalty is: l2
The best performing C value is:  5.44


In [37]:
# Train model and output predictions
classifier_logistic = linear_model.LogisticRegression(penalty=best_penalty, C=best_C)
classifier_logistic_fit = classifier_logistic.fit(X_train, y_train)
logistic_predictions = classifier_logistic_fit.predict(X_test)
 
print metrics.classification_report(y_test, logistic_predictions)
print "Overall Accuracy:", round(metrics.accuracy_score(y_test, logistic_predictions),3)

             precision    recall  f1-score   support

         no       0.64      0.40      0.49        35
        yes       0.71      0.87      0.78        60

avg / total       0.68      0.69      0.67        95

Overall Accuracy: 0.695


### Implementation: Model Performance Metrics

In [38]:
from sklearn import linear_model

# TODO: Initialize the three models
clf_E = linear_model.LogisticRegression(penalty=best_penalty, C=best_C)


list_of_clf=[clf_E]



# TODO: Set up the training set sizes
X_train_100 = X_train[0:100]
y_train_100 = y_train[0:100]

X_train_200 = X_train[0:200]
y_train_200 = y_train[0:200]

X_train_300 = X_train[0:300]
y_train_300 = y_train[0:300]

# TODO: Execute the 'train_predict' function for each classifier and each training set size
# train_predict(clf, X_train, y_train, X_test, y_test)
for i,clf in enumerate(list_of_clf):
    print "Classifier %d ..." %(i+1)
#    print clf
    train_predict(clf, X_train_100, y_train_100, X_test, y_test)
    train_predict(clf, X_train_200, y_train_200, X_test, y_test)
    train_predict(clf, X_train_300, y_train_300, X_test, y_test)
    print "\n"

Classifier 1 ...
Training a LogisticRegression using a training set size of 100. . .
Trained model in 0.0041 seconds
Made predictions in 0.0011 seconds.
F1 score for training set: 0.8872.
Made predictions in 0.0016 seconds.
F1 score for test set: 0.7519.
Training a LogisticRegression using a training set size of 200. . .
Trained model in 0.0115 seconds
Made predictions in 0.0015 seconds.
F1 score for training set: 0.8483.
Made predictions in 0.0021 seconds.
F1 score for test set: 0.7826.
Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0270 seconds
Made predictions in 0.0015 seconds.
F1 score for training set: 0.8442.
Made predictions in 0.0031 seconds.
F1 score for test set: 0.7820.




In [56]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]

0.77333333333333332

In [59]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]

0.72631578947368425

Book reference: Scikit Learn Cookbook