In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', context='notebook', palette='deep')
import matplotlib.style as style
style.use('fivethirtyeight')

# EDA

In [None]:
# Show (up to 999) columns when a frame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', 999)

# Load the training data; the first CSV column becomes the row index.
train = pd.read_csv("BUDStrain.csv", index_col=0)

# Structural overview: (rows, columns), then per-column dtype and non-null count.
print(train.shape)
train.info()

###### No nulls!

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Partition the columns of `train` into numerical and categorical features.
all_features = train.columns.tolist()

# describe() summarises only the numeric columns of a mixed frame by default,
# so its column labels are the numerical features.
# NOTE(review): this list also contains the target `grade` if it is numeric.
num_features = train.describe().columns.tolist()

# Everything that is not numerical is treated as categorical.
# (Fixed: this line previously referenced the undefined name
# `numerical_features`, which raised NameError on a fresh kernel.)
cat_features = [feat for feat in all_features if feat not in num_features]

# Sanity check: the two groups together must account for every column.
assert len(all_features) == len(num_features) + len(cat_features)
train.describe()

In [None]:
# Number of distinct values per categorical feature, smallest first
# (sort_values sorts ascending by default).
(
    train[cat_features]
    .nunique()
    .sort_values()
)

In [None]:
# Pairwise correlations between the numerical columns, annotated cell by cell.
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = train[num_features].corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title("Correlation Heatmap between continuous variables")

In [None]:
# Box plots of grade against categorical features, one panel per feature,
# all sharing the same y-axis.
# Each entry is (feature, category order); None keeps seaborn's default order.
# The list is in row-major panel order.
boxplot_panels = [
    ('school', None),
    ('higher', None),
    ('internet', None),
    ('nursery', ['yes', 'no']),
    ('activities', ['yes', 'no']),
    ('paid', ['yes', 'no']),
    ('famsup', None),
    ('schoolsup', None),
    ('romantic', ['yes', 'no']),
    ('Pstatus', None),
    ('famsize', None),
    ('address', None),
    ('sex', None),
]

# No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an additional empty figure in the output.
fig, axes = plt.subplots(4, 4, figsize=(18, 15), sharey=True)
fig.suptitle("Grades vs Categorical Features", fontsize=20)

for ax, (feature, order) in zip(axes.flat, boxplot_panels):
    sns.boxplot(ax=ax, data=train, x=feature, y='grade', order=order)

# The 4x4 grid has more panels than features; hide the unused ones.
for ax in axes.flat[len(boxplot_panels):]:
    ax.set_visible(False)

In [None]:
# Box plots of grade against four more categorical features on a 2x2 grid
# with a shared y-axis; features are listed in row-major panel order.
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharey=True)
fig.suptitle("Grades vs Categorical Features", fontsize=20)

for ax, feature in zip(axes.ravel(), ['guardian', 'reason', 'Fjob', 'Mjob']):
    sns.boxplot(ax=ax, data=train, x=feature, y='grade')

In [None]:
# Box plots of grade against the first 12 numerical features (3x4 grid,
# shared y-axis), filled in row-major order.
# No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an additional empty figure in the output.
fig, axes = plt.subplots(3, 4, figsize=(18, 12), sharey=True)
fig.suptitle("Grades vs Numerical Features", fontsize=20)

# Pair each panel with its feature directly instead of reshaping the feature
# list into a 3x4 array on every iteration.
for ax, feature in zip(axes.flat, num_features[:12]):
    sns.boxplot(ax=ax, data=train, x=feature, y='grade')

In [None]:
# For unbalanced datasets
# Count plot per categorical feature (5x4 grid, shared y-axis), each bar
# labelled with its height.
# No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an additional empty figure in the output.
fig, axes = plt.subplots(5, 4, figsize=(18, 18), sharey=True)
fig.suptitle("Categorical Features Counts", fontsize=20)

# Fill the panels in row-major order with every categorical feature. With 17
# features this gives the same layout as plotting the first 16 on a 4x4 block
# and the last one at axes[4, 0], but it no longer skips or fails on other counts
# (features beyond the 20 available panels are not drawn).
for ax, feature in zip(axes.flat, cat_features):
    sns.countplot(ax=ax, data=train, x=feature)
    for p in ax.patches:
        # Place the label slightly right of the bar's left edge and just above its top.
        ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))

# Hide panels that did not receive a feature.
for ax in axes.flat[len(cat_features):]:
    ax.set_visible(False)


In [None]:
# Basically also categorical
# Count plot for each of the first 12 numerical features (3x4 grid, shared
# y-axis), each bar labelled with its height.
# No separate plt.figure() call: plt.subplots already creates the figure, and
# the extra call only produced an additional empty figure in the output.
fig, axes = plt.subplots(3, 4, figsize=(18, 12), sharey=True)
fig.suptitle("Numerical Features Counts", fontsize=20)

for ax, feature in zip(axes.flat, num_features[:12]):
    sns.countplot(ax=ax, data=train, x=feature)
    for p in ax.patches:
        # Place the label slightly right of the bar's left edge and just above its top.
        ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x()+0.15, p.get_height()+1))

# Preprocessing Data

In [None]:
# One Hot Encoding
# Replace each categorical column with one indicator column per category.
# NOTE: this rebinds `train`, so the cell cannot be re-run without reloading
# the CSV first (the original categorical columns no longer exist afterwards).
train = pd.get_dummies(train, columns=cat_features)
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = train['grade']
X = train.drop(columns='grade')

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Predictor column names, in the column order of X.
all_feats = list(X.columns)

# ML Algos

All categorical variables vs grades

## LASSO

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

import warnings
# Silences every warning from here on (no category filter is given).
warnings.filterwarnings('ignore')

# Sweep the Lasso regularisation strength over 11 log-spaced values from 1e-5
# to 1e5 and record the estimator's score (plotted as R2) on both splits.
lmbdas = np.logspace(-5,5,11)
n_lmbdas = len(lmbdas)

train_accuracy = np.zeros(n_lmbdas)
test_accuracy = np.zeros(n_lmbdas)

for i in range(n_lmbdas):
    lmbda = lmbdas[i]
    lasso_reg = linear_model.Lasso(alpha=lmbda, random_state=1).fit(train_X, train_y)

    # check accuracy
    train_accuracy[i] = lasso_reg.score(train_X, train_y)
    test_accuracy[i] = lasso_reg.score(val_X, val_y)

# Score versus regularisation strength on a logarithmic x-axis.
for scores, fmt, split_label in (
    (train_accuracy, '*-b', 'train'),
    (test_accuracy, '*-r', 'test'),
):
    plt.semilogx(lmbdas, scores, fmt, label=split_label)
plt.title("LASSO: Regularization vs R2")
plt.ylabel("R2 Score")
plt.xlabel("Lambdas")
plt.legend()

# Report the strength with the highest validation score.
max_index = np.argmax(test_accuracy)
max_acc = test_accuracy[max_index]
print("Optimal index:", max_index, "\nBest test accuracy:", max_acc, "\nOptimal Lambda:", lmbdas[max_index])

In [None]:
# Refit Lasso with alpha=0.1 on the training split, then report the intercept,
# every (feature, coefficient) pair and the validation RMSE.
lasso_opt = linear_model.Lasso(alpha = 0.1, random_state = 1)
lasso_opt.fit(train_X, train_y)
val_pred = lasso_opt.predict(val_X)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
rms = np.sqrt(mean_squared_error(val_y, val_pred))

print('intercept: ', lasso_opt.intercept_)
# coef_ follows the column order of train_X, which is the order of all_feats.
for i in zip(all_feats, lasso_opt.coef_):
    print(i)
print(f"RMSE for Lasso w/ lambda=0.1: {rms}")

The variables carrying the most weight in the Lasso model are:

```
('age', 0.007666)
('Medu', 0.09155)
('Fedu', 0.31946)
('studytime', 0.3659)
('failures', -1.2136)
('freetime', -0.14597)
('Dalc', -0.2047)
('Walc', -0.1088)
('health', -0.1027)
('absences', -0.0156)
('school_GP', 0.999)
('address_R', -0.139)
('Fjob_services', -0.13262)
('schoolsup_no', 0.0381)
('higher_no', -0.7849)
('internet_no', -0.1459)
```

In [None]:
from sklearn.tree import DecisionTreeRegressor
def sort_tuple(tup):
    """Sort a list of tuples in place by the second element of each tuple.

    The sort is ascending and stable (ties keep their original relative
    order). The same list object that was passed in is returned.
    """
    def second_item(entry):
        return entry[1]

    tup.sort(key=second_item)
    return tup

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Candidate forest sizes: four log-spaced values between 1e1 and 1e4, cast to int.
num_est = np.logspace(1,4,4, dtype=int)

# Fit one forest per size and report its validation RMSE.
for num in num_est:

    rf = RandomForestRegressor(n_estimators = num, random_state = 1)
    rf.fit(train_X, train_y)
    val_pred = rf.predict(val_X)

    # RMSE computed as sqrt(MSE). The `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    print("\nRMSE for num_est =", num, rmse)

In [None]:
print("Best Algo is RF with n_est = 10")
rf = RandomForestRegressor(n_estimators=10, random_state=1)
rf.fit(train_X, train_y)

# Feature Importance
# Pair every feature name with its importance rounded to two decimals.
importances = list(rf.feature_importances_)
feature_importances = []
for feature, importance in zip(all_feats, importances):
    feature_importances.append((feature, round(importance, 2)))

# sort_tuple orders ascending by importance, so the last five entries are the
# five most important features.
f = sort_tuple(feature_importances)
for i in f[-5:]:
    print(i)

In [None]:
# source: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Narrowing down parameters with RandomizedSearchCV before a finer GridSearchCV
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(200, stop = 2000, num = 10)]
# Features considered at each split. 1.0 means "all features", which is what
# 'auto' meant for RandomForestRegressor; 'auto' itself was deprecated in
# scikit-learn 1.1 and removed in 1.3, so it is spelled out here.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Search space sampled by RandomizedSearchCV.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

In [None]:
# Base estimator for the search. The fixed seed makes every candidate forest
# reproducible; previously only the parameter sampling was seeded, so the
# scores (and possibly the selected parameters) changed from run to run.
rf = RandomForestRegressor(random_state = 1)
# 3 fold CV for run time
# Samples 100 parameter combinations from random_grid, using all CPU cores.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=1, n_jobs = -1)
rf_random.fit(train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score in the randomized search.
rf_random.best_params_

In [None]:
# Hard-coded copy of a randomized-search result, so this cell does not depend
# on re-running the search.
best_params = {'n_estimators': 200,
               'min_samples_split': 5,
               'min_samples_leaf': 1,
               'max_features': 'sqrt',
               'max_depth': 90,
               'bootstrap': False}

rf = RandomForestRegressor(**best_params, random_state = 1)
rf.fit(train_X, train_y)
val_pred = rf.predict(val_X)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(val_y, val_pred))
# Report the forest size actually used. The previous version printed `num`, a
# stale loop variable left over from the n_estimators sweep.
print("\nRMSE for num_est =", best_params['n_estimators'], rmse)

# Feature Importance
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(all_feats, importances)]

# sort_tuple orders ascending by importance, so the last five are the top five.
f = sort_tuple(feature_importances)
for i in f[-5:]:
    # top 5 most important features
    print(i)

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [80, 90, 100],
    'max_features': [7, 8],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 4, 5, 6, 7],
    'n_estimators': [100, 150, 200, 250, 300]
}
# Create a base model. The fixed seed makes every candidate forest
# reproducible, so repeated runs of the search select the same parameters.
rf = RandomForestRegressor(random_state = 1)
# Instantiate the grid search model: exhaustive search over param_grid with
# 10-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 2)

grid_search.fit(train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score in the grid search.
grid_search.best_params_

In [None]:
# Hard-coded copy of a grid-search result, so this cell does not depend on
# re-running the search.
best_params = {'bootstrap': False,
                'max_depth': 80,
                'max_features': 8,
                'min_samples_leaf': 1,
                'min_samples_split': 6,
                'n_estimators': 300}

rf = RandomForestRegressor(**best_params, random_state = 1)
rf.fit(train_X, train_y)
val_pred = rf.predict(val_X)

# RMSE computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(val_y, val_pred))
# Report the forest size actually used. The previous version printed `num`, a
# stale loop variable left over from the n_estimators sweep.
print("\nRMSE for num_est =", best_params['n_estimators'], rmse)

# Feature Importance
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(all_feats, importances)]

# sort_tuple orders ascending by importance, so the last five are the top five.
f = sort_tuple(feature_importances)
for i in f[-5:]:
    # top 5 most important features
    print(i)

## DNN

In [None]:
# Width of the network input: one unit per predictor column.
num_feats = len(all_feats)
# Seed TensorFlow's global random generator.
tf.random.set_seed(1)


def create_DNN(unit):
    """Build and compile a small fully connected regression network.

    The network has two ReLU hidden layers of `unit` neurons each, a dropout
    layer with rate 0.5, and a single linear output. It is compiled with
    mean-squared-error loss, the Adam optimizer, and RMSE as a tracked metric.

    Args:
        unit: number of neurons in each hidden layer.

    Returns:
        The compiled `keras.Sequential` model.
    """
    layers = keras.layers
    model = keras.Sequential([
        layers.Dense(unit, input_dim=num_feats, activation='relu'),
        layers.Dense(unit, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='linear'),
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


In [None]:
# Wrap the Keras model builder so it can be used as a scikit-learn estimator
# (e.g. inside GridSearchCV).
# NOTE(review): this imports from the standalone `keras` package while the
# rest of the notebook uses `tensorflow.keras`; the `wrappers.scikit_learn`
# module has also been dropped from recent Keras releases — confirm the
# pinned environment still provides it.
from keras.wrappers.scikit_learn import KerasRegressor
model=KerasRegressor(build_fn=create_DNN)

In [None]:
# Exhaustive search over batch size, epoch count and hidden-layer width
# (5 x 4 x 5 = 100 combinations), each evaluated with 10-fold cross-validation.
params = dict(
    batch_size=[20, 40, 60, 80, 100],
    epochs=[100, 200, 300, 400],
    unit=[5, 10, 15, 20, 25],
)
gs = GridSearchCV(model, params, cv=10)
gs_result = gs.fit(train_X, train_y)

In [None]:
# Best parameter combination and its mean cross-validated score.
# Despite its name, `accuracy` is the search's best score as computed by the
# wrapped estimator's own `score` method (no `scoring` was passed to the search).
best_params, accuracy = gs_result.best_params_, gs_result.best_score_

best_params

## DNN 2

In [None]:
def create_DNN2(unit):
    """Build and compile a fully connected regression network with heavier dropout.

    The network has two ReLU hidden layers of `unit` neurons each, a dropout
    layer with rate 0.6, and a single linear output. It is compiled with
    mean-squared-error loss, the Adam optimizer, and RMSE as a tracked metric.

    Args:
        unit: number of neurons in each hidden layer.

    Returns:
        The compiled `keras.Sequential` model.
    """
    layers = keras.layers
    model = keras.Sequential([
        layers.Dense(unit, input_dim=num_feats, activation='relu'),
        layers.Dense(unit, activation='relu'),
        layers.Dropout(0.6),
        layers.Dense(1, activation='linear'),
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model

# Wrap the second architecture as a scikit-learn estimator and search a
# smaller grid with wider layers (3 x 2 x 3 = 18 combinations, 10-fold CV).
model = KerasRegressor(build_fn=create_DNN2)

params = dict(
    batch_size=[10, 20, 30],
    epochs=[100, 200],
    unit=[30, 50, 70],
)
gs = GridSearchCV(model, params, cv=10)
gs_result = gs.fit(train_X, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score for the second DNN search.
gs_result.best_params_

# Kaggle Submission

In [None]:
# Load the test set and one-hot encode the same categorical columns as the
# training data.
test = pd.read_csv("BUDStest.csv", index_col = 0)
test = pd.get_dummies(data=test, columns=cat_features)
# Align the columns with the training predictors. A category that never occurs
# in the test file would otherwise leave a column missing, and any column the
# models were not trained on would be extra; both make predict() fail or
# silently misalign. Missing indicator columns are filled with 0.
test = test.reindex(columns=all_feats, fill_value=0)
test.head()

#### Random Forest Submission

In [None]:
# Random forest with the hard-coded parameter set, fitted on the training
# split and used to predict grades for the test set.
best_params = dict(
    bootstrap=False,
    max_depth=80,
    max_features=8,
    min_samples_leaf=1,
    min_samples_split=6,
    n_estimators=300,
)

rf = RandomForestRegressor(random_state=1, **best_params)
rf.fit(train_X, train_y)
grade = rf.predict(test)

In [None]:
# Build the submission table: test-set index as ID plus the predicted grade.
submission_columns = {'ID': test.index, 'grade': grade}
my_submission = pd.DataFrame(submission_columns)
# Any filename works; 'submission.csv' is used here. The DataFrame index is not written.
my_submission.to_csv('submission.csv', index=False)

#### DNN Submission

In [None]:
# Hyper-parameters used for the DNN submission.
best_params = {'batch_size': 80, 'epochs': 400, 'unit': 25}


def create_best_DNN(unit):
    """Build and compile the DNN used for the submission.

    The network has two ReLU hidden layers of `unit` neurons each, a dropout
    layer with rate 0.5, and a single linear output. It is compiled with
    mean-squared-error loss, the Adam optimizer, and RMSE as a tracked metric.

    Args:
        unit: number of neurons in each hidden layer.

    Returns:
        The compiled `keras.Sequential` model.
    """
    layers = keras.layers
    model = keras.Sequential([
        layers.Dense(unit, input_dim=num_feats, activation='relu'),
        layers.Dense(unit, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='linear'),
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


# Train with the hyper-parameters above, monitoring the validation split, then
# evaluate on that same split.
DNN = create_best_DNN(best_params['unit'])
DNN.fit(
    train_X,
    train_y,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    verbose=1,
    validation_data=(val_X, val_y),
)
score = DNN.evaluate(val_X, val_y, verbose=1)

In [None]:
# Predict test-set grades, then flatten the predictions to a 1-D array with
# one entry per row.
grade = DNN.predict(test)
grade = grade.reshape((len(grade),))

In [None]:
# Build the submission table: test-set index as ID plus the predicted grade.
submission_columns = {'ID': test.index, 'grade': grade}
my_submission = pd.DataFrame(submission_columns)
# Any filename works; 'submission.csv' is used here. The DataFrame index is not written.
my_submission.to_csv('submission.csv', index=False)

### DNN Submission 2

In [None]:
# Recorded search result: {'batch_size': 20, 'epochs': 100, 'unit': 50}
# NOTE(review): the model below is built with 60 units and three hidden
# layers, which differs from the recorded result — confirm this is intended.

def create_best_DNN2(unit):
    """Build and compile a deeper DNN for the second submission.

    The network has three ReLU hidden layers of `unit` neurons each, a dropout
    layer with rate 0.5, and a single linear output. It is compiled with
    mean-squared-error loss, the Adam optimizer, and RMSE as a tracked metric.

    Args:
        unit: number of neurons in each hidden layer.

    Returns:
        The compiled `keras.Sequential` model.
    """
    layers = keras.layers
    model = keras.Sequential([
        layers.Dense(unit, input_dim=num_feats, activation='relu'),
        layers.Dense(unit, activation='relu'),
        layers.Dense(unit, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='linear'),
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


# Train for 100 epochs with batch size 20, monitoring the validation split,
# then evaluate on that same split.
DNN = create_best_DNN2(60)
DNN.fit(
    train_X,
    train_y,
    epochs=100,
    batch_size=20,
    verbose=1,
    validation_data=(val_X, val_y),
)
score = DNN.evaluate(val_X, val_y, verbose=1)


In [None]:
# Result of DNN.evaluate on the validation split (the model is compiled with
# MSE loss and an RMSE metric).
score

In [None]:
# Predict test-set grades, then flatten the predictions to a 1-D array with
# one entry per row.
grade = DNN.predict(test)
grade = grade.reshape((len(grade),))

In [None]:
# Build the submission table: test-set index as ID plus the predicted grade.
submission_columns = {'ID': test.index, 'grade': grade}
my_submission = pd.DataFrame(submission_columns)
# Any filename works; 'submission.csv' is used here. The DataFrame index is not written.
my_submission.to_csv('submission.csv', index=False)

### For next time

Ideas to try: TensorBoard logging and early stopping (both via Keras callbacks), batch normalization, and ways to make the grid search faster.