Load packages:

In [99]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from math import sqrt
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Set the global NumPy seed so the stochastic steps below are repeatable.
# NOTE(review): this only governs estimators that fall back to NumPy's global
# RNG (i.e. those created without an explicit random_state) - confirm for each model.
np.random.seed(333)

Load the data and make necessary splits:

In [100]:
# Read the pre-split train/test matrices from CSV files in the working directory.
dfm_train, dfm_test = (pd.read_csv(path) for path in ("dfm_train.csv", "dfm_test.csv"))

# Column 0 is taken as the target; columns 1..1734 are taken as the features.
# NOTE(review): the 1735 cut-off is hard-coded - confirm it matches the CSV width.
train_cols, test_cols = dfm_train.columns, dfm_test.columns
X_train, y_train = dfm_train[train_cols[1:1735]], dfm_train[train_cols[0]]
X_test, y_test = dfm_test[test_cols[1:1735]], dfm_test[test_cols[0]]

Start with penalized regression models (ridge, lasso, elastic net), fitted to the label as a numeric target:

In [101]:
# Ridge regression: pick the penalty strength by 5-fold cross-validation.
ridge = Ridge()

# Candidate alphas, from 1e-15 up to 50.
parameters = {
    "alpha": [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 30, 40, 50],
}

# The search ranks candidates by negated mean squared error (higher is better).
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

ridge_regressor.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20, 30, 40, 50]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [102]:
# Which alpha won the cross-validated grid search?
print(ridge_regressor.best_params_) # recorded run: alpha = 30

{'alpha': 30}


In [103]:
# Tabulate the per-candidate cross-validation results; the test scores are
# negated MSE values and rank_test_score orders the candidates.
pd.DataFrame(ridge_regressor.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.033782,0.0016,0.004796,0.000745,1e-15,{'alpha': 1e-15},-0.64268,-0.443169,-0.545957,-0.664898,-0.634874,-0.586316,0.082255,13
1,0.023787,0.003308,0.005596,0.0012,1e-10,{'alpha': 1e-10},-0.64268,-0.443169,-0.545957,-0.664898,-0.634874,-0.586316,0.082255,12
2,0.022788,0.00256,0.005196,0.000979,1e-08,{'alpha': 1e-08},-0.64268,-0.443169,-0.545957,-0.664898,-0.634874,-0.586316,0.082255,11
3,0.02039,0.001854,0.004797,0.000749,0.0001,{'alpha': 0.0001},-0.642439,-0.443074,-0.545802,-0.664463,-0.634605,-0.586076,0.082156,10
4,0.02019,0.001166,0.004396,0.0008,0.001,{'alpha': 0.001},-0.640295,-0.442215,-0.54441,-0.660589,-0.632211,-0.583944,0.081277,9
5,0.019789,0.002226,0.004596,0.000799,0.01,{'alpha': 0.01},-0.620832,-0.434013,-0.531547,-0.625741,-0.610849,-0.564596,0.073736,8
6,0.01799,0.000632,0.004197,0.0004,1.0,{'alpha': 1},-0.284432,-0.248211,-0.302656,-0.235614,-0.319058,-0.277994,0.031683,7
7,0.01899,0.002529,0.004396,0.000491,5.0,{'alpha': 5},-0.205778,-0.205458,-0.239625,-0.181303,-0.230097,-0.212452,0.020559,6
8,0.022388,0.002332,0.005395,0.0008,10.0,{'alpha': 10},-0.18907,-0.202366,-0.219679,-0.170546,-0.213,-0.198932,0.017568,5
9,0.024986,0.001414,0.007195,0.000747,20.0,{'alpha': 20},-0.180818,-0.204715,-0.20564,-0.167917,-0.206427,-0.193103,0.015842,2


In [104]:
# Score the refit best ridge model on both splits.
best_ridge = ridge_regressor.best_estimator_

yhat_train_ridge = best_ridge.predict(X_train)
yhat_test_ridge = best_ridge.predict(X_test)

# Prints RMSE then R^2, first for the training split and then for the test split.
# Recorded run: train RMSE ~0.268 / R^2 ~0.707, test RMSE ~0.443 / R^2 ~0.208.
for y_true, y_hat in ((y_train, yhat_train_ridge), (y_test, yhat_test_ridge)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

# .score() on a regressor is R^2 (not a classification accuracy), so this
# repeats the test-set R^2 printed above.
best_ridge.score(X_test, y_test)

0.2675444313918127
0.7069524631759821
0.4427620409556435
0.20759633331345495


0.20759633331345498

In [105]:
# Lasso regression: same cross-validated search as for ridge, with the
# iteration cap searched alongside the penalty strength.
lasso = Lasso()

parameters = {
    "max_iter": [10000, 15000],
    "alpha": [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 30, 40, 50],
}

# Candidates are ranked by negated mean squared error over 5 folds.
lasso_regressor = GridSearchCV(
    estimator=lasso,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

lasso_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20, 30, 40, 50],
                         'max_iter': [10000, 15000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [106]:
# List the hyperparameter names exposed by the Lasso estimator; these are the
# keys that can appear in the grid-search parameter dictionary.
lasso.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [107]:
# Which (alpha, max_iter) combination won the cross-validated grid search?
print(lasso_regressor.best_params_) # recorded run: alpha = 0.01, max_iter = 10000

{'alpha': 0.01, 'max_iter': 10000}


In [108]:
# Score the refit best lasso model on both splits.
best_lasso = lasso_regressor.best_estimator_

yhat_train_lasso = best_lasso.predict(X_train)
yhat_test_lasso = best_lasso.predict(X_test)

# Prints RMSE then R^2, first for the training split and then for the test split.
# Recorded run: train RMSE ~0.343 / R^2 ~0.519, test RMSE ~0.447 / R^2 ~0.191
# (slightly below the ridge model's recorded test R^2 of ~0.208).
for y_true, y_hat in ((y_train, yhat_train_lasso), (y_test, yhat_test_lasso)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

# .score() on a regressor is R^2 (not a classification accuracy), so this
# repeats the test-set R^2 printed above.
best_lasso.score(X_test, y_test)

0.34272984504446125
0.5191051874470141
0.4473260686684912
0.19117581024126473


0.19117581024126473

In [109]:
# Elastic net: search the penalty strength, the L1/L2 mix and the iteration cap.
enet = ElasticNet()

parameters = {
    "alpha": [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 30, 40, 50],
    "max_iter": [10000, 15000],
    "l1_ratio": [.1, .5, .7, .9, .95, .99, 1],
}

# Candidates are ranked by negated mean squared error over 5 folds.
enet_regressor = GridSearchCV(
    estimator=enet,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=5,
)

enet_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20, 30, 40, 50],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
                         'max_iter': [10000, 15000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [110]:
# List the hyperparameter names exposed by the ElasticNet estimator; these are
# the keys that can appear in the grid-search parameter dictionary.
enet.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'l1_ratio', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [111]:
# Which (alpha, l1_ratio, max_iter) combination won the cross-validated grid search?
print(enet_regressor.best_params_) # recorded run: alpha = 0.01, l1_ratio = 0.5, max_iter = 10000

{'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 10000}


In [112]:
# Score the refit best elastic-net model on both splits.
best_enet = enet_regressor.best_estimator_

yhat_train_enet = best_enet.predict(X_train)
yhat_test_enet = best_enet.predict(X_test)

# Prints RMSE then R^2, first for the training split and then for the test split.
# Recorded run: train RMSE ~0.279 / R^2 ~0.680, test RMSE ~0.469 / R^2 ~0.113.
for y_true, y_hat in ((y_train, yhat_train_enet), (y_test, yhat_test_enet)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))
    print(r2_score(y_true, y_hat))

# .score() on a regressor is R^2 (not a classification accuracy), so this
# repeats the test-set R^2 printed above.
best_enet.score(X_test, y_test)

0.27946245119000845
0.6802627973317135
0.4685148244338771
0.11273696320698612


0.11273696320698612

That's all for the penalized regression models. Now on to a random forest classifier:

In [113]:
# Random forest classifier on the class labels.
# NOTE(review): 1734 trees matches the width of the feature-column slice taken
# when the data was loaded (columns 1..1734), not a row count - confirm intent.
clf = RandomForestClassifier(n_estimators=1734)

# Fit on a flat 1-D label array; passing a column vector triggered a warning, see
# https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
clf.fit(X_train, y_train.values.ravel())

yhat_train_rf, yhat_test_rf = clf.predict(X_train), clf.predict(X_test)

# The first printed line is the training-set accuracy, the second the test-set accuracy.
for y_true, y_hat in ((y_train, yhat_train_rf), (y_test, yhat_test_rf)):
    print("Accuracy:", metrics.accuracy_score(y_true, y_hat))

Accuracy: 1.0
Accuracy: 0.7142857142857143


Do some random forest interpretation:

In [114]:
# Pair every feature column with the forest's importance score (rounded to
# four decimals) and show the table with the most important features first.
names = list(X_train.columns)
scores = [np.round(score, 4) for score in clf.feature_importances_]

score_df = pd.DataFrame({'feature': names, 'importance_score': scores})

score_df.sort_values('importance_score', ascending=False)

Unnamed: 0,feature,importance_score
214,tax,0.0289
59,think,0.0177
190,yes,0.0169
105,rich,0.0143
242,work,0.0135
...,...,...
1112,district,0.0000
1113,durast,0.0000
1117,ran,0.0000
1119,trial,0.0000


Other models:

In [115]:
# Multinomial naive Bayes baseline: fit on the training split, then report
# accuracy (as a percentage) on both splits.
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train, y_train)

yhat_train_NB = Naive.predict(X_train)
yhat_test_NB = Naive.predict(X_test)

# Accuracy is the share of matching labels, so the argument order does not change it.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_NB)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_NB)*100)

Training set accuracy: 93.93939393939394
Test set accuracy: 70.40816326530613


In [116]:
# Support vector classifier with C = 1.7: fit on the training split, then
# report accuracy (as a percentage) on both splits.
SVM = svm.SVC(C=1.7)
SVM.fit(X_train, y_train)

yhat_train_SVM = SVM.predict(X_train)
yhat_test_SVM = SVM.predict(X_test)

# Accuracy is the share of matching labels, so the argument order does not change it.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_SVM)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_SVM)*100)

Training set accuracy: 99.13419913419914
Test set accuracy: 71.42857142857143


They do OK. Try some ensemble techniques:

In [123]:
# Voting ensemble: hard (majority) vote over a random forest, an SVM,
# multinomial naive Bayes and k-nearest neighbours.
np.random.seed(333)
rf_clf = RandomForestClassifier()
svm_clf = SVC()
knn_clf = KNeighborsClassifier()
nb_clf = naive_bayes.MultinomialNB()

voting_clf = VotingClassifier(
                [('rf',rf_clf),
                ('svm',svm_clf),
                 ('nb', nb_clf),
                ('knn',knn_clf)],
                voting = "hard")

# NOTE(review): voting_clf is fitted here and again inside the loop below. The
# first fit looks redundant, but it is kept because dropping it would presumably
# shift the draws taken from the seeded global RNG and change the recorded scores.
voting_clf.fit(X_train,y_train)

# Fit each base model on its own, plus the ensemble, and report test accuracy.
# The loop variable is deliberately NOT called `clf`: that name holds the
# 1734-tree random forest fitted earlier, which the feature-importance cell
# reads; reusing it here would silently replace that model with voting_clf.
for name, model in (["rf_clf",rf_clf],
                    ["svm_clf",svm_clf],["knn_clf",knn_clf], ["nb_clf", nb_clf],
                    ["voting_clf",voting_clf]):
    # fit the model
    model.fit(X_train,y_train)

    # predict
    y_pred = model.predict(X_test)

    # share of correctly predicted test labels (same metric as the other cells)
    acc = accuracy_score(y_test, y_pred)

    print(name, np.round(acc,5))

rf_clf 0.70408
svm_clf 0.69388
knn_clf 0.63265
nb_clf 0.70408
voting_clf 0.7551


The voting classifier can get north of 70% accuracy. Try bagging:

In [118]:
# Bagged SVMs: 1000 base classifiers, each trained on a bootstrap sample of
# 100 training rows.
bag_clf = BaggingClassifier(
    SVC(),
    n_estimators=1000,
    max_samples=100,
    bootstrap=True,
)
bag_clf.fit(X_train, y_train)

yhat_train_bag = bag_clf.predict(X_train)
yhat_test_bag = bag_clf.predict(X_test)

# Accuracy is the share of matching labels, so the argument order does not change it.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_bag)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_bag)*100)

Training set accuracy: 77.48917748917748
Test set accuracy: 60.204081632653065


Try boosting:

In [119]:
# AdaBoost over depth-1 decision trees (stumps): 1000 rounds, learning rate 0.5.
# NOTE(review): algorithm="SAMME.R" is deprecated in newer scikit-learn
# releases - check the installed version before re-running this cell.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=1000,
    algorithm="SAMME.R",
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

yhat_train_ada_clf = ada_clf.predict(X_train)
yhat_test_ada_clf = ada_clf.predict(X_test)

# Accuracy is the share of matching labels, so the argument order does not change it.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_ada_clf)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_ada_clf)*100)

Training set accuracy: 100.0
Test set accuracy: 70.40816326530613


The voting classifier may perform best, but the random forest classifier is not much worse. Because a single random forest is easier to interpret (e.g. through its feature importances), it may be the better model to work with:

In [139]:
# Final random forest with a fixed seed: 5000 trees, each drawing 230 training
# rows and considering up to 1734 features per split.
# NOTE(review): 1734 and 230 are hard-coded; 1734 matches the width of the
# feature-column slice taken at load time - confirm both against the data.
forest_clf = RandomForestClassifier(
    n_estimators=5000,
    max_features=1734,
    max_samples=230,
    random_state=333,
)
forest_clf.fit(X_train, y_train)

yhat_train_forest_clf = forest_clf.predict(X_train)
yhat_test_forest_clf = forest_clf.predict(X_test)

# Accuracy is the share of matching labels, so the argument order does not change it.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_forest_clf)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_forest_clf)*100)

Training set accuracy: 100.0
Test set accuracy: 74.48979591836735
