Load packages:

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from math import sqrt
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global random generator once, up front, so that estimators
# created without an explicit random_state draw from a fixed stream.
SEED = 333
np.random.seed(SEED)

Load the data and make necessary splits:

In [3]:
# Read the training and test tables from the working directory.
dfm_train = pd.read_csv("dfm_train.csv")
dfm_test = pd.read_csv("dfm_test.csv")

# Positional split: column 0 is used as the target, columns 1..1734 as the
# predictors (the slice silently stops early if a file has fewer columns).
y_train = dfm_train.iloc[:, 0]
X_train = dfm_train.iloc[:, 1:1735]
y_test = dfm_test.iloc[:, 0]
X_test = dfm_test.iloc[:, 1:1735]

Start with penalized classification models:

In [187]:
# Build a ridge (L2-penalised) classifier; alpha=30 is the regularisation strength.
# Labels are passed with to_numpy() instead of Series.ravel(), which is
# deprecated since pandas 2.2 -- the resulting 1-D array is the same.
ridge = RidgeClassifier(alpha=30).fit(X_train, y_train.to_numpy())
yhat_train_ridge = ridge.predict(X_train)
yhat_test_ridge = ridge.predict(X_test)

# First line printed: accuracy on the training set; second: on the test set.
print("Accuracy:",metrics.accuracy_score(y_train, yhat_train_ridge))
print("Accuracy:",metrics.accuracy_score(y_test, yhat_test_ridge))

Accuracy: 0.9826839826839827
Accuracy: 0.7040816326530612


In [188]:
# Build an elastic-net logistic classifier trained by stochastic gradient descent.
# - loss='log_loss' is the current name of the logistic loss; the old spelling
#   'log' was deprecated in scikit-learn 1.1 and is rejected by later releases.
# - l1_ratio=0.5 mixes the L1 and L2 penalties equally; alpha scales the penalty.
# - Labels are passed with to_numpy() instead of Series.ravel(), which is
#   deprecated since pandas 2.2 -- the resulting 1-D array is the same.
enet = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=0.01,
    l1_ratio=0.5,
    max_iter=10000,
).fit(X_train, y_train.to_numpy())
yhat_train_enet = enet.predict(X_train)
yhat_test_enet = enet.predict(X_test)

# First line printed: accuracy on the training set; second: on the test set.
print("Accuracy:",metrics.accuracy_score(y_train, yhat_train_enet))
print("Accuracy:",metrics.accuracy_score(y_test, yhat_test_enet))

Accuracy: 0.961038961038961
Accuracy: 0.6836734693877551


That's all for penalized regression classifiers. Now on to random forest:

In [113]:
# Random forest whose tree count (1734) equals the upper bound on the number of
# feature columns selected by the 1:1735 slice that built X_train / X_test.
clf = RandomForestClassifier(n_estimators=1734)

# Fit on a flattened 1-D label array. Passing a column vector triggered a
# warning originally; see
# https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
clf.fit(X_train, np.ravel(y_train.values))

# Predict on both splits.
yhat_train_rf = clf.predict(X_train)
yhat_test_rf = clf.predict(X_test)

# First line printed: training accuracy; second: test accuracy.
print("Accuracy:", accuracy_score(y_train, yhat_train_rf))
print("Accuracy:", accuracy_score(y_test, yhat_test_rf))

Accuracy: 1.0
Accuracy: 0.7142857142857143


Do some random forest interpretation:

In [114]:
# Pair each training column with the importance the fitted forest `clf`
# assigned to it, rounding the importances to four decimals.
_pairs = list(zip(X_train.columns, clf.feature_importances_))
names = [_feature for _feature, _weight in _pairs]
scores = [np.round(_weight, 4) for _feature, _weight in _pairs]

score_df = pd.DataFrame({'feature': names, 'importance_score': scores})

# Show the table with the most important features first.
score_df.sort_values(by='importance_score', ascending=False)

Unnamed: 0,feature,importance_score
214,tax,0.0289
59,think,0.0177
190,yes,0.0169
105,rich,0.0143
242,work,0.0135
...,...,...
1112,district,0.0000
1113,durast,0.0000
1117,ran,0.0000
1119,trial,0.0000


Other models:

In [115]:
# Multinomial naive Bayes: fit on the training split, then score both splits.
Naive = naive_bayes.MultinomialNB().fit(X_train, y_train)

yhat_train_NB = Naive.predict(X_train)
yhat_test_NB = Naive.predict(X_test)

# Accuracies are reported as percentages.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_NB)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_NB)*100)

Training set accuracy: 93.93939393939394
Test set accuracy: 70.40816326530613


In [116]:
# Support vector classifier with C=1.7: fit on the training split, then score
# both splits.
SVM = svm.SVC(C=1.7).fit(X_train, y_train)

yhat_train_SVM = SVM.predict(X_train)
yhat_test_SVM = SVM.predict(X_test)

# Accuracies are reported as percentages.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_SVM)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_SVM)*100)

Training set accuracy: 99.13419913419914
Test set accuracy: 71.42857142857143


They do OK. Try some ensemble techniques:

In [123]:
# Voting ensemble: majority ("hard") vote over four base classifiers.
# Re-seed so the unseeded random forest draws from a fixed random stream.
np.random.seed(333)
rf_clf = RandomForestClassifier()
svm_clf = SVC()
knn_clf = KNeighborsClassifier()
nb_clf = naive_bayes.MultinomialNB()

voting_clf = VotingClassifier(
    [('rf', rf_clf),
     ('svm', svm_clf),
     ('nb', nb_clf),
     ('knn', knn_clf)],
    voting="hard")

# Fit and score every base model and then the ensemble.
# - The loop variable is `model`, not `clf`: reusing `clf` would overwrite the
#   fitted random forest of that name that the feature-importance cell reads.
# - voting_clf is fitted exactly once, here; VotingClassifier trains clones of
#   its estimators, so the base models still need their own fit below.
for name, model in (("rf_clf", rf_clf),
                    ("svm_clf", svm_clf),
                    ("knn_clf", knn_clf),
                    ("nb_clf", nb_clf),
                    ("voting_clf", voting_clf)):
    # fit the model
    model.fit(X_train, y_train)

    # predict on the held-out split
    y_pred = model.predict(X_test)

    # test accuracy (fraction of correct predictions)
    acc = accuracy_score(y_test, y_pred)

    print(name, np.round(acc,5))

rf_clf 0.70408
svm_clf 0.69388
knn_clf 0.63265
nb_clf 0.70408
voting_clf 0.7551


The voting classifier can get north of 70% accuracy. Try bagging:

In [4]:
# Bagging: 5000 support vector classifiers, each trained on 230 rows drawn
# with replacement from the training split.
bag_clf = BaggingClassifier(
    SVC(),
    bootstrap=True,
    max_samples=230,
    n_estimators=5000,
)
bag_clf.fit(X_train, y_train)

yhat_train_bag = bag_clf.predict(X_train)
yhat_test_bag = bag_clf.predict(X_test)

# Accuracies are reported as percentages.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_bag)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_bag)*100)

Training set accuracy: 99.13419913419914
Test set accuracy: 70.40816326530613


Try boosting:

In [119]:
# AdaBoost over 1000 decision stumps (depth-1 trees) with learning rate 0.5.
# The `algorithm` argument is intentionally left at the library default:
# "SAMME.R" was that default in the scikit-learn releases that support it, and
# newer releases deprecate the option and reject "SAMME.R" outright, so naming
# it explicitly makes the cell fail on an up-to-date install.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=1000,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

yhat_train_ada_clf = ada_clf.predict(X_train)
print("Training set accuracy:", accuracy_score(y_train, yhat_train_ada_clf)*100)
yhat_test_ada_clf = ada_clf.predict(X_test)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_ada_clf)*100)

Training set accuracy: 100.0
Test set accuracy: 70.40816326530613


The voting classifier might work best, but the random forest classifier doesn't do much worse, so it may be preferable to work with the random forest because it is easier to interpret:

In [200]:
# Final random forest: 5000 trees, each grown on 230 sampled rows and allowed
# to consider up to 1734 features per split; random_state pins the result.
forest_clf = RandomForestClassifier(
    n_estimators=5000,
    max_features=1734,
    max_samples=230,
    random_state=333,
)
forest_clf.fit(X_train, y_train)

yhat_train_forest_clf = forest_clf.predict(X_train)
yhat_test_forest_clf = forest_clf.predict(X_test)

# Accuracies are reported as percentages.
print("Training set accuracy:", accuracy_score(y_train, yhat_train_forest_clf)*100)
print("Test set accuracy:", accuracy_score(y_test, yhat_test_forest_clf)*100)

Training set accuracy: 100.0
Test set accuracy: 74.48979591836735
