In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [426]:
def dummy_variable(columns, dataframe):
    """One-hot encode the given columns and return a new DataFrame.

    Every column named in ``columns`` is removed and replaced by 0.0/1.0
    indicator columns, one per distinct value. The indicator of the last
    (largest) value is dropped, making it the implicit reference level.
    Indicator columns are named ``"<value>_<column>"`` and are appended after
    the remaining columns, in the order the source columns are listed.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to expand.
    dataframe : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Encoded data with a fresh 0..n-1 index.
    """
    # reset_index returns a new object and nothing below deletes from it in
    # place, so the caller's frame stays intact (the previous version removed
    # the first encoded column from the caller's frame via ``del``).
    encoded = dataframe.reset_index(drop=True)

    for colname in columns:
        # get_dummies orders the indicator columns by sorted value; the cast
        # keeps the 0.0/1.0 float representation.
        indicators = pd.get_dummies(encoded[colname]).astype(float)
        indicators.columns = [str(level) + "_" + colname for level in indicators.columns]

        # Swap the raw column for all but the last indicator column.
        encoded = pd.concat(
            [encoded.drop(colname, axis=1), indicators.iloc[:, :-1]],
            axis=1,
        )

    return encoded

def cross_val(estimator, x, y, k=10, reg=True, random_state=None):
    """Return the mean k-fold cross-validation score of ``estimator``.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every fold.
    x : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Target values, in the same row order as ``x``.
    k : int, default 10
        Number of folds.
    reg : bool, default True
        If true, score each fold with mean squared error (lower is better);
        otherwise with accuracy (higher is better).
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous unseeded
        behaviour; pass an integer for repeatable scores.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # KFold and accuracy_score are not part of the notebook's import cell, so
    # the imports stay local to keep this function self-contained.
    from sklearn.model_selection import KFold
    from sklearn.metrics import accuracy_score, mean_squared_error
    import numpy as np

    def take(rows, positions):
        # Positional selection: KFold yields positions, not index labels, so
        # pandas objects must go through .iloc.
        if hasattr(rows, "iloc"):
            return rows.iloc[positions]
        return np.asarray(rows)[positions]

    metric = mean_squared_error if reg else accuracy_score
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    scores = []
    # Split the data that was passed in (previously the global ``X`` was used).
    for train_index, test_index in kf.split(x):
        estimator.fit(take(x, train_index), take(y, train_index))
        predictions = estimator.predict(take(x, test_index))
        scores.append(metric(take(y, test_index), predictions))
    return np.mean(scores)


In [197]:
# Read both semicolon-delimited course files and tag each row with the course
# it came from.
port = pd.read_csv('student/student-por.csv', sep=";").assign(**{'class': "portuguese"})
math = pd.read_csv('student/student-mat.csv', sep=";").assign(**{'class': 'math'})

# Rows agreeing on all of these columns are treated as duplicates
# (presumably the same student present in both files — confirm).
identity_columns = [
    "school", "sex", "age", "address", "famsize", "Pstatus", "Medu", "Fedu",
    "Mjob", "Fjob", "reason", "nursery", "internet",
]

# Stack the two tables (Portuguese rows first), keep the first row of every
# duplicate group, and renumber the rows 0..n-1.
data = (
    pd.concat([port, math], axis=0)
    .drop_duplicates(identity_columns)
    .reset_index(drop=True)
)

In [198]:
# Show the column labels of `data`.
data.columns

Index([u'school', u'sex', u'age', u'address', u'famsize', u'Pstatus', u'Medu',
       u'Fedu', u'Mjob', u'Fjob', u'reason', u'guardian', u'traveltime',
       u'studytime', u'failures', u'schoolsup', u'famsup', u'paid',
       u'activities', u'nursery', u'higher', u'internet', u'romantic',
       u'famrel', u'freetime', u'goout', u'Dalc', u'Walc', u'health',
       u'absences', u'G1', u'G2', u'G3', u'class'],
      dtype='object')

In [418]:
# Binary target: True where the weighted combination
# (2 * Walc + 5 * Dalc) / 7 is at least 3.
Y = (data['Walc'] * 2 + data['Dalc'] * 5) / 7.0
Y = Y >= 3

# Predictors, part 1: these four columns are one-hot encoded.
# (A previous `X = data.drop(['Walc'], axis=1)` was overwritten immediately
# and has been removed.)
columns = ['studytime', 'freetime', 'famrel', 'goout']
X = dummy_variable(columns, data.loc[:, columns])

# Predictors, part 2: these columns are appended unchanged. concat aligns on
# the row index, so this relies on `data` and the encoded frame both being
# indexed 0..n-1.
extra_col = ['failures', 'G1']
X = pd.concat([X, data.loc[:, extra_col]], axis=1)

In [434]:

# Number of rows in each class of the boolean target Y.
Y.groupby(Y).count()

False    584
True      78
dtype: int64

In [369]:
# Cross-validated mean squared error of a linear regression (cross_val
# defaults: 10 folds, regression scoring).
# NOTE(review): this cell's execution count (369) is lower than that of the
# cell defining X and Y (418), so the recorded output may not correspond to
# the current boolean Y — confirm with Restart & Run All.
linear_model = LinearRegression()
cross_val(estimator=linear_model, x=X, y=Y)

1.5893566910443397

In [315]:
# Cross-validated mean squared error of an 11-nearest-neighbours regressor.
# NOTE(review): executed (315) before the cell defining X and Y (418); the
# recorded output may be stale — confirm with Restart & Run All.
knn_regressor = KNeighborsRegressor(n_neighbors=11)
cross_val(estimator=knn_regressor, x=X, y=Y)

1.2781472053705927

In [316]:
# Cross-validated mean squared error of a depth-3 regression tree.
# NOTE(review): executed (316) before the cell defining X and Y (418); the
# recorded output may be stale — confirm with Restart & Run All.
tree_regressor = DecisionTreeRegressor(max_depth=3)
cross_val(estimator=tree_regressor, x=X, y=Y)

1.0152185466958918

In [317]:
# Cross-validated accuracy of a 4-nearest-neighbours classifier.
# NOTE(review): executed (317) before the cell defining X and Y (418); the
# recorded output may be stale — confirm with Restart & Run All.
knn_classifier = KNeighborsClassifier(n_neighbors=4)
cross_val(estimator=knn_classifier, x=X, y=Y, reg=False)

0.40933966530981458

In [427]:
# Class priors for LDA, ordered [P(False), P(True)] to match the sorted class
# labels. They are derived from Y rather than hard-coded (previously
# 584/662 and 78/662), so they cannot drift when the data changes.
priors = Y.value_counts(normalize=True).sort_index().tolist()
cross_val(estimator=LinearDiscriminantAnalysis(priors=priors), x=X, y=Y, reg=False)

0.87021709633649935

In [405]:
# NOTE(review): 456 is not derived anywhere in the visible notebook (the class
# counts displayed for Y are 584 and 78); presumably a class share from an
# earlier definition of the target — confirm or remove.
456/662.0

0.6888217522658611

In [377]:
# Cross-validated accuracy of a Gini decision tree limited to depth 15.
# NOTE(review): executed (377) before the cell defining X and Y (418); the
# recorded output may be stale — confirm with Restart & Run All.
tree_classifier = DecisionTreeClassifier(criterion='gini', max_depth=15)
cross_val(estimator=tree_classifier, x=X, y=Y, reg=False)

0.28991406603346903

In [433]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated accuracy of a 9-tree random forest with class weights
# inversely proportional to class frequency.
# - class_weight='balanced' replaces 'auto', which scikit-learn deprecated and
#   later removed.
# - random_state=0 is the seed the previous `random_state=False` resolved to;
#   the boolean read as if it disabled seeding.
# The folds inside cross_val are shuffled without a seed, so the score still
# varies between runs.
estimators = 9
forest = RandomForestClassifier(
    n_estimators=estimators,
    random_state=0,
    class_weight='balanced',
)
cross_val(estimator=forest, x=X, y=Y, reg=False)

0.86404341926729988

In [423]:
# Cross-validated accuracy of AdaBoost with a learning rate of 0.5.
# random_state=0 is the seed the previous `random_state=False` resolved to;
# the boolean read as if it disabled seeding.
rate = .5
ada = AdaBoostClassifier(learning_rate=rate, random_state=0)
cross_val(estimator=ada, x=X, y=Y, reg=False)


0.87620985979194932

In [417]:
from sklearn.ensemble import BaggingClassifier

# Cross-validated accuracy of a bagging ensemble of 20 base estimators.
# random_state=0 is the seed the previous `random_state=False` resolved to;
# the boolean read as if it disabled seeding.
estimators = 20
bagger = BaggingClassifier(n_estimators=estimators, random_state=0)
cross_val(estimator=bagger, x=X, y=Y, reg=False)

0.66327453640886491