In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [53]:
def dummy_variable(columns, dataframe):
    """One-hot encode selected categorical columns of a DataFrame.

    Every column named in ``columns`` is removed and replaced by 0.0/1.0
    indicator columns named ``"<category>_<column>"``, one per distinct value
    in sorted order. The indicator of the last (largest) category is omitted,
    so a column with k distinct values yields k - 1 indicators and the omitted
    category is encoded as all zeros.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns of ``dataframe`` to encode.
    dataframe : pandas.DataFrame
        Input data. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The columns that were not encoded (in their original order) followed
        by the indicator columns, grouped in the order given by ``columns``.
        The index is reset to 0..n-1. When ``columns`` is empty the input
        object itself is returned unchanged.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``dataframe``.
    """
    columns = list(columns)
    if not columns:
        return dataframe

    indicator_frames = []
    for colname in columns:
        values = dataframe[colname].to_numpy()
        # np.unique returns the distinct values sorted, which fixes both the
        # order of the indicator columns and which category gets dropped.
        categories = np.unique(values)
        # Compare every row against every category: (n, 1) == (1, k) -> (n, k).
        one_hot = (values[:, None] == categories[None, :]).astype(float)
        names = [str(category) + "_" + colname for category in categories]
        # Leave out the last category so the indicators are not collinear.
        indicator_frames.append(
            pd.DataFrame(one_hot[:, :-1], columns=names[:-1])
        )

    # drop() returns a new frame, so the caller's DataFrame is never touched;
    # the index is reset so rows line up positionally with the indicators.
    untouched = dataframe.drop(columns=columns).reset_index(drop=True)
    return pd.concat([untouched] + indicator_frames, axis=1)

def cross_val(estimator, x, y, k=10, reg=True, random_state=None):
    """Score an estimator with shuffled k-fold cross-validation.

    The estimator is refitted on the training part of every fold and scored
    on the held-out part; the mean of the per-fold scores is returned.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    x : pandas.DataFrame or numpy.ndarray
        Feature matrix, one row per sample.
    y : pandas.Series or numpy.ndarray
        Targets, in the same row order as ``x``.
    k : int, default 10
        Number of folds.
    reg : bool, default True
        If true the per-fold score is the mean squared error, otherwise it
        is the classification accuracy.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the shuffle unseeded, so
        repeated calls may return different scores.

    Returns
    -------
    float
        Mean score over the ``k`` folds.
    """
    import numpy as np
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn.model_selection import KFold

    metric = mean_squared_error if reg else accuracy_score

    # KFold yields positional indices. Pandas objects must therefore be
    # sliced through .iloc (plain [] on a Series looks up index labels);
    # numpy arrays are already positional.
    x_rows = x.iloc if hasattr(x, "iloc") else x
    y_rows = y.iloc if hasattr(y, "iloc") else y

    folds = KFold(n_splits=k, shuffle=True, random_state=random_state)
    scores = []
    # Split the argument that was passed in, not a notebook-level global.
    for train_index, test_index in folds.split(x):
        estimator.fit(x_rows[train_index], y_rows[train_index])
        predictions = estimator.predict(x_rows[test_index])
        scores.append(metric(y_rows[test_index], predictions))
    return np.mean(scores)


In [54]:
# Load the Portuguese-course and maths-course tables (semicolon-separated).
port = pd.read_csv('student-por.csv', sep=";")
math = pd.read_csv('student-mat.csv', sep=";")

# Join the two tables on the attributes listed below (merge's default inner
# join). Columns present in both tables but not used as keys get the suffix
# _x (maths side) or _y (Portuguese side).
data = math.merge(
    port,
    on=[
        "school", "sex", "age", "address", "famsize", "Pstatus",
        "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet",
    ],
)

In [55]:
# Number of columns in the Portuguese-course table.
port.shape[1]

33

In [56]:
# Columns of the merged table: join keys appear once, every other column
# appears twice with an _x / _y suffix.
data.columns

Index([u'school', u'sex', u'age', u'address', u'famsize', u'Pstatus', u'Medu',
       u'Fedu', u'Mjob', u'Fjob', u'reason', u'guardian_x', u'traveltime_x',
       u'studytime_x', u'failures_x', u'schoolsup_x', u'famsup_x', u'paid_x',
       u'activities_x', u'nursery', u'higher_x', u'internet', u'romantic_x',
       u'famrel_x', u'freetime_x', u'goout_x', u'Dalc_x', u'Walc_x',
       u'health_x', u'absences_x', u'G1_x', u'G2_x', u'G3_x', u'guardian_y',
       u'traveltime_y', u'studytime_y', u'failures_y', u'schoolsup_y',
       u'famsup_y', u'paid_y', u'activities_y', u'higher_y', u'romantic_y',
       u'famrel_y', u'freetime_y', u'goout_y', u'Dalc_y', u'Walc_y',
       u'health_y', u'absences_y', u'G1_y', u'G2_y', u'G3_y'],
      dtype='object')

In [57]:
# Remove the _x (maths-side) copies of the non-grade attributes; the maths
# grades G1_x, G2_x and G3_x are kept.
data = data.drop(
    columns=[
        'guardian_x', 'traveltime_x', 'studytime_x', 'failures_x',
        'schoolsup_x', 'famsup_x', 'paid_x', 'activities_x', 'higher_x',
        'romantic_x', 'famrel_x', 'freetime_x', 'goout_x', 'Dalc_x',
        'Walc_x', 'health_x', 'absences_x',
    ]
)

In [58]:
# Columns left after dropping the duplicated _x attributes.
data.columns

Index([u'school', u'sex', u'age', u'address', u'famsize', u'Pstatus', u'Medu',
       u'Fedu', u'Mjob', u'Fjob', u'reason', u'nursery', u'internet', u'G1_x',
       u'G2_x', u'G3_x', u'guardian_y', u'traveltime_y', u'studytime_y',
       u'failures_y', u'schoolsup_y', u'famsup_y', u'paid_y', u'activities_y',
       u'higher_y', u'romantic_y', u'famrel_y', u'freetime_y', u'goout_y',
       u'Dalc_y', u'Walc_y', u'health_y', u'absences_y', u'G1_y', u'G2_y',
       u'G3_y'],
      dtype='object')

In [59]:
# data = data.drop_duplicates(["school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"])
# data = data.reset_index(drop=True)

In [60]:
# Target: the Walc_y column of the merged table.
Y = data['Walc_y']
#Y = (data['Walc'] * 2 + data['Dalc'] * 5) / 7.0
#Y = Y >= 3

# Columns that are one-hot encoded by dummy_variable.
columns = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu',
           'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet',
           'guardian_y', 'traveltime_y', 'studytime_y',
           'schoolsup_y', 'famsup_y', 'paid_y', 'activities_y',
           'higher_y', 'romantic_y', 'famrel_y', 'freetime_y', 'goout_y',
           'Dalc_y', 'health_y']
# Columns that are passed through as plain numeric features.
extra_col = ['failures_y', 'G1_x', 'G2_x', 'G3_x',
             'G1_y', 'G2_y', 'G3_y', 'absences_y', 'age']

# Encode an explicit copy so `data` itself is never modified.
X = dummy_variable(columns, data.loc[:, columns].copy())
# dummy_variable returns a frame indexed 0..n-1; reset the numeric block the
# same way so the two halves are joined row by row, not by index label.
X = pd.concat([X, data.loc[:, extra_col].reset_index(drop=True)], axis=1)

In [61]:
# Preview the first rows of the assembled feature matrix.
X.head()

Unnamed: 0,GP_school,F_sex,R_address,GT3_famsize,A_Pstatus,0_Medu,1_Medu,2_Medu,3_Medu,0_Fedu,...,4_health_y,failures_y,G1_x,G2_x,G3_x,G1_y,G2_y,G3_y,absences_y,age
0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,5,6,6,0,11,11,4,18
1,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0,5,5,6,9,11,11,2,17
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0,7,8,10,12,13,12,6,15
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,15,14,15,14,14,14,0,15
4,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0,6,10,10,11,13,13,0,16


In [62]:
# Number of rows for each distinct value of the target.
Y.groupby(Y).count()

Walc_y
1    144
2     86
3     76
4     49
5     27
Name: Walc_y, dtype: int64

In [68]:
# Linear discriminant analysis: mean accuracy over the default 10 folds.
cross_val(estimator=LinearDiscriminantAnalysis(), x=X, y=Y, reg=False)

0.4426450742240215

In [75]:
# Decision tree (Gini criterion, depth capped at 15): mean accuracy over the default 10 folds.
cross_val(estimator=DecisionTreeClassifier(criterion='gini', max_depth=15), x=X, y=Y, reg=False)

0.38751686909581651

In [152]:
from sklearn.ensemble import RandomForestClassifier
estimators = 50
# class_weight='balanced' replaces the removed 'auto' option; it reweights
# classes inversely to their frequency. random_state=0 is the seed that the
# previous `random_state=False` resolved to.
cross_val(estimator=RandomForestClassifier(n_estimators=estimators, random_state=0, class_weight='balanced'), x=X, y=Y, reg=False)

0.48657219973009447

In [113]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: mean accuracy over the default 10 folds.
# NOTE(review): max_depth=50 is very deep for boosted trees - confirm this is intentional.
cross_val(estimator=GradientBoostingClassifier(max_depth = 50), x=X, y=Y, reg=False)

0.41909581646423744

In [121]:
from sklearn.svm import SVC
# Support vector classifier with default settings on the unscaled features:
# mean accuracy over the default 10 folds.
cross_val(estimator=SVC(), x=X, y=Y, reg=False)

0.38785425101214577

In [103]:
rate = .5
# Seed written as the integer 0: the boolean False that was passed before is
# interpreted as that same seed, but reads like a switch rather than a seed.
ada = AdaBoostClassifier(learning_rate=rate, random_state=0)
# AdaBoost: mean accuracy over the default 10 folds.
cross_val(estimator=ada, x=X, y=Y, reg=False)

0.40829959514170044

In [140]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings: mean accuracy over the default 10 folds.
log = LogisticRegression()
cross_val(estimator=log,x=X, y=Y, reg=False)

0.43927125506072873

In [144]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform fits the scaler on X and transforms X in a single call; the
# result is the same as calling fit() and then transform().
# The array it returns is wrapped in a DataFrame without column names, so the
# scaled columns are labelled 0..m-1.
# NOTE(review): the scaler is fitted on every row of X, so any later
# cross-validation on X_scaled_d has seen test-fold statistics during scaling.
X_scaled_d = pd.DataFrame(scaler.fit_transform(X))

In [150]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(200,200))
# Scale inside a pipeline so that, within every fold, the scaler is fitted on
# the training rows only and then applied to the held-out rows. Scaling the
# whole matrix up front lets test-fold statistics leak into training.
scaled_mlp = make_pipeline(StandardScaler(), mlp)

# Two-hidden-layer MLP on standardised features: mean accuracy over the default 10 folds.
cross_val(estimator=scaled_mlp, x=X, y=Y, reg=False)

0.38529014844804321

In [712]:
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import train_test_split

# train_x, test_x, train_y, test_y = train_test_split(X,Y, train_size=.1)
# log.fit(train_x, train_y)
# pred_y = log.predict(test_x)
# c = confusion_matrix(test_y, pred_y)
# print c[1,1] / float(c[1,1] + c[1,0])
# c