In [7]:
from pathlib import Path
import pandas as pd #data analysis
from sklearn.model_selection import train_test_split #machine learning
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge, LogisticRegression, LogisticRegressionCV
import statsmodels.formula.api as sm #statistics and econometrics
import matplotlib.pylab as plt #plotting library
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
import dmba

%matplotlib inline


In [8]:
# Read the two student CSV files and stack them row-wise into one frame.
df1 = pd.read_csv('student-mat.csv')
df2 = pd.read_csv('student-por.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# both files contribute the labels 0, 1, 2, ..., and any later index-based
# alignment (e.g. DataFrame.join) would match those repeated labels
# many-to-many and silently multiply rows.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'student-mat.csv'

In [None]:
# Weekly alcohol consumption derived from the 'Dalc' and 'Walc' columns.
# AlWeekly_Cons is their rounded average and AlWeekly_Sum their rounded sum.
# The average is kept alongside the sum because the sum's wider spread made
# the predictions too diffuse to be conclusive.
alcohol_total = df['Dalc'] + df['Walc']
df['AlWeekly_Cons'] = (alcohol_total / 2).round(0)
df['AlWeekly_Sum'] = alcohol_total.round(0)

# Only the final grade G3 is needed, so G1 and G2 are dropped; 'Dalc' and
# 'Walc' are dropped because the weekly columns above replace them.
df = df.drop(columns=['G1', 'G2', 'Dalc', 'Walc'])


In [None]:
# Label each student 'yes' (academically successful) or 'no' from the final
# grade G3. The cut-off is 11, one point above the average of 10.
passed = df['G3'].ge(11)
df['Success'] = passed.map({True: 'yes', False: 'no'})
print(df.head())

In [None]:
# Display the prepared frame for inspection.
# Disabled alternative: load the frame from 'student.csv' instead.
#df = pd.read_csv('student.csv')
df
# Disabled experiment: cast 'Medu' and 'Fedu' to the category dtype
# (presumably so that pd.get_dummies would dummy-encode them -- TODO confirm).
#df['Medu'] = df['Medu'].astype('category')
#df['Fedu'] = df['Fedu'].astype('category')

In [None]:
# Columns used as model inputs.
# TODO: re-check this selection after running the whole notebook.
predictors = [
    'famsize',
    'Medu',
    'Fedu',
    'Mjob',
    'Fjob',
    'guardian',
    'famsup',
    'Pstatus',
]
# Disabled alternative: the same selection without 'Fedu'.
#predictors = ['famsize', 'Medu', 'Mjob', 'Fjob', 'guardian', 'famsup', 'Pstatus']

In [None]:
# Name of the outcome column for the regression models.
outcome = 'AlWeekly_Cons'

# Build the design matrix: categorical predictors are expanded into dummy
# columns, dropping the first level of each (drop_first=True).
# TODO: check the row indexing, and decide whether 'Medu' and 'Fedu' should
# be converted to categoricals before encoding.
predictor_frame = df.loc[:, predictors]
X = pd.get_dummies(predictor_frame, drop_first=True)

In [None]:
#print(X)

In [None]:
# Outcome series matching the rows of X.
y = df[outcome]

# Hold out 40% of the rows for validation; random_state=1 fixes the shuffle.
partitions = train_test_split(X, y, test_size=0.4, random_state=1)
train_X, valid_X, train_y, valid_y = partitions

# Linear regression fitted on the training partition only.
student_lm = LinearRegression()
student_lm.fit(train_X, train_y)

In [None]:
# Intercept and per-predictor coefficients of the fitted linear model.
print('intercept ', student_lm.intercept_)
# Lift the row limit so printed frames are shown in full from here on.
pd.set_option('display.max_rows', None)
coefficient_table = pd.DataFrame({'Predictor': X.columns, 'coefficient': student_lm.coef_})
print(coefficient_table)

# Performance measures on the training partition.
train_fitted = student_lm.predict(train_X)
regressionSummary(train_y, train_fitted)

In [None]:
# Fitted values on the training partition, scored with the dmba helpers.
pred_y = student_lm.predict(train_X)

training_scores = (
    ('adjusted r2 : ', adjusted_r2_score),
    ('AIC : ', AIC_score),
    ('BIC : ', BIC_score),
)
for label, score in training_scores:
    print(label, score(train_y, pred_y, student_lm))

Linear regression produces continuous (decimal) predictions, but the outcome is an integer level (1, 2, 3, 4, 5). The predicted values are therefore rounded to the nearest integer and clipped to the range 1–5.

In [None]:
import numpy as np

# Predict on the validation partition, then force the continuous predictions
# onto the outcome's integer scale: round first, then clip into [1, 5].
student_lm_pred = student_lm.predict(valid_X)
student_lm_pred_rounded = student_lm_pred.round()
student_lm_pred_clipped = student_lm_pred_rounded.clip(1, 5)

# Predicted vs. actual values with the residual (actual - predicted).
result = pd.DataFrame({
    'Predicted': student_lm_pred_clipped,
    'Actual': valid_y,
    'Residual': valid_y - student_lm_pred_clipped,
})
print(result)

# Common accuracy measures for the rounded-and-clipped predictions.
regressionSummary(valid_y, student_lm_pred_clipped)

In [None]:
import matplotlib.pyplot as plt

# Validation predictions on the outcome's integer scale: round, then clip
# into [1, 5]. Recomputed here so the cell does not depend on leftover state.
student_lm_pred = student_lm.predict(valid_X)
student_lm_pred_rounded = np.round(student_lm_pred)
student_lm_pred_clipped = np.clip(student_lm_pred_rounded, 1, 5)

# Residuals (actual - predicted) of the rounded-and-clipped predictions.
all_residuals = valid_y - student_lm_pred_clipped

# Share of validation rows predicted within RESIDUAL_TOLERANCE levels of the
# actual value. The previous cut-off of +/-1406 was far wider than any
# residual that predictions clipped to [1, 5] can produce against the 1-5
# outcome described in the notebook, so it always reported 1.0 and carried
# no information.
RESIDUAL_TOLERANCE = 1
within_tolerance = all_residuals.abs() <= RESIDUAL_TOLERANCE
print(within_tolerance.mean())

# Histogram of the residuals.
ax = pd.DataFrame({'Residuals': all_residuals}).hist(bins=25)
# DataFrame.hist may return an array of axes; np.ravel handles both shapes.
np.ravel(ax)[0].set(xlabel='Residual (actual - predicted)', ylabel='Count')

plt.tight_layout()
plt.show()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def _standardized(model):
    """Return a pipeline that standardizes the features and then fits `model`."""
    return Pipeline([
        ['normalize', StandardScaler()],
        ['model', model],
    ])


# Lasso with a fixed penalty (alpha=1).
lasso = _standardized(Lasso(alpha=1))
lasso.fit(train_X, train_y)
regressionSummary(valid_y, lasso.predict(valid_X))

# Lasso with the penalty selected by 5-fold cross-validation.
lasso_cv = _standardized(LassoCV(cv=5))
lasso_cv.fit(train_X, train_y)
regressionSummary(valid_y, lasso_cv.predict(valid_X))
lasso_cv_model = lasso_cv['model']
print('Lasso-CV chosen regularization: ', lasso_cv_model.alpha_)
print(lasso_cv_model.coef_)

# Ridge with a fixed penalty (alpha=1).
ridge = _standardized(Ridge(alpha=1))
ridge.fit(train_X, train_y)
regressionSummary(valid_y, ridge.predict(valid_X))

# Bayesian ridge; the reported regularization is the ratio lambda_ / alpha_
# of the fitted model.
bayesianRidge = _standardized(BayesianRidge())
bayesianRidge.fit(train_X, train_y)
regressionSummary(valid_y, bayesianRidge.predict(valid_X))
bayes_model = bayesianRidge['model']
print('Bayesian ridge chosen regularization: ',
      bayes_model.lambda_ / bayes_model.alpha_)

In [None]:
# Plain linear regression on standardized features, scored on the
# validation partition.
linearRegression = Pipeline(steps=[
    ['normalize', StandardScaler()],
    ['model', LinearRegression()],
])
linearRegression.fit(train_X, train_y)
linear_valid_pred = linearRegression.predict(valid_X)
regressionSummary(valid_y, linear_valid_pred)

In [None]:
# Side-by-side coefficients of the standardized models, one row per feature.
coefficient_columns = {
    'features': train_X.columns,
    'linear regression': linearRegression['model'].coef_,
    'lassoCV': lasso_cv['model'].coef_,
    'bayesianRidge': bayesianRidge['model'].coef_,
}
pd.DataFrame(coefficient_columns)

In [None]:
# Refit the linear regression of AlWeekly_Cons on all dummy-encoded predictors
# with the statsmodels formula API, to get its full summary table.

# Attach the outcome by position, not with an index join. train_X and train_y
# come out of the same train_test_split call, so their rows line up
# one-to-one. DataFrame.join matches on index labels instead, and would
# multiply rows whenever labels repeat (as they do after concatenating two
# files without resetting the index).
train_df = train_X.assign(**{train_y.name: train_y.to_numpy()})

# NOTE: `predictors` and `student_lm` are rebound here; from this point on
# they hold the dummy column names and the statsmodels results object.
predictors = train_X.columns
formula = 'AlWeekly_Cons ~ ' + ' + '.join(predictors)

student_lm = sm.ols(formula=formula, data=train_df).fit()
print(student_lm.summary())

Logistic Regression

In [None]:
# Binary split of alcohol consumption: 0 when AlWeekly_Sum is below 4,
# otherwise 1.
is_low_consumption = df['AlWeekly_Sum'] < 4
df['Alcohol_Consumption_High'] = (~is_low_consumption).astype(int)



In [None]:
# classificationSummary is otherwise only imported in a later cell; import it
# here so this cell runs on a fresh kernel instead of raising NameError.
from dmba import classificationSummary

# Binary outcome for the logistic regression.
y_LR = df['Alcohol_Consumption_High']

#classes = ['low', 'high']

# Split into training and validation partitions (40% validation, fixed seed).
train_X, valid_X, train_y_LR, valid_y_LR = train_test_split(X, y_LR, test_size=0.4,
                                                      random_state=1)

# L1-penalized logistic regression; the penalty strength is selected by
# 5-fold cross-validation.
logit_red = LogisticRegressionCV(penalty="l1", solver='liblinear', cv=5)
logit_red.fit(train_X, train_y_LR)

# Predict once and reuse the result for both the AIC and the confusion matrix.
valid_pred_LR = logit_red.predict(valid_X)

pd.set_option('display.width', 100)
print('regularization', logit_red.C_)
print('intercept ', logit_red.intercept_[0])
print(pd.DataFrame({'coeff': logit_red.coef_[0]}, index=X.columns).transpose())
pd.reset_option('display.width')
# df counts one parameter per predictor column plus one more (the "+ 1").
print('AIC', AIC_score(valid_y_LR, valid_pred_LR, df=len(train_X.columns) + 1))


# confusion matrix
classificationSummary(valid_y_LR, valid_pred_LR)

Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Random forest of 500 trees fitted on the training partition; random_state=1
# fixes the seed.
# NOTE(review): this fits on train_y (the AlWeekly_Cons outcome), not on
# train_y_LR from the logistic-regression cell -- confirm that is intended.
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=1,
)
rf.fit(train_X, train_y)

In [None]:
# Feature importances of the forest, with their standard deviation across the
# individual trees used as error bars.
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

# Keep the table under its own name: binding it to `df` would overwrite the
# student dataset and break any cell that is run (or re-run) afterwards.
importance_df = pd.DataFrame({'feature': train_X.columns, 'importance': importances, 'std': std})
importance_df = importance_df.sort_values('importance', ascending=False)
print(importance_df)

ax = importance_df.plot(kind='barh', xerr='std', x='feature', legend=False)
ax.set_ylabel('')
ax.set_xlabel('importance')

plt.tight_layout()
plt.show()

In [None]:
from dmba import plotDecisionTree, classificationSummary, regressionSummary

# Classification summary of the random forest on the validation partition.
rf_valid_pred = rf.predict(valid_X)
classificationSummary(valid_y, rf_valid_pred)


In [None]:
#predict Success