In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
from sklearn.svm import SVC
%matplotlib inline

# Read the students' maths performance data from the comma-separated file
# and preview the first few rows.
df = pd.read_csv('student-mat.csv', delimiter=',')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4.0,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5.0,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,,other,...,4.0,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3.0,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4.0,3,2,1,2,5,4,6,10,10


## Prepare Data

In [4]:
def clean_data(df):
    '''
    INPUT
    df - pandas dataframe that contains a 'G3' column

    OUTPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector

    This function cleans df using the following steps to produce X and y:
    1. Take the G3 column (final grade) as y.
    2. Take every other column as X.
    3. Fill missing values in each numeric column of X with that column's mean.
    4. Replace each object-typed column of X with 0/1 dummy columns named
       '<column>_<value>', dropping the first level of each column. A missing
       categorical value produces all-zero dummies, i.e. it is encoded the same
       way as the dropped first level.

    The input dataframe is left unmodified.
    '''

    #1. Create y as the G3 column (final grade)
    y = df['G3']
    #2. Create X as all the columns that are not the G3 column
    X = df.drop(columns=['G3'])
    #3. Rows are not dropped based on G3; this function assumes G3 has no missing values.
    #4. For each numeric missing variable in X, fill the column with the mean value of the column.
    #   'number' selects every numeric width, not only the platform default int/float.
    num_cols = X.select_dtypes(include=['number']).columns
    for col in num_cols:
        # Assign the result back instead of fillna(inplace=True) on X[col]:
        # the in-place call acts on a temporary selection and is not guaranteed
        # to update X (it never does under pandas Copy-on-Write).
        X[col] = X[col].fillna(X[col].mean())
    #5. For each categorical variable in X, fill the column with zero one encoding.
    #   One get_dummies call replaces every listed column with its dummies,
    #   appended after the remaining columns in the order of cat_cols.
    cat_cols = list(X.select_dtypes(include=['object']).columns)
    X = pd.get_dummies(X, columns=cat_cols, prefix_sep='_', drop_first=True)
    return X, y
    


In [5]:
# Build the feature matrix and the response vector from the raw frame,
# then show the dimensions of the feature matrix.
X, y = clean_data(df)
X.shape

(395, 41)

In [6]:
# Collect the names of any feature columns that still contain a missing value;
# an empty set means the cleaning step left no gaps.
has_missing = X.isnull().any()
nulls = set(X.columns[has_missing])
nulls

set()

## Data Modeling

In [9]:
#Let's write a function that would fit a Linear Regression model and print the r-squared
def model_linear_regression(X,y,test_split,rnd_state):
    '''
    INPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector
    test_split - passed to train_test_split as test_size (e.g. 0.3 holds out 30% of the rows)
    rnd_state - passed to train_test_split as random_state, making the split reproducible

    OUTPUT
    None. Fits a linear regression on the training split and prints the
    r-squared score on the held-out test split together with the number of
    test rows. The fitted model and the split are local to this function.
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, random_state=rnd_state)

    # The `normalize` argument was removed from LinearRegression in
    # scikit-learn 1.2, so passing it raises TypeError there. Ordinary least
    # squares predictions do not depend on feature scaling, so the score should
    # be unchanged up to numerical precision without it.
    lm_model = LinearRegression() # Instantiate
    lm_model.fit(X_train, y_train) #Fit

    #Predict and score the model
    y_test_preds = lm_model.predict(X_test)
    print("The r-squared score for the model was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test)))

## Evaluate the Results

In [10]:
# Train on 70% of the rows and report r-squared on the remaining 30% (seed 42)
model_linear_regression(X, y, test_split=0.3, rnd_state=42)

The r-squared score for the model was 0.7755942281540831 on 119 values.


In [54]:
def coef_weights(coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model, one per column of X_train
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe with columns 'est_int' (variable name), 'coefs'
               (coefficient) and 'abs_coefs' (absolute coefficient), sorted by
               'abs_coefs' in descending order

    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the
    variable attached to the coefficient.
    '''
    # Use the coefficients that were passed in rather than a notebook-level
    # model object, so the function works for any fitted linear model.
    coefs_df = pd.DataFrame({
        'est_int': X_train.columns,
        'coefs': coefficients,
        'abs_coefs': np.abs(coefficients),
    })
    return coefs_df.sort_values('abs_coefs', ascending=False)

#Use the function
# model_linear_regression keeps its fitted model and training split local, so
# neither exists at notebook scope. Refit here with the same split settings
# (test_size=0.3, random_state=42) so this cell runs on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lm_model = LinearRegression().fit(X_train, y_train)
coef_df = coef_weights(lm_model.coef_, X_train)

#A quick look at the top results
coef_df.head(20)

Unnamed: 0,est_int,coefs,abs_coefs
14,G2,0.941667,0.941667
33,schoolsup_yes,0.838821,0.838821
28,reason_home,-0.700999,0.700999
38,higher_yes,0.585389,0.585389
36,activities_yes,-0.497462,0.497462
23,Mjob_teacher,0.417841,0.417841
29,reason_other,0.389315,0.389315
24,Fjob_health,0.373902,0.373902
27,Fjob_teacher,-0.35854,0.35854
6,famrel,0.357684,0.357684


## Further modelling and evaluation based on data understanding

In [11]:
#We see a very high correlation of G1 & G2 with the final result G3
#Let's rebuild the model with G1 & G2 removed
# Drop G1 and G2. errors='ignore' keeps this cell re-runnable: X is reassigned
# here, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
X = X.drop(columns=['G1','G2'], errors='ignore')
#Fit and evaluate the new model
model_linear_regression(X,y,0.3,42)

The r-squared score for the model was 0.1893419299072271 on 119 values.
