In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

# Allow up to 1000 rows to be shown when a DataFrame or Series is displayed.
pd.options.display.max_rows = 1000

In [None]:
# Read student-mat.csv from the Kaggle input directory and preview the first rows.
csv_path = '../input/student-alcohol-consumption/student-mat.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Total number of missing values across every column of the frame.
data.isna().sum().sum()

In [None]:

# Correlation heatmap of the numeric columns only. The frame still contains
# string columns at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0, so select the numeric columns explicitly.
numeric_corr = data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_corr, annot=True, cmap='viridis', ax=ax)
ax.set_title('Correlation between numeric features');

In [None]:
# Names of all object-dtype columns, kept in their original column order.
categorical_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']
categorical_cols

In [None]:
# Show the distinct values of each categorical column to spot labels shared between columns.
for col_name in categorical_cols:
    distinct_values = data[col_name].unique()
    print(f'column: {col_name}, values: {distinct_values}')

As you can see above, "other" appears in more than one column, so let's prefix the values of those columns to make them unique across all columns.

In [None]:
# Prefix every value with a letter identifying its column, so labels that occur
# in several columns (e.g. "other") become distinct.
# NOTE: this rewrites `data` in place, so re-running the cell prepends the prefix again.
column_prefixes = {'Mjob': 'm', 'Fjob': 'f', 'reason': 'r', 'guardian': 'g'}
for col_name, prefix in column_prefixes.items():
    data[col_name] = data[col_name].map(lambda value, prefix=prefix: prefix + value)

## Check for constant, quasi-constant and duplicate features

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Build an all-numeric copy of the data for the variance / duplicate checks.
# Only the string (object) columns are label-encoded: sending numeric columns
# (including the target G3) through LabelEncoder would replace their values with
# rank codes 0..n-1 and distort any scale whose values are not contiguous from 0.
data_new = data.copy()
le = LabelEncoder()

object_cols = [col for col in data_new.columns if data_new[col].dtypes == 'O']
for col in object_cols:
    # `le` is refitted on each column, so every column gets its own 0..n-1 coding.
    data_new[col] = le.fit_transform(data_new[col])

# Preview only the first rows instead of rendering the whole frame.
data_new.head()

In [None]:
# Report how many columns the encoded frame contains.
n_columns = len(data_new.columns)
print("Total columns in the dataset are ", n_columns)

In [None]:
# Fit a zero-threshold variance filter on the encoded frame; fit() returns the
# estimator itself, which is displayed as the cell output.
constant_filter = VarianceThreshold(threshold=0).fit(data_new)
constant_filter

In [None]:
# get_support() is True for the columns the filter keeps, so invert the mask
# to count the constant columns that would be dropped.
(~constant_filter.get_support()).sum()

No column has a constant value. Next, let's check for quasi-constant columns, i.e. columns whose variance falls below a 0.01 threshold.

## Quasi constant

In [None]:
# Quasi-constant check: flag columns whose variance falls below the 0.01 threshold.
quasi_constant = VarianceThreshold(threshold=0.01)
quasi_constant.fit(data_new)

# get_support() marks the retained columns; invert it to count the
# quasi-constant columns that would be removed.
(~quasi_constant.get_support()).sum()

There are no quasi-constant columns either.

## Duplicated features

In [None]:
# After transposing, each column is a row, so duplicated() flags columns that
# repeat an earlier column exactly; the sum is the number of such columns.
data_new.transpose().duplicated().sum()

No duplicated columns

### Convert data to train and test set

# Let's create two datasets, one with label encoding and the other with one-hot encoding, and run the models on each separately to compare the impact

In [None]:
# Dataset 1: label-encoded features.
X = data_new.drop('G3', axis=1)
# Take the target from the raw frame rather than from data_new, where columns
# may have been recoded by LabelEncoder. This keeps G3 on its original grade
# scale and on the same scale as the target used for dataset 2.
y = data['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Fit the scaler on the training split only, then reuse it for the test split.
sc = StandardScaler()
X_train_tx = sc.fit_transform(X_train)
X_test_tx = sc.transform(X_test)

n_features_1 = X_train_tx.shape[1]
# Tuple layout: (X_train, X_test, y_train, y_test, n_features, name)
dataset_1 = (X_train_tx, X_test_tx, y_train, y_test, n_features_1, 'dataset_1')

In [None]:
# Dataset 2: one-hot encoded features (first level of each category dropped).
data = pd.get_dummies(data, drop_first=True)
X_1 = data.drop('G3', axis=1)
y_1 = data['G3']

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=1)

# Use this dataset's own scaler. Previously `sc` was refitted here, which left
# `sc_1` unfitted and overwrote the statistics learned for dataset 1.
sc_1 = StandardScaler()
X_train_1_tx = sc_1.fit_transform(X_train_1)
X_test_1_tx = sc_1.transform(X_test_1)

n_features_2 = X_train_1_tx.shape[1]
# Tuple layout: (X_train, X_test, y_train, y_test, n_features, name)
dataset_2 = (X_train_1_tx, X_test_1_tx, y_train_1, y_test_1, n_features_2, 'dataset_2')


# Feature engineering

## SelectKBest with f_regression to select the best features

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso

In [None]:
# Parallel result lists: one entry per evaluated (model, k) combination.
model_n, n_features, train_score = [], [], []

In [None]:
def run_model(model, dataset, model_name):
    """Cross-validate ``model`` behind SelectKBest for every feature count k.

    For each k from 1 up to and including the dataset's feature count, a
    pipeline of ``SelectKBest(f_regression, k)`` followed by ``model`` is
    scored with 5-fold cross-validation on the training split. The RMSE
    (square root of the mean fold MSE) is appended to the module-level
    ``train_score`` list, and the matching k and ``model_name`` are appended
    to ``n_features`` and ``model_n``.

    Parameters
    ----------
    model : estimator
        Regressor used as the final pipeline step.
    dataset : tuple
        Laid out as (X_train, X_test, y_train, y_test, n_features, name);
        only X_train (index 0), y_train (index 2) and n_features (index 4)
        are read here.
    model_name : str
        Label recorded alongside every score.

    Returns
    -------
    None
        Results are recorded in the module-level lists.
    """
    X_train, y_train, total_features = dataset[0], dataset[2], dataset[4]

    # range() excludes its stop value, so add 1 to also evaluate the model
    # with all features (k == total_features).
    for k in range(1, total_features + 1):
        selector = SelectKBest(score_func=f_regression, k=k)
        pipeline = Pipeline(steps=[('select_features', selector),
                                   ('regressor', model)])

        # The scorer returns negated MSE per fold; flip the sign before the root.
        neg_mse_folds = cross_val_score(pipeline, X_train, y_train,
                                        scoring='neg_mean_squared_error', cv=5)
        rmse = np.sqrt(-np.mean(neg_mse_folds))

        train_score.append(rmse)
        n_features.append(k)
        model_n.append(model_name)
  


## Running on dataset_1 i.e with label encoding

## Linear Regression

In [None]:
# Score LinearRegression on dataset_1 for every candidate number of features.
run_model(model=LinearRegression(), dataset=dataset_1, model_name="Linear_Regression")

In [None]:
# Shapes of the first four entries of dataset_1 (X_train, X_test, y_train, y_test).
tuple(split.shape for split in dataset_1[:4])

In [None]:
# Gather every recorded run into one table and show the row with the lowest RMSE.
result_columns = ['Model Name', 'No. of features', 'Training RMSE score']
result_rows = zip(model_n, n_features, train_score)
output = pd.DataFrame(result_rows, columns=result_columns)
output.sort_values(by='Training RMSE score').head(1)

## Lasso

In [None]:
# Score Lasso (iteration cap raised to 3000) on dataset_1 for every candidate number of features.
run_model(model=Lasso(max_iter=3000), dataset=dataset_1, model_name="Lasso")

In [None]:
# Rebuild the results table from all runs recorded so far, ordered from lowest to highest RMSE.
score_column = 'Training RMSE score'
recorded_runs = zip(model_n, n_features, train_score)
output = pd.DataFrame(recorded_runs, columns=['Model Name', 'No. of features', score_column])
output.sort_values(by=score_column)