In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **student-mat dataset**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Apply seaborn's "whitegrid" style to the plots drawn from here on.
sn.set_style("whitegrid")

# Exploratory analysis

In [None]:
# Load the student-mat CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    "/kaggle/input/student-alcohol-consumption/student-mat.csv"
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Bar plot of the final grade G3 grouped by Pstatus.
sn.barplot(data=df, x="Pstatus", y="G3")
plt.show()


Students whose parents live apart (`Pstatus` = A) tend to score higher than those whose parents live together (`Pstatus` = T).

In [None]:
# Bar plot of the final grade G3 grouped by Medu.
sn.barplot(data=df, x="Medu", y="G3")
plt.show()

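Students whose mother's education level is 0 (`Medu` = 0, i.e. no formal education) show the highest average score in this plot. This is counter-intuitive and should be read with caution, since that group may contain very few students.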

In [None]:
# Bar plot of the final grade G3 grouped by Fedu.
sn.barplot(data=df, x="Fedu", y="G3")
plt.show()

In [None]:
# Bar plot of the final grade G3 grouped by Fjob.
sn.barplot(data=df, x="Fjob", y="G3")
plt.show()


Students whose father is a teacher (`Fjob` = teacher) tend to have better grades, possibly because of the academic knowledge available at home.

In [None]:
# Bar plot of the final grade G3 grouped by studytime.
sn.barplot(data=df, x="studytime", y="G3")
plt.show()


Students in study-time category 4 (the highest category of weekly study time, not literally 4 hours) tend to have a higher grade, presumably because more study time lets them understand the material better.

In [None]:
# Bar plot of the final grade G3 grouped by internet.
sn.barplot(data=df, x="internet", y="G3")
plt.show()

Students who have internet access tend to have a higher grade, possibly because they can look up answers to their questions.

In [None]:
# Box plot of G3 for each Fjob category.
# x and y are given by keyword as column names of `data`: recent seaborn
# releases no longer accept them positionally, and this form matches the
# box plot drawn later for the filtered frame.
sn.boxplot(x="Fjob", y="G3", data=df)
plt.show()

In very rare cases some students score 0, which is clearly an outlier.

In [None]:
# Distribution of the final grade G3: histogram with a KDE curve on top.
# `histplot` replaces the deprecated `distplot`; stat="density" keeps the
# density-normalised y axis that `distplot` used.
plt.subplots(1, 1, figsize=(12, 8))
sn.histplot(df.G3, kde=True, stat="density", color="b")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# The frame also holds string columns (e.g. Pstatus, Fjob, internet), and
# pandas >= 2.0 raises when asked to correlate them, so only numeric columns
# are passed to corr().
plt.subplots(1, 1, figsize=(12, 8))
sn.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="cool")
plt.show()

# **Feature engineering**

In [None]:
def not_outlires(lower, upper, frame=None):
    """Return the rows whose G3 value lies strictly between two bounds.

    Parameters
    ----------
    lower : number
        Exclusive lower bound for G3.
    upper : number
        Exclusive upper bound for G3.
    frame : pandas.DataFrame, optional
        Frame to filter; it must have a ``G3`` column. When omitted, the
        notebook-level ``df`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` with ``lower < G3 < upper``. The input frame
        is not modified.
    """
    if frame is None:
        frame = df

    # Both comparisons are strict, so rows equal to either bound are dropped.
    in_range = (frame["G3"] > lower) & (frame["G3"] < upper)
    return frame[in_range]

In [None]:
# Cut-offs for G3: its 10% quantile (lower) and its 99.99% quantile (upper).
lower, upper = df.G3.quantile(0.10), df.G3.quantile(0.9999)


We define two bounds on G3, a lower one (the 10% quantile) and an upper one (the 99.99% quantile), and keep only the rows strictly between them, so that outliers do not degrade the performance of our model.

In [None]:
# Keep only the rows whose G3 lies between the two quantile bounds.
df_not_out = not_outlires(lower=lower, upper=upper)

In [None]:
# Display the filtered frame.
df_not_out

In [None]:
# Distribution of G3 after filtering: histogram with a KDE curve on top.
# `histplot` replaces the deprecated `distplot`; kde=True and
# stat="density" reproduce what `distplot` drew by default.
plt.subplots(1, 1, figsize=(12, 8))
sn.histplot(df_not_out.G3, kde=True, stat="density", color="lightgreen")
plt.show()

In [None]:
# Box plot of G3 for each Fjob category, drawn from the filtered frame.
sn.boxplot(data=df_not_out, x="Fjob", y="G3")
plt.show()


After filtering, the number of outliers is reduced.

In [None]:
# List the column names available in the filtered frame.
df_not_out.columns

# **Preprocessing**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Select the predictor columns used by every model below.
data = df_not_out.loc[
    :,
    ['Pstatus', 'Medu', 'Fedu', 'Fjob', 'internet', 'G1', 'G2', 'studytime'],
]

In [None]:
# Expand the string-valued columns into indicator (dummy) columns.
data = pd.get_dummies(data=data)

Create dummy variables.

In [None]:
# Swap the numeric Medu column for five 0/1 indicator columns, medu_0..medu_4.
# Every level gets a column even if no row has it, so the feature count is fixed.
medu = data.pop("Medu")

for level in range(5):
    data[f"medu_{level}"] = (medu == level) * 1

In [None]:
# Swap the numeric Fedu column for five 0/1 indicator columns, fedu_0..fedu_4.
# Every level gets a column even if no row has it, so the feature count is fixed.
fedu = data.pop("Fedu")

for level in range(5):
    data[f"fedu_{level}"] = (fedu == level) * 1

In [None]:
# Swap the numeric studytime column for four 0/1 indicator columns,
# studytime_1..studytime_4 (one per category, whether or not it occurs).
studytime = data.pop("studytime")

for level in range(1, 5):
    data[f"studytime_{level}"] = (studytime == level) * 1

In [None]:
# Scaler used to standardise the feature matrix.
scala=StandardScaler()

# **Sklearn Linear Regression**

In [None]:
# Standardise the feature matrix and build the target (G3) as an (n, 1) column.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features - consider fitting on the
# training rows only.
data_scala = scala.fit_transform(data)
target = np.reshape(np.array(df_not_out["G3"]), (-1, 1))

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_scala,
    target,
    random_state=0,
    test_size=0.30,
)

In [None]:
# Shapes of the training features and training target.
(X_train.shape, Y_train.shape)

In [None]:
# Shapes of the test features and test target.
(X_test.shape, Y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with scikit-learn's default settings.
reg_lineal=LinearRegression()

In [None]:
# Fit the linear model on the training split.
reg_lineal.fit(X=X_train, y=Y_train)

In [None]:
# Score of the linear model on the test split (R^2 for sklearn regressors).
reg_lineal.score(X=X_test, y=Y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error of the linear model on the test split.
print(
    "MAE",
    mean_absolute_error(y_true=Y_test, y_pred=reg_lineal.predict(X_test)),
)

In [None]:
def main():
    """Plot the linear model's test-set predictions against the true values.

    Reads the notebook-level ``reg_lineal``, ``X_test`` and ``Y_test``.
    Each test row is drawn once at (true value, predicted value), and a
    dashed y = x line marks where a perfect prediction would fall. The
    earlier version drew the same points a second time with the axes
    swapped and labelled the two clouds "True values" / "Predict values",
    which did not match what was plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Predict once and flatten both arrays to 1-D for plotting.
    y_true = np.ravel(Y_test)
    y_pred = np.ravel(reg_lineal.predict(X_test))

    plt.figure(figsize=(18, 10))
    plt.title("True values vs predict values")

    plt.scatter(y_true, y_pred, marker="+", c="b", label="Test rows")

    # Reference diagonal spanning the joint range of true and predicted values.
    low = min(y_true.min(), y_pred.min())
    high = max(y_true.max(), y_pred.max())
    plt.plot([low, high], [low, high],
             c="c", linestyle="--", label="Perfect prediction (y = x)")

    plt.legend()

    plt.xlabel("True values")
    plt.ylabel("Predict values")
    plt.show()

In [None]:
# Draw the plot when this code runs as the main module.
if __name__ == "__main__":
    
    main()
    
    

In [None]:
# Linear-model predictions for the test split, flattened to a 1-D array.
reg_lineal.predict(X_test).flatten()

In [None]:
# Side-by-side table of true test targets and the linear model's predictions.
df_test = pd.DataFrame(
    {
        "True values": Y_test.flatten(),
        "Predict values": reg_lineal.predict(X_test).flatten(),
    }
)

In [None]:
# Show the first 40 rows of the comparison table.
df_test.head(40)

# **XGBR Regression**

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with the library's default hyper-parameters.
xgb=XGBRegressor()

In [None]:
# Fit the XGBoost regressor on the training split, then score it on the
# test split.
xgb.fit(X=X_train, y=Y_train)
xgb.score(X=X_test, y=Y_test)

In [None]:
# Mean absolute error of the XGBoost regressor on the test split.
mean_absolute_error(y_true=Y_test, y_pred=xgb.predict(X_test))

# **Keras**

In [None]:
import tensorflow.keras as kr

In [None]:
# Split for the Keras model: same features, but with a standardised target.
# The target gets its own scaler. Calling `scala.fit_transform(target)` would
# refit the feature scaler on the target and overwrite its fitted state.
# A separate scaler also keeps `target_scaler.inverse_transform` available
# for mapping predictions back to grade units.
target_scaler = StandardScaler()

xtrain, xtest, ytrain, ytest = train_test_split(
    data_scala,
    target_scaler.fit_transform(target),
    test_size=0.3,
    random_state=0,
)
xtrain.shape

In [None]:
def create_model(input_dim=25):
    """Build the (uncompiled) dense regression network.

    Architecture: Dense(54, relu, L1-regularised kernel) -> Dense(22, relu)
    -> Dense(22, relu) -> Dense(1, linear).

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 25, the value that was
        previously hard-coded; pass ``xtrain.shape[1]`` to follow the data.

    Returns
    -------
    keras Sequential model
        The model still has to be compiled before training.
    """
    return kr.Sequential([
        # Explicit input layer instead of the deprecated `input_dim` kwarg
        # on the first Dense layer.
        kr.Input(shape=(input_dim,)),

        # L1L2's first argument is the L1 factor; the L2 factor stays at its
        # default, so only an L1 penalty (0.001) is applied here.
        kr.layers.Dense(54,
                        activation="relu",
                        kernel_regularizer=kr.regularizers.L1L2(l1=0.001)),

        kr.layers.Dense(22, activation="relu"),

        kr.layers.Dense(22, activation="relu"),

        # Single linear unit for the regression output.
        kr.layers.Dense(1, activation="linear"),
    ])

In [None]:
# Instantiate the Keras network defined by create_model().
model=create_model()

In [None]:
# Stop training once validation MAE has not improved for 4 epochs, and
# restore the weights of the best epoch.
early_stop = kr.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=4,
    monitor="val_mae",
)

In [None]:
# Train with Adam on mean squared error; also track mean absolute error.
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"],
)

In [None]:
# Train for up to 30 epochs in batches of 10, with early stopping.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same rows later used for evaluation.
history = model.fit(
    xtrain,
    ytrain,
    epochs=30,
    batch_size=10,
    validation_data=(xtest, ytest),
    callbacks=[early_stop],
)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 on the test split, in order: Keras network, linear regression, XGBoost.
(
    r2_score(ytest, model.predict(xtest)),
    r2_score(Y_test, reg_lineal.predict(X_test)),
    r2_score(Y_test, xgb.predict(X_test)),
)

# Sklearn linear regression **wins**!!


The return of the queen...

The sklearn linear regression achieved a lower MAE and, at the same time, a better fit to the data, which is why it is the model chosen this time.
Deep learning and XGBoost gave way to linear regression: the return of the queen.