In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two student CSV files from the student-alcohol-consumption input folder.
df_mat = pd.read_csv('../input/student-alcohol-consumption/student-mat.csv')
df_por = pd.read_csv('../input/student-alcohol-consumption/student-por.csv')

# Report the (rows, columns) shape of each table, one per line.
print(df_mat.shape, df_por.shape, sep='\n')

In [None]:
# Stack the two tables row-wise into one frame.
# Both inputs are read without an index column, so each carries its own
# 0-based row labels. ignore_index=True renumbers the combined rows 0..n-1;
# without it the index contains duplicate labels, which makes label-based
# lookups ambiguous and can break index alignment in later operations.
df = pd.concat([df_mat, df_por], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the combined table.
df.head()

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics using describe()'s defaults.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# **DATA VISUALISATION**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
%matplotlib inline

import plotly.express as px

In [None]:
# Initialise plotly's notebook mode (function imported from plotly.offline).
init_notebook_mode(connected=True)

In [None]:
# Put cufflinks (imported as `cf`) into offline mode.
cf.go_offline()

In [None]:
# Annotated correlation heatmap of the numeric columns.
# At this point `df` still contains text columns (school, sex, ... are only
# one-hot encoded later in the notebook). pandas >= 2.0 no longer drops
# non-numeric columns silently inside DataFrame.corr() and raises instead, so
# the numeric columns are selected explicitly. This form works on both older
# and newer pandas releases.
plt.figure(figsize=(12,10))
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

In [None]:
# Strip plot of `absences` for each `famsup` category, drawn on a 10x8 figure.
strip_ax = plt.figure(figsize=(10, 8)).gca()
sns.stripplot(data=df, x="famsup", y="absences", ax=strip_ax)

In [None]:
# Bubble chart of G1 against absences: marker size encodes Dalc, colour
# encodes health, and the hover label shows health.
# NOTE(review): log_x=True cannot place rows with G1 == 0 on the axis;
# confirm whether such rows exist and whether a log scale is really wanted.
fig = px.scatter(
    df,
    x="G1",
    y="absences",
    size="Dalc",
    color="health",
    hover_name="health",
    title="G1 vs absences",  # title previously misspelled as "abcences"
    log_x=True,
    size_max=60,
)
fig.show()

In [None]:
# Bar chart over `school`, coloured by `sex`.
# NOTE(review): color_continuous_scale targets numeric colour columns and
# presumably has no effect with `sex` -- confirm before relying on it.
fig = px.bar(
    df,
    x="school",
    color='sex',
    title="sex ratio in schools",
    color_continuous_scale='Inferno',
)
fig.show()

In [None]:
# Grouped bar chart over `famsize`, coloured by `address`.
# NOTE(review): color_continuous_scale targets numeric colour columns and
# presumably has no effect with `address` -- confirm before relying on it.
fig = px.bar(
    df,
    x="famsize",
    color='address',
    barmode='group',
    title="famsize in rural and urban areas",
    color_continuous_scale='Inferno',
)
fig.show()

In [None]:
# Grouped bar chart over `Pstatus`, coloured by `famsize`, with one facet
# column per `guardian` value.
fig = px.bar(
    df,
    x="Pstatus",
    color="famsize",
    barmode="group",
    facet_col="guardian",
    title='family relations',
)
fig.show()

In [None]:

# Pie chart of total `absences` split by `goout` level.
# The previous call carried a color_discrete_map keyed by 'Thur'/'Fri'/'Sat'/
# 'Sun'. `goout` is used as a plain numeric feature elsewhere in this
# notebook, so those weekday keys can never match a slice; the dead mapping is
# removed. The garbled title (" % of absent absent") is reworded as well.
fig = px.pie(
    df,
    values='absences',
    names='goout',
    title='Share of absences by goout level',
)
fig.show()

In [None]:

# Pie chart of the share of rows for each `reason` value.
# The previous call used values='age', which sizes every slice by the SUM of
# ages rather than by how many students gave that reason. Omitting `values`
# lets the pie count one per row instead.
# NOTE(review): confirm that a per-student count is the intended measure.
fig = px.pie(
    df,
    names='reason',
    title='reason for attending school',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

# 

In [None]:
# Point plot of `studytime` against `traveltime`, one dodged series per
# `activities` value, drawn on a 10x8 figure.
point_ax = plt.figure(figsize=(10, 8)).gca()
sns.pointplot(
    data=df,
    x="traveltime",
    y="studytime",
    hue="activities",
    dodge=True,
    ax=point_ax,
)

In [None]:
# Sunburst with `studytime` as the inner ring and `higher` as the outer ring;
# sector sizes come from the `freetime` column.
ring_levels = ['studytime', 'higher']
fig = px.sunburst(df, path=ring_levels, values='freetime')
fig.show()

In [None]:
# Scatter of `Dalc` against `goout`: colour encodes `romantic`, marker size
# encodes `freetime`, and `romantic` is also listed in the hover data.
fig = px.scatter(
    df,
    x="goout",
    y="Dalc",
    color="romantic",
    size='freetime',
    hover_data=['romantic'],
)
fig.show()


In [None]:
# Line chart of `G1` against `Walc`, one line per `failures` value.
# NOTE(review): rows are passed unaggregated, so each line presumably joins
# individual rows in frame order -- consider plotting a per-Walc summary.
fig = px.line(
    df,
    x="Walc",
    y="G1",
    color='failures',
)
fig.show()

In [None]:
# Histogram of `freetime`, coloured by `failures`.
fig = px.histogram(
    df,
    x="freetime",
    color='failures',
)
fig.show()

In [None]:

# 2-D density heatmap of `studytime` against `G2`, with a marginal histogram
# on each axis.
fig = px.density_heatmap(
    df,
    x="G2",
    y="studytime",
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.show()

In [None]:
# Row counts per `Fedu` value, split by `Fjob`, drawn on a 10x8 figure.
fedu_ax = plt.figure(figsize=(10, 8)).gca()
sns.countplot(data=df, x='Fedu', hue='Fjob', ax=fedu_ax)

In [None]:
# Row counts per `Medu` value, split by `Mjob`, drawn on a 10x8 figure.
medu_ax = plt.figure(figsize=(10, 8)).gca()
sns.countplot(data=df, x='Medu', hue='Mjob', ax=medu_ax)

# **DATA PREPROCESSING AND FEATURE ENGINEERING**

In [None]:
# List the column names before the categorical columns are one-hot encoded.
df.columns

In [None]:
# Columns to replace with one-hot indicator columns.
categorical_cols = [
    'school', 'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic',
]

# drop_first=True omits the indicator for the first level of each column.
# Note that `df` is rebound to the encoded frame, so re-running this cell
# without reloading the data will not find the original columns.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
# Correlation of every column with G3, sorted ascending.
df.corr()['G3'].sort_values()

In [None]:
# How often each G3 value occurs.
df.G3.value_counts()

In [None]:
# Column names after encoding (used to build the feature list below).
df.columns

In [None]:
# Feature columns fed to the model; the target G3 is deliberately not listed.
feature_cols = [
    # columns taken from the frame as-is
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
    'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2',
    # one-hot indicator columns produced by pd.get_dummies
    'school_MS', 'sex_M', 'address_U', 'famsize_LE3', 'Pstatus_T',
    'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher',
    'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher',
    'reason_home', 'reason_other', 'reason_reputation',
    'guardian_mother', 'guardian_other',
    'schoolsup_yes', 'famsup_yes', 'paid_yes',
    'activities_yes', 'nursery_yes', 'higher_yes', 'internet_yes',
    'romantic_yes',
]
X = df[feature_cols]

In [None]:
# Regression target: the G3 column.
y = df['G3']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator created with its default arguments.
lr = LinearRegression()

In [None]:
# Fit on the training split only.
lr.fit(X_train,y_train)

In [None]:
# Fitted intercept term.
print(lr.intercept_)

In [None]:
# Fitted coefficients, in the same order as the columns of X.
lr.coef_

In [None]:
# Table with one row per feature column, labelled by column name, holding the
# fitted coefficient in a single 'Coef' column.
cdf = pd.DataFrame(data=lr.coef_, index=X.columns, columns=['Coef'])

In [None]:
# Display the coefficient table.
cdf

In [None]:
# Predict G3 for the held-out test rows.
predictions =  lr.predict(X_test)

In [None]:
# Predicted vs. actual G3 on the test split; the red line plots y_test against
# itself, i.e. the y = x reference where a perfect prediction would fall.
plt.scatter(x=y_test, y=predictions)
plt.plot(y_test, y_test, '-r')

In [None]:
# Histogram (with KDE overlay) of the residuals: actual minus predicted G3.
residuals = y_test - predictions
resid_ax = plt.figure(figsize=(10, 8)).gca()
sns.histplot(residuals, kde=True, ax=resid_ax)

In [None]:
from sklearn import metrics

In [None]:
# Mean absolute error on the test split.
metrics.mean_absolute_error(y_test,predictions)

In [None]:
# Mean squared error on the test split.
metrics.mean_squared_error(y_test,predictions)

In [None]:
# Root mean squared error: square root of the MSE above.
np.sqrt(metrics.mean_squared_error(y_test,predictions))