In [None]:
import pandas as pd

# Load both course files and tag every row with the course it came from,
# so the origin is still known once the two frames are stacked.
d1 = pd.read_csv('../input/student-alcohol-consumption/student-mat.csv').assign(label='math')
d2 = pd.read_csv('../input/student-alcohol-consumption/student-por.csv').assign(label='port')

# Math rows come first, Portuguese rows second, under a fresh 0..n-1 index.
result = pd.concat([d1, d2], sort=False, ignore_index=True)
len(result)

<h3> Let's have a brief look at the duplicates from the two datasets - these are the students we know most about, since for them we have results from both the Math and Portuguese classes </h3>

In [None]:
# Columns that must match for two rows to be treated as the same student.
# Columns not listed here (for example G1/G2/G3 and label) are carried over
# from both sides of the merge with the _x / _y suffixes.
STUDENT_KEY_COLUMNS = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime',
    'schoolsup', 'famsup', 'activities', 'nursery',
    'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
    'Walc', 'health',
]

# keep='first' (the default) flags every occurrence of a key except the first;
# keep='last' flags every occurrence except the last.
duplicatesfirst = result[result.duplicated(subset=STUDENT_KEY_COLUMNS)]
duplicateslast = result[result.duplicated(subset=STUDENT_KEY_COLUMNS, keep='last')]

# Pair every "later" occurrence (_x) with every "earlier" occurrence (_y).
duplicates = pd.merge(duplicatesfirst, duplicateslast, on=STUDENT_KEY_COLUMNS)

# Keep only pairs whose _x side is the Portuguese row and whose _y side is the
# math row. Requiring merely label_x != label_y would also let through the
# reversed (math, port) orientation when a key occurs more than once in both
# files, and the grade columns built from _x / _y would then be mislabelled.
is_port_math_pair = (duplicates['label_x'] == 'port') & (duplicates['label_y'] == 'math')
duplicates = duplicates.drop(index=duplicates.index[~is_port_math_pair])

In [None]:
# Rounded mean of the three period grades per matched student:
# the _x grade columns feed the Portuguese average, the _y columns the math one.
duplicates['averagegradeport'] = (
    duplicates['G1_x'].add(duplicates['G2_x']).add(duplicates['G3_x']).div(3).round()
)
duplicates['averagegrademath'] = (
    duplicates['G1_y'].add(duplicates['G2_y']).add(duplicates['G3_y']).div(3).round()
)

In [None]:
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ages shown on the x axis and the width of each bar.
ag = np.arange(15, 23, 1)
width = 0.5

# Mean of the per-student course averages for every age. Ages without any
# matched student come out as NaN after the reindex.
age_means = (
    duplicates.groupby('age')[['averagegradeport', 'averagegrademath']]
    .mean()
    .reindex(ag)
)
portav = age_means['averagegradeport'].tolist()
mathav = age_means['averagegrademath'].tolist()

fig, ax = plt.subplots(figsize=(15, 12))
r1 = ax.bar(ag, portav, width, label='Port')
r2 = ax.bar(ag, mathav, width, bottom=portav, label='Math')


def _annotate_stack_value(axes, rect, value, y, offset=3, **text_kwargs):
    """Write *value* (rounded to 2 decimals) centred over *rect* at height *y*.

    *offset* is the vertical shift of the text in points; extra keyword
    arguments are forwarded to ``Axes.annotate``.
    """
    axes.annotate('{}'.format(round(value, 2)),
                  xy=(rect.get_x() + rect.get_width() / 2, y),
                  xytext=(0, offset),
                  textcoords="offset points",
                  ha='center', va='bottom', **text_kwargs)


# heights collects the stacked total (Portuguese + math) for every age.
heights = []
for port_bar, math_bar in zip(r1, r2):
    height = port_bar.get_height()
    height1 = math_bar.get_height()
    total = height + height1

    # Portuguese value on top of the lower segment.
    _annotate_stack_value(ax, port_bar, height, y=height)
    # Math value on top of the stack, 3 points above it ...
    _annotate_stack_value(ax, math_bar, height1, y=total)
    # ... and the stack total in a larger font, 12 points above it.
    _annotate_stack_value(ax, math_bar, total, y=total, offset=12, fontsize=18)
    heights.append(total)

plt.ylabel('Average grades', fontsize=20)
plt.xlabel('Age', fontsize=20)

plt.title('Average grades for Maths and Portuguese by age', fontsize=20)
plt.legend()

plt.show()


On average students get higher grades for Portuguese than Math. Interestingly, at the age of 19, all results go down, while at 20 students get higher results. Such results may be affected by the distribution of age in this dataset. 


In [None]:
# Mean of the two course averages. The parentheses must enclose the sum:
# `port + math / 2` would halve only the math grade because division binds
# tighter than addition.
duplicates['Average Grade for Math and Portuguese'] = (
    duplicates['averagegradeport'] + duplicates['averagegrademath']
) / 2
# Sum of the Walc and Dalc columns.
duplicates['Total Alcohol Consumption'] = duplicates['Walc'] + duplicates['Dalc']


In [None]:

# One line per sex: total alcohol consumption (y) against the combined grade (x).
grade_vs_alcohol = dict(
    x='Average Grade for Math and Portuguese',
    y='Total Alcohol Consumption',
    hue="sex",
)
facet_layout = dict(height=5, aspect=2)
sns.relplot(data=duplicates, kind="line", palette='husl', **grade_vs_alcohol, **facet_layout)

#ax.set_ylabel('Average grades', fontsize=12)
#ax.set_xlabel('Total alcohol consumption', fontsize=12)
#ax.set_title("Correlation between alcohol consumption and average grades in males and females")


Overall girls drink less alcohol, and their grades don't go lower than 10, while there seems to be a correlation between low grades among boys and drinking.

<h3> Now we switch to the set containing unique student data </h3>

In [None]:
# Columns used to decide that two rows describe the same student.
# NOTE(review): this key is narrower than the one used to build `duplicates`
# earlier in the notebook - confirm that this is intended.
UNIQUE_STUDENT_KEY = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'guardian', 'traveltime', 'Dalc',
    'Walc', 'health',
]

# .copy() makes db an independent frame, so the column assignments in the
# following cells modify db itself and do not raise SettingWithCopyWarning.
db = result.drop_duplicates(subset=UNIQUE_STUDENT_KEY).copy()
db.describe()

In [None]:
# Derived per-student features: summed parental education, summed alcohol
# consumption, and the rounded mean of the three period grades.
db['Parents Education'] = db['Medu'].add(db['Fedu'])
db['Total Alcohol Consumption'] = db['Walc'].add(db['Dalc'])
grade_total = db['G1'].add(db['G2']).add(db['G3'])
db['Average Grade'] = grade_total.div(3).round()

Columns with binary options are converted to numeric values by substituting "yes" with 0 and "no" with 1. The sex column is encoded the same way: "F" becomes 1 and "M" becomes 0.

In [None]:
# Recode the yes/no columns one after another (note: "yes" -> 0, "no" -> 1),
# then encode sex as F -> 1, M -> 0.
yes_no_codes = {'yes': 0, 'no': 1}
for binary_column in ('schoolsup', 'paid', 'higher', 'famsup', 'internet'):
    db = db.replace({binary_column: yes_no_codes})
db = db.replace({'sex': {'F': 1, 'M': 0}})


In [None]:


plt.figure(figsize=(20, 18))
# db still contains text columns; restrict the correlation matrix to numeric
# columns explicitly. Recent pandas versions no longer drop non-numeric
# columns silently in DataFrame.corr() and raise instead.
numeric_corr = db.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f", cbar=True)
plt.xticks(rotation=90, fontsize=20)
plt.yticks(rotation=0, fontsize=20)



<p>The visualization of correlations between features helps to spot which parameters correlate with alcohol consumption to understand which factors may affect it. It's also interesting which parameters affect Average Grade.</p>

In [None]:
# Draw a histogram for each numeric column of db on a single 20x20 figure.
db.hist(figsize=(20, 20))

<h4>Also, it's good to know how different features are distributed, as all the other patterns and correlations are affected by the shapes of distributions.</h4>

<p>Let's select features with higher correlations with total alcohol consumption and compare their average values for male and female students.</p>

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Features whose per-gender averages are compared; the same list labels the x axis.
labels = ['studytime','famsup','freetime','Total Alcohol Consumption','Average Grade','Parents Education']

# sex was recoded earlier in the notebook: 1 = female, 0 = male.
female_means = db.loc[db['sex'] == 1, labels].mean()
male_means = db.loc[db['sex'] == 0, labels].mean()

x = np.arange(len(labels))  # one slot per feature
width = 0.35                # bar width; each pair sits half a width either side of the slot
half_width = width / 2

fig, ax = plt.subplots(figsize=(8, 6))
r1 = ax.bar(x - half_width, male_means.values, width, label='male')
r2 = ax.bar(x + half_width, female_means.values, width, label='female')

# Title, axis label and feature names as tick labels.
ax.set_title('Average values of parameters by gender', fontsize=14)
ax.set_ylabel('Average Score', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=90, fontsize=12)
ax.legend()

def autolabel(rects, axes=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Parameters
    ----------
    rects : iterable
        Bar patches (anything exposing ``get_height``, ``get_x`` and
        ``get_width``), e.g. the container returned by ``ax.bar``.
    axes : optional
        Axes to annotate. When omitted, the notebook-level ``ax`` is used,
        which is what this function always did before the parameter existed.
    """
    # Resolve the target lazily so an explicit `axes` never touches the global.
    target = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        target.annotate('{}'.format(round(height, 2)),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')
        
# Label the male bars first, then the female bars, and render the figure.
for bar_group in (r1, r2):
    autolabel(bar_group)

#fig.tight_layout()

plt.show()


Girls spend more time studying, while boys have more free time, drink more alcohol, and still have better health. Boys also have slightly higher average grades, but this may be affected by the fact that there are fewer boys than girls: we are analyzing the results of 394 girls and 277 boys, i.e. about 1.4 times fewer boys.

In [None]:
import matplotlib.pyplot as plt


# Age bands shown on the x axis, mapped to the ages each band contains.
age_bands = {
    '15-16': [15, 16],
    '17-18': [17, 18],
    '19-20': [19, 20],
    '21-22': [21, 22],
}
labels = list(age_bands)


def _mean_alcohol(frame, ages, sex_code):
    """Mean 'Total Alcohol Consumption' of the rows of *frame* whose age is in
    *ages* and whose sex code equals *sex_code* (NaN when no row matches)."""
    selected = frame[frame['age'].isin(ages) & (frame['sex'] == sex_code)]
    return selected['Total Alcohol Consumption'].mean()


# sex was recoded earlier in the notebook: 0 = male, 1 = female.
men_means = [_mean_alcohol(db, ages, 0) for ages in age_bands.values()]
women_means = [_mean_alcohol(db, ages, 1) for ages in age_bands.values()]

width = 0.35       # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots(figsize=(10, 7))

r1 = ax.bar(labels, men_means, width, label='Male')
r2 = ax.bar(labels, women_means, width, bottom=men_means, label='Female')

for male_bar, female_bar in zip(r1, r2):
    male_height = male_bar.get_height()
    female_height = female_bar.get_height()
    total = male_height + female_height
    # Both bars of a stack share the same x position and width.
    x_mid = female_bar.get_x() + female_bar.get_width() / 2

    # (value to print, height to anchor at, vertical offset in points, extra text options)
    annotations = (
        (male_height, male_height, 3, {}),        # male value on the lower segment
        (female_height, total, 3, {}),            # female value on top of the stack
        (total, total, 12, {'fontsize': 14}),     # stack total, larger and higher
    )
    for value, anchor_y, offset, extra in annotations:
        ax.annotate('{}'.format(round(value, 2)),
                    xy=(x_mid, anchor_y),
                    xytext=(0, offset),
                    textcoords="offset points",
                    ha='center', va='bottom', **extra)

ax.set_ylabel('Alcohol consumption', fontsize=14)
ax.set_title('Average weekly alcohol consumption by age and gender', fontsize=18)
ax.legend()
fig.tight_layout()
plt.show()

As was noticed before, on average girls drink less alcohol than boys at different ages. However, while the levels are comparable for the three first age groups, boys significantly increase alcohol consumption at the age of 21-22, which can be affected by the fact that it becomes legal to buy it. Let's now have a look at the age distribution in the data set.

In [None]:

labels = [15, 16, 17, 18, 19, 20, 21, 22]

# Number of students of each age, in the same order as `labels`.
data = [int((db['age'] == age).sum()) for age in labels]
print(data)

length = db['age'].count()
print(length)

# Convert the list to an array before dividing: `list / scalar` is not defined
# by Python itself and only works when the scalar is a NumPy type that takes
# over the operation.
perc = np.asarray(data) / length * 100

print(perc)
fig1, ax1 = plt.subplots(figsize=(8, 6))
ax1.pie(perc, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
ax1.set_title('Age distribution among students', fontsize=14)

plt.show()

Now we see that the age group 21-22 is tiny, and even the group of 19-20 is several times smaller than two other groups, so the mean results that we got should be viewed differently.

Let's have a look at parameters with high correlations with average grades.

In [None]:
# Halve the summed parental education and round it to a whole level.
db['Average Parents Education'] = db["Parents Education"].div(2).round()

# One bar chart of the mean 'Average Grade' per value of each listed feature.
features = ['studytime', 'higher', 'age', 'internet']
for column in features:
    plt.figure(figsize=(8, 6))
    sns.set_style("whitegrid")
    sns.barplot(x=column, y="Average Grade", data=db)
    sns.despine()


From the graphs, we may conclude that internet access and intention of getting higher education result in higher average grades, and the amount of study time positively correlates with students' results. From the age distribution graph, we remember that 15, 16, 17, and 18 are the main age groups. Here we see that average grade decreases between these main groups, which probably can be explained by the fact that the school program gets harder every year.

In [None]:

# Boxen plot of the average grade for every averaged parental-education level.
boxen_axes = {"x": "Average Parents Education", "y": "Average Grade"}
sns.catplot(kind="boxen", data=db, **boxen_axes)

plt.show()

The graph above shows that levels of education of parents affect the grades that their children get. 

In [None]:
# Average grade against total alcohol consumption: one panel per Pstatus
# value, one line per sex.
line_mapping = dict(x="Total Alcohol Consumption", y="Average Grade", hue="sex", col="Pstatus")
panel_layout = dict(height=5, aspect=1)
sns.relplot(data=db, kind="line", palette='husl', **line_mapping, **panel_layout)

sns.despine()

Here we study the role of Parental status on the grades and alcohol consumption of students. Interestingly, there is less alcohol consumption in separated families. While boys get higher grades living with a single parent, girls seem to be affected more negatively by such conditions, which may be related to the fact that children stay with their mothers more often.

In [None]:
# Total-consumption levels on the x axis and the features averaged at each level.
ag = [2, 3, 4, 5, 6, 7, 8, 9, 10]
features = ['studytime', 'goout', 'health']

# Mean of every feature per consumption level, computed in one pass.
# Levels without any student become NaN after the reindex.
feature_means = (
    db.groupby('Total Alcohol Consumption')[features]
    .mean()
    .reindex(ag)
)

plt.figure(figsize=(8, 6))
plt.title('Features correlating with total alcohol consumption', fontsize=14)
for xs in features:
    plt.plot(ag, feature_means[xs], label=xs)

# Legend, labels and tick styling apply to the whole figure, so they are set
# once after all lines are drawn instead of on every loop pass.
plt.legend()
plt.ylabel('Averaged features', fontsize=12)
plt.xlabel('Total alcohol consumption', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

plt.show()

Tendencies that we notice between study-time and going out and alcohol consumption make sense: studying draws away from drinking while going out often implies it. Interestingly, the correlation between health and alcohol consumption is positive up to a certain point, but at higher levels of alcohol consumption, health levels decrease, probably affected by the drinking.

In [None]:
import matplotlib as mpl
# Stacked histogram of age, split by total alcohol consumption level.
fig, ax = plt.subplots(figsize=(10, 12))
sns.despine(fig)

stack_look = dict(multiple="stack", palette="rainbow", edgecolor=".1", linewidth=.5)
sns.histplot(
    db,
    x="age",
    hue="Total Alcohol Consumption",
    bins=10,
    discrete=(True, True),
    **stack_look
)

# One tick per age, printed as a plain number.
ax.set_xticks(list(range(15, 23)))
ax.xaxis.set_major_formatter(mpl.ticker.ScalarFormatter())
ax.set_title('Total Alcohol Consumption by Age')

This graph illustrates how alcohol consumption is distributed in relationship to age.

In [None]:

# Bar chart of the average grade for every total-consumption level.
bar_look = dict(palette="cubehelix", alpha=1, height=7, aspect=1.2)
sns.catplot(
    x="Total Alcohol Consumption",
    y="Average Grade",
    kind="bar",
    data=db,
    **bar_look
)



In [None]:
# 2-D histogram (with colour bar) of average grade against total alcohol
# consumption; both axes are binned as discrete values.
fig, ax = plt.subplots(figsize=(20, 10))

sns.histplot(
    data=db,
    x="Average Grade",
    y="Total Alcohol Consumption",
    discrete=(True, True),
    cbar=True,
    cbar_kws={},
)

In the two previous graphs, we looked at the relationship between average grades and total alcohol consumption. From the first graph, we may notice that grades tend to decrease with higher alcohol consumption. The second graph illustrates that the consumption of alcohol forms a roughly normal distribution in relation to grades.

In [None]:

# Ages shown on the x axis (22 is left out, as in the per-age selection this replaces).
ages = [15, 16, 17, 18, 19, 20, 21]

# Mean total alcohol consumption for every (age, famsup) combination, laid out
# as one row per age and one column per famsup code. Combinations without any
# student become NaN after the reindex.
alcohol_by_age_and_support = (
    db[db['age'].isin(ages)]
    .groupby(['age', 'famsup'])['Total Alcohol Consumption']
    .mean()
    .unstack('famsup')
    .reindex(index=ages, columns=[0, 1])
)

# famsup was recoded earlier in the notebook: 0 = "yes", 1 = "no".
totalalcbyfamilysupport = {
    'famsup:yes': alcohol_by_age_and_support[0].tolist(),
    'famsup:no': alcohol_by_age_and_support[1].tolist(),
}

fig, ax = plt.subplots(figsize=(10, 7))

ax.set_title('Alcohol consumption depending on family support')
ax.set_xlabel('age')
ax.set_ylabel('Average alcohol consumption weekly')
pal = ["#2ecc71", "#e74c3c"]
# The two series are stacked on top of each other, in dictionary order.
plt.stackplot(ages, list(totalalcbyfamilysupport.values()),
              labels=list(totalalcbyfamilysupport.keys()), colors=pal, alpha=0.8)
plt.legend(loc='upper left')

plt.show()

Finally, we see that students who have family support on average consume less alcohol at any age.