# Predicting students' math performance (Linear Regression)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the student math dataset into the working DataFrame.
DATA_PATH = '../input/math-students/student-mat.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Columns that are one-hot encoded later with pd.get_dummies.
categorical_columns = [
    'Pstatus', 'paid', 'school', 'sex', 'address', 'famsize',
    'Mjob', 'Fjob', 'reason', 'guardian',
    'schoolsup', 'famsup', 'activities', 'nursery',
    'higher', 'internet', 'romantic',
]
# Keep them in their own frame so they can be encoded separately.
Categ = df[categorical_columns]

In [None]:
# Visual check for missing data: each cell of the mask is True where df is null.
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='copper')

In [None]:
# Annotated correlation heatmap of the numeric columns.
# At this point df still holds the categorical columns (they are dropped only
# later), and DataFrame.corr() in pandas >= 2.0 raises on non-numeric columns
# instead of silently skipping them, so select the numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(20,12))
sns.heatmap(numeric_corr, cmap='copper', annot=True)

In [None]:
# One box plot per column, arranged on a 4x4 grid, to eyeball spread and outliers.
df.plot.box(
    subplots=True,
    layout=(4, 4),
    figsize=(20, 12),
    cmap='copper_r',
)

In [None]:
# One-hot encode the categorical frame, then discard the listed indicator
# columns (apparently the complementary level of each two-valued feature --
# TODO confirm against the data) so each such feature keeps a single column.
redundant_indicators = [
    'school_MS', 'sex_M', 'address_R', 'famsize_LE3',
    'schoolsup_no', 'famsup_no', 'activities_no', 'nursery_no',
    'higher_no', 'internet_no', 'romantic_no', 'paid_no', 'Pstatus_A',
]
dummies = pd.get_dummies(Categ).drop(columns=redundant_indicators)

In [None]:
# Remove the original categorical columns from df.
# Pass the column labels explicitly: handing the whole Categ frame to drop()
# only works because iterating a DataFrame happens to yield its column names.
# Rebinding (instead of inplace=True) keeps the statement a plain expression.
df = df.drop(columns=Categ.columns)

In [None]:
# Attach the one-hot encoded columns to df, matching rows by index.
df = df.merge(dummies, left_index=True, right_index=True)

In [None]:
# Target is the G3 column; every other column is used as a feature.
y = df['G3']
X = df.drop(columns='G3')

# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=100,
)

In [None]:
# Fit an ordinary linear regression on the training split (fit returns the
# estimator itself), then predict G3 for the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
pred = lr.predict(X_test)

In [None]:
# Evaluate the predictions against the held-out targets.
r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)  # computed once and reused for RMSE

print('r2_score = ', r2)
print('MAE = ', mae)
print('MSE = ', mse)
print('RMSE = ', np.sqrt(mse))

# Derive the verdict from the measured score instead of printing a fixed
# claim, so the message cannot contradict the numbers above.
R2_THRESHOLD = 0.6
if r2 > R2_THRESHOLD:
    print('An r2 score >60% indicates that our prediction model fits our data appropriately')
else:
    print('An r2 score <=60% indicates that our prediction model does not fit our data well')

In [None]:
# Single-column frame of the fitted regression coefficients, indexed by feature name.
Coeff = pd.Series(lr.coef_, index=X.columns, name='Coeff').to_frame()

In [None]:
# Order the features from the largest coefficient to the smallest.
Coeff = Coeff.sort_values('Coeff', ascending=False)

In [None]:
# Side-by-side table of the true and predicted G3 values for the test rows.
compare = pd.DataFrame({'Actual': y_test, 'Predicted': pred})
# reset_index returns a new frame, so the result has to be assigned back;
# chaining an in-place drop onto it only modified a temporary and left
# `compare` unchanged. drop=True discards the old index instead of adding
# it as a column.
compare = compare.reset_index(drop=True)

In [None]:
# Figure 1: overlaid density estimates of the actual and predicted scores.
plt.figure(figsize=(20,12))
plt.xlim(0,20)
plt.xlabel('Comparison between Real and Predicted values ', fontsize=20)
# `fill=True` is the current spelling of the deprecated `shade=True`.
a=sns.kdeplot(compare['Actual'],fill=True,color='brown')
b=sns.kdeplot(compare['Predicted'],fill=True,color='orange')
plt.legend(labels=['Actual Scores','Predicted Scores'],fontsize=20)
plt.show()

# Figure 2: heatmap of the fitted regression coefficients (already sorted).
# The plotted values are model coefficients (lr.coef_), not correlation
# coefficients, so the labels say so.
plt.figure(figsize=(20,12))
xlab=['Regression coefficient (G3)']
coeff_ax = sns.heatmap(
    Coeff,
    cmap='copper',
    annot=True,
    linecolor='white',
    linewidths=1,  # documented heatmap parameter for the cell borders
    xticklabels=xlab,
)
# Label the axis after drawing: seaborn's heatmap assigns its own axis
# labels, so a label set beforehand may be overwritten.
coeff_ax.set_xlabel('Regression coefficients sorted highest to lowest')

### Create a new dataframe with only the most 'relevant' variables for the prediction (based on the regression coefficients)

In [None]:
# Features excluded from the "relevant" view; the rows that remain are plotted next.
excluded_features = [
    'guardian_other', 'Walc', 'address_U', 'Medu', 'goout', 'paid_yes',
    'reason_reputation', 'Fedu', 'reason_course', 'traveltime', 'absences',
    'Mjob_other', 'Mjob_services', 'Fjob_other', 'Mjob_at_home',
    'studytime', 'health', 'freetime', 'guardian_father', 'school_GP',
    'famsup_yes', 'Dalc', 'guardian_mother', 'Fjob_teacher',
]
relevant = Coeff.drop(index=excluded_features)

In [None]:
# Annotated heatmap of the coefficients kept in `relevant`.
plt.figure(figsize=(16, 10))
sns.heatmap(
    relevant,
    annot=True,
    cmap='copper',
    linecolor='white',
    linewidths=2,
)

# Top 5 most relevant variables

### higher_yes = Desire to take higher education. The linear regression algorithm suggests that for each True value, the G3 score is increased by 1. Note, however, that the sample is unevenly split between true and false values; a more balanced sample would improve the model's validity.

In [None]:
# How many rows fall into each value of higher_yes.
plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='higher_yes', palette='copper')
# Spread of G3 for each value of higher_yes.
plt.figure(figsize=(9, 5))
sns.boxplot(data=df, x='higher_yes', y='G3', palette='copper')

### G2 = second period grade. The linear regression algorithm suggests that for each +1 on G2, the G3 score increases by 0.96

In [None]:
# Scatter of G3 against G2 with a fitted regression line.
plt.figure(figsize=(13, 6))
sns.regplot(data=df, x='G2', y='G3', color='orange')

### schoolsup_yes = Extra educational support. The linear regression algorithm suggests that for each True value, the G3 score is increased by 0.72. 

In [None]:
# How many rows fall into each value of schoolsup_yes.
plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='schoolsup_yes', palette='copper')
# Spread of G3 for each value of schoolsup_yes.
plt.figure(figsize=(9, 5))
sns.boxplot(data=df, x='schoolsup_yes', y='G3', palette='copper')

### activities_yes = extra-curricular activities (binary: yes or no). The linear regression algorithm suggests that for each True value, the G3 score is lowered by 0.39.

In [None]:
# How many rows fall into each value of activities_yes.
plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='activities_yes', palette='copper')
# Spread of G3 for each value of activities_yes.
plt.figure(figsize=(9, 5))
sns.boxplot(data=df, x='activities_yes', y='G3', palette='copper')

### reason_home = reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other'). The linear regression algorithm suggests that if we have a positive value on reason_home, the G3 score is lowered by 0.39.

In [None]:
# How many rows fall into each value of reason_home.
plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='reason_home', palette='copper')
# Spread of G3 for each value of reason_home.
plt.figure(figsize=(9, 5))
sns.boxplot(data=df, x='reason_home', y='G3', palette='copper')

## Final conclusion: When predicting third-period (G3) math scores, the ideal profile for a good grade involves the desire to pursue higher education in the future, a good score in the previous period (G2), and making use of extra educational support.


In [None]:
# Count of each G3 value, split by the higher_yes indicator.
plt.figure(figsize=(9, 5))
sns.countplot(data=df, x='G3', hue='higher_yes', palette='copper')
# Filled bivariate density of G3 against G2, coloured by schoolsup_yes.
plt.figure(figsize=(9, 5))
sns.kdeplot(data=df, x='G3', y='G2', hue='schoolsup_yes', fill=True)