In [None]:
#The aim of this analysis is to select suitable methods to forecast the students' grades in period three (G3), using features that are correlated with G3.
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.model_selection import GridSearchCV

1. Reading data

In [None]:
# Reading data: list every file found under the Kaggle input directory
import os
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


In [None]:
#Load the student-mat.csv dataset into a DataFrame
csv_path = '/kaggle/input/student-grade-prediction/student-mat.csv'
data = pd.read_csv(csv_path)

In [None]:
#Preview the first five rows of the raw data
data.head(5)

In [None]:
#List every column name in the dataset to see which fields are available as candidate features
data.columns

2. Processing data

In [None]:
#Goal: predict G3 from earlier grades (G1, G2) plus background fields such as health and absences.
#The feature columns below were picked by common sense rather than by an automated selection step.
feature_columns = [
    "Medu", "Fedu", "traveltime", "studytime", "famrel",
    "Dalc", "Walc", "health", "absences", "G1", "G2",
]
#x holds the candidate feature matrix
x = data[feature_columns]
print(x)

In [None]:
#Preview the first five rows of the selected features
x.head(5)

In [None]:
#Target variable: the G3 column (the grade this notebook tries to predict)
y = data.loc[:, "G3"]
print(y.head(5))

In [None]:
#Look for columns that contain at least one missing (NaN) value:
#build a per-column boolean flag, then keep only the columns whose flag is True
na_cols = data.isna().any().loc[lambda flags: flags]
print(na_cols)
#It turns out the data does not contain any NaN

In [None]:
#How often each G3 grade occurs, ordered from the rarest grade to the most common one
g3_counts = y.value_counts()
m = g3_counts.sort_values()
print(m)

3. Drawing pictures

In [None]:
#We can draw some pictures to show the relationship between G3 and the factors that may influence it.
#Passing the raw rows to plt.bar draws one bar per student at the same x position, so the bars
#overdraw each other and only the maximum G3 per level stays visible. Aggregating to the mean G3
#per level shows the typical grade instead.
mean_g3 = y.groupby(data["Medu"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("Medu")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by Medu")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#the higher their mothers' education levels are, the higher the grade students achieve in period three.

In [None]:
#Mean G3 for each level of Fedu.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["Fedu"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("Fedu")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by Fedu")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#the connection between fathers' education and students' grades is weak.

In [None]:
#Mean G3 for each level of traveltime.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["traveltime"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("traveltime")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by traveltime")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#it seems that less travel time contributes to a higher grade.

In [None]:
#Mean G3 for each level of studytime.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["studytime"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("studytime")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by studytime")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#the grades are not obviously different when students spend different amounts of time studying.
#The tendency is that more study time may contribute to a better grade.

In [None]:
#Mean G3 for each level of famrel.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["famrel"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("famrel")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by famrel")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#a high-quality family relationship promotes the students' performance on grades.

In [None]:
#Mean G3 for each level of Dalc.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["Dalc"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("Dalc")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by Dalc")
plt.show()

In [None]:
#Mean G3 for each level of Walc.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per level visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["Walc"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("Walc")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by Walc")
plt.show()
#Observation noted from the original raw-bar plots (re-check it against the means):
#alcohol use at weekends has little influence on the students' grades. However, the students
#using much alcohol on weekdays tend to have higher grades.

In [None]:
#Scatter of G3 against the health level, drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.scatter(data["health"], y)
plt.show()
#It seems that the students whose health level is two have a competitive advantage in grades

In [None]:
#Mean G3 for each distinct number of absences.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per value visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["absences"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("absences")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by absences")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#a lower number of absences is linked to a higher grade.

In [None]:
#Mean G3 for each G1 grade.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per value visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["G1"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("G1")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by G1")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#the students who achieve a higher grade in period one tend to gain a higher grade in period three.

In [None]:
#Mean G3 for each G2 grade.
#Passing the raw rows to plt.bar overdraws one bar per student at the same x position, leaving only
#the maximum G3 per value visible, so the data is aggregated to the mean first.
mean_g3 = y.groupby(data["G2"]).mean()
fig, ax = plt.subplots(figsize=(20, 8), dpi=100)
ax.bar(mean_g3.index, mean_g3.values, width=0.5)
ax.set_xlabel("G2")
ax.set_ylabel("Mean G3")
ax.set_title("Mean G3 by G2")
plt.show()
#Observation noted from the original raw-bar plot (re-check it against the means):
#the students who achieve a higher grade in period two tend to gain a higher grade in period three.

In [None]:
#Following the plots above, leave out "Walc" and "Fedu", which were judged to have less correlation with G3
dropped_columns = ["Walc", "Fedu"]
x_new = x.drop(columns=dropped_columns)
print(x_new.head(5))

4. Preparing data for machine learning

In [None]:
#Split the rows into a training part and a test part (fixed seed so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(x_new, y, random_state=6)
#Standardise the features: the scaler learns its statistics from the training rows only,
#and the same fitted transform is then applied to both the training and the test rows
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

5. Machine learning methods

In [None]:
#Using Random Forest
#max_features="auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers it meant "sqrt",
#so "sqrt" keeps the same behaviour and also works on current releases.
#random_state pins the bootstrap and feature sampling so the reported accuracy is reproducible.
estimator = RandomForestClassifier(
    n_estimators=10,
    criterion="entropy",
    max_depth=8,
    bootstrap=True,
    max_features="sqrt",
    random_state=6,
)
estimator.fit(x_train, y_train)
y_predict = estimator.predict(x_test)
#For a classifier, score() is the fraction of test rows whose grade is predicted exactly
accuracy = estimator.score(x_test, y_test)
print("The accuracy by RandomForest:\n", accuracy)

In [None]:
#Using Xgboost
#Recent XGBoost releases require class labels to be the contiguous integers 0..k-1 and raise a
#ValueError otherwise. The G3 grades present in the training split can have gaps, so map them to
#contiguous codes before fitting and map the predicted codes back to real grades afterwards.
classes, y_train_codes = np.unique(y_train, return_inverse=True)
estimator = XGBClassifier()
estimator.fit(x_train, y_train_codes)
y_predict = classes[estimator.predict(x_test)]
#Accuracy is measured on the original grade scale. Grades that never occur in the training split
#cannot be predicted and simply count as misses.
accuracy = float(np.mean(y_predict == np.asarray(y_test)))
print("The accuracy by Xgboost:\n", accuracy)

In [None]:
#Using linear Regression
estimator = LinearRegression(fit_intercept=True)
estimator.fit(x_train, y_train)
print(estimator.coef_)
print(estimator.intercept_)
y_predict = estimator.predict(x_test)
print("Forcasted number by Linear regression：\n", y_predict)
#Score this model explicitly: without this line the print below would show the stale value left
#over from the previous cell. For a regressor, score() is the R^2 coefficient of determination
#on the test rows, not a classification accuracy.
accuracy = estimator.score(x_test, y_test)
print("The accuracy by linear regression:\n", accuracy)

In [None]:
#Using Ridge: fit the model, show its learned parameters, then evaluate on the held-out rows
estimator = Ridge(alpha=1, max_iter=10000).fit(x_train, y_train)
print(estimator.coef_)
print(estimator.intercept_)
y_predict = estimator.predict(x_test)
#For a regressor, score() is the R^2 coefficient of determination on the test rows
accuracy = estimator.score(x_test, y_test)
print("Forcasted number by Ridge：\n", y_predict)
print("The accuracy by Ridge:\n", accuracy)

In [None]:
#Using gridsearch to find the best parameters for Ridge
param_dict = {"alpha": [0.5, 0.6, 0.7, 0.8, 0.9, 1], "max_iter":[10000, 50000, 100000, 150000, 200000]}
#Search over a fresh Ridge model instead of whatever `estimator` currently holds: that keeps the
#cell independent of execution order and safe to re-run (re-running the old form wrapped a
#GridSearchCV inside another GridSearchCV).
#NOTE(review): with Ridge's default solver on dense data max_iter is probably ignored, so that
#grid axis may only multiply the number of fits - confirm against the Ridge documentation.
estimator = GridSearchCV(Ridge(), param_grid=param_dict, cv=10)
estimator.fit(x_train, y_train)
print("best parameters:\n", estimator.best_params_)
print("best estimator：\n", estimator.best_estimator_)
#best_score_ is the mean cross-validated score (R^2 for Ridge) of the best parameter combination
print("best score:\n", estimator.best_score_)

In [None]:
#Using SGDRegressor
#SGD shuffles the training rows between epochs, so fix random_state to make the fitted
#coefficients and the reported score reproducible from run to run.
estimator = SGDRegressor(max_iter=10000, random_state=6)
estimator.fit(x_train, y_train)
print(estimator.coef_)
print(estimator.intercept_)
y_predict = estimator.predict(x_test)
print("Forcasted number by SDGRegressor：\n", y_predict)
#For a regressor, score() is the R^2 coefficient of determination on the test rows
accuracy = estimator.score(x_test, y_test)
print("The accuracy by SGDRegressor:\n", accuracy)

conclusion: 
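1. As we can see, the best forecasting methods for the students' grades in the third period are SGDRegressor and Ridge. Note that the scores reported for the regression models are R² values, while the scores of the classifiers are exact-match accuracy, so the two kinds of number are not directly comparable.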
2. The past grades in period 1 and period 2 play key roles in determining the students' final grades. Studying is a continuous process. Please build the foundation at the beginning!